Rev 40: Support partial index reads when performing suggestions, massively increasing scaling. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Wed Jun 18 14:09:18 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 40
revision-id: robertc at robertcollins.net-20080618130915-c20vmim4ks0j6wt9
parent: robertc at robertcollins.net-20080618003554-ee0vqxi1vy2swr2e
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-06-18 23:09:15 +1000
message:
Support partial index reads when performing suggestions, massively increasing scaling. (Robert Collins)
modified:
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
transport.py transport.py-20080612012941-c32db8zkpo1mhde8-2
=== modified file 'NEWS'
--- a/NEWS 2008-06-17 12:25:37 +0000
+++ b/NEWS 2008-06-18 13:09:15 +0000
@@ -59,3 +59,6 @@
* New module ``transport`` containing ``FileView`` to map a packs contents
as a transport object, allowing bzr indices to be stored in a pack.
(Robert Collins)
+
+ * New subclass of ``GraphIndex``, ``SuggestableGraphIndex`` used for
+ generating search suggestions/recommendations. (Robert Collins)
=== modified file 'index.py'
--- a/index.py 2008-06-18 00:35:54 +0000
+++ b/index.py 2008-06-18 13:09:15 +0000
@@ -1125,10 +1125,76 @@
:param key: The key to search with.
"""
# Make it work:
- if len(key) > 1:
- candidates = self.iter_entries_prefix([key[:-1] + (None,)])
+ # Partly copied from iter_entries()
+ # PERFORMANCE TODO: parse and bisect all remaining data at some
+ # threshold of total-index processing/get calling layers that expect to
+ # read the entire index to use the iter_all_entries method instead.
+ half_page = self._transport.recommended_page_size() // 2
+ # For when we don't know the length to permit bisection.
+ if self._size is None:
+ if len(key) > 1:
+ candidates = self.iter_entries_prefix([key[:-1] + (None,)])
+ else:
+ candidates = self.iter_all_entries()
+ for candidate in candidates:
+ if candidate[1][-1].startswith(key[-1]):
+ yield candidate
else:
- candidates = self.iter_all_entries()
- for candidate in candidates:
- if candidate[1][-1].startswith(key[-1]):
- yield candidate
+ # Bisect to find the start.
+ # TODO: If we know a reasonable upper bound we could do one IO for
+ # the remainder.
+ # loop parsing more until wwe have one range covering the
+ # suggestions.
+ step = self._size //2
+ search = [(step, key)]
+ found = self._lookup_keys_via_location(search)
+ while True:
+ step = step // 2
+ if found[0][1] not in [-1, 1]:
+ # We can now figure out where to start answering from.
+ break
+ search = [(found[0][0][0] + step * found[0][1], key)]
+ found = self._lookup_keys_via_location(search)
+ while True:
+ lower_index = self._parsed_key_index(key)
+ parsed_range = self._parsed_key_map[lower_index]
+ last_key = parsed_range[1]
+ if last_key[:-1] > key[:-1]:
+ # enough is parsed
+ break
+ if last_key[:-1] == key[:-1]:
+ if (last_key[-1] > key[-1] and not
+ last_key[-1].startswith(key[-1])):
+ # enough is parsed
+ break
+ hi_parsed = self._parsed_byte_map[lower_index][1]
+ if hi_parsed == self._size:
+ # all parsed
+ break
+ next_probe = hi_parsed + half_page - 1
+ if lower_index + 1 < len(self._parsed_byte_map):
+ next_bottom = self._parsed_byte_map[lower_index +1][0]
+ if next_bottom <= next_probe:
+ # read before the parsed area
+ next_probe = next_bottom - 800
+ self._read_and_parse([(next_probe, 800)])
+ # Now, scan for all keys in the potential range, and test them for
+ # being candidates, yielding if they are.
+ if self.node_ref_lists:
+ raise ValueError("TODO: implement resolving of reference lists"
+ " on starts_with searches.")
+ lower_index = self._parsed_key_index(key)
+ parsed_range = self._parsed_byte_map[lower_index]
+ for offset, candidate_node in self._keys_by_offset.iteritems():
+ if offset < parsed_range[0] or offset >= parsed_range[1]:
+ continue
+ candidate_key = candidate_node[0]
+ if (candidate_key[:-1] == key[:-1] and
+ candidate_key[-1].startswith(key[-1])):
+ if self.node_ref_lists:
+ value, refs = self._bisect_nodes[candidate_key]
+ yield (self, candidate_key, value,
+ self._resolve_references(refs))
+ else:
+ value = self._bisect_nodes[candidate_key]
+ yield (self, candidate_key, value)
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-17 12:25:37 +0000
+++ b/tests/test_index.py 2008-06-18 13:09:15 +0000
@@ -520,4 +520,4 @@
(query_index, ('pref', 'pref'), ''),
(query_index, ('pref', 'preferential'), ''),
],
- list(query_index.iter_entries_starts_with(('pref', 'pref'))))
+ sorted(query_index.iter_entries_starts_with(('pref', 'pref'))))
=== modified file 'transport.py'
--- a/transport.py 2008-06-12 01:30:02 +0000
+++ b/transport.py 2008-06-18 13:09:15 +0000
@@ -85,3 +85,7 @@
data = data[lower_trim:upper_trim]
offset = offset - base
yield offset, data
+
+ def recommended_page_size(self):
+ """See Transport.recommended_page_size."""
+ return self._backing_transport.recommended_page_size()
More information about the bazaar-commits
mailing list