Rev 40: Support partial index reads when performing suggestions, massively increasing scaling. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Wed Jun 18 14:09:18 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 40
revision-id: robertc at robertcollins.net-20080618130915-c20vmim4ks0j6wt9
parent: robertc at robertcollins.net-20080618003554-ee0vqxi1vy2swr2e
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-06-18 23:09:15 +1000
message:
  Support partial index reads when performing suggestions, massively increasing scaling. (Robert Collins)
modified:
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
  transport.py                   transport.py-20080612012941-c32db8zkpo1mhde8-2
=== modified file 'NEWS'
--- a/NEWS	2008-06-17 12:25:37 +0000
+++ b/NEWS	2008-06-18 13:09:15 +0000
@@ -59,3 +59,6 @@
     * New module ``transport`` containing ``FileView`` to map a pack's contents
       as a transport object, allowing bzr indices to be stored in a pack.
       (Robert Collins)
+
+    * New subclass of ``GraphIndex``, ``SuggestableGraphIndex`` used for
+      generating search suggestions/recommendations. (Robert Collins)

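As a rough illustration of the matching rule the new code applies (this
sketch is not part of the commit; it simply restates the comparison made in
index.py below): every key element except the last must match exactly, and
the last element of a candidate key must start with the text being
suggested on.

    # Illustrative only: the rule iter_entries_starts_with applies below.
    def matches(candidate_key, search_key):
        return (candidate_key[:-1] == search_key[:-1] and
                candidate_key[-1].startswith(search_key[-1]))

    print matches(('pref', 'preferential'), ('pref', 'pref'))   # True
    print matches(('other', 'preferential'), ('pref', 'pref'))  # False
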
=== modified file 'index.py'
--- a/index.py	2008-06-18 00:35:54 +0000
+++ b/index.py	2008-06-18 13:09:15 +0000
@@ -1125,10 +1125,76 @@
         :param key: The key to search with.
         """
         # Make it work:
-        if len(key) > 1:
-            candidates = self.iter_entries_prefix([key[:-1] + (None,)])
+        # Partly copied from iter_entries()
+        # PERFORMANCE TODO: parse and bisect all remaining data at some
+        # threshold of total-index processing, or get calling layers that
+        # expect to read the entire index to use iter_all_entries instead.
+        half_page = self._transport.recommended_page_size() // 2
+        # If the length is unknown we cannot bisect; fall back to scanning.
+        if self._size is None:
+            if len(key) > 1:
+                candidates = self.iter_entries_prefix([key[:-1] + (None,)])
+            else:
+                candidates = self.iter_all_entries()
+            for candidate in candidates:
+                if candidate[1][-1].startswith(key[-1]):
+                    yield candidate
         else:
-            candidates = self.iter_all_entries()
-        for candidate in candidates:
-            if candidate[1][-1].startswith(key[-1]):
-                yield candidate
+            # Bisect to find the start.
+            # TODO: If we know a reasonable upper bound we could do one IO for
+            # the remainder.
+            # Loop, parsing more until we have one range covering the
+            # suggestions.
+            step = self._size // 2
+            search = [(step, key)]
+            found = self._lookup_keys_via_location(search)
+            while True:
+                step = step // 2
+                if found[0][1] not in [-1, 1]:
+                    # We can now figure out where to start answering from.
+                    break
+                search = [(found[0][0][0] + step * found[0][1], key)]
+                found = self._lookup_keys_via_location(search)
+            while True:
+                lower_index = self._parsed_key_index(key)
+                parsed_range = self._parsed_key_map[lower_index]
+                last_key = parsed_range[1]
+                if last_key[:-1] > key[:-1]:
+                    # enough is parsed
+                    break
+                if last_key[:-1] == key[:-1]:
+                    if (last_key[-1] > key[-1] and not
+                        last_key[-1].startswith(key[-1])):
+                        # enough is parsed
+                        break
+                hi_parsed = self._parsed_byte_map[lower_index][1]
+                if hi_parsed == self._size:
+                    # all parsed
+                    break
+                next_probe = hi_parsed + half_page - 1
+                if lower_index + 1 < len(self._parsed_byte_map):
+                    next_bottom = self._parsed_byte_map[lower_index + 1][0]
+                    if next_bottom <= next_probe:
+                        # read before the parsed area
+                        next_probe = next_bottom - 800
+                self._read_and_parse([(next_probe, 800)])
+            # Now, scan for all keys in the potential range, and test them for
+            # being candidates, yielding if they are.
+            if self.node_ref_lists:
+                raise ValueError("TODO: implement resolving of reference lists"
+                    " on starts_with searches.")
+            lower_index = self._parsed_key_index(key)
+            parsed_range = self._parsed_byte_map[lower_index]
+            for offset, candidate_node in self._keys_by_offset.iteritems():
+                if offset < parsed_range[0] or offset >= parsed_range[1]:
+                    continue
+                candidate_key = candidate_node[0]
+                if (candidate_key[:-1] == key[:-1] and
+                    candidate_key[-1].startswith(key[-1])):
+                    if self.node_ref_lists:
+                        value, refs = self._bisect_nodes[candidate_key]
+                        yield (self, candidate_key, value,
+                            self._resolve_references(refs))
+                    else:
+                        value = self._bisect_nodes[candidate_key]
+                        yield (self, candidate_key, value)

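For context, here is a minimal usage sketch of the new behaviour. It is not
taken from the plugin or its tests; it assumes SuggestableGraphIndex takes
the same (transport, name, size) arguments as bzrlib's GraphIndex and that
the plugin is importable as bzrlib.plugins.search. When the size is known,
iter_entries_starts_with bisects into the index and parses only the byte
ranges that can contain matching keys, instead of reading the whole file.

    from bzrlib.index import GraphIndexBuilder
    from bzrlib.transport import get_transport
    # Hypothetical import path; adjust to wherever the plugin lives.
    from bzrlib.plugins.search.index import SuggestableGraphIndex

    # Build a small two-element-key index holding terms to suggest from.
    builder = GraphIndexBuilder(key_elements=2)
    builder.add_node(('pref', 'pref'), '')
    builder.add_node(('pref', 'preferential'), '')
    builder.add_node(('pref', 'unrelated'), '')
    stream = builder.finish()
    size = len(stream.getvalue())

    transport = get_transport('memory:///')
    transport.put_file('terms.idx', stream)

    # Supplying the size is what enables the partial (bisecting) reads.
    index = SuggestableGraphIndex(transport, 'terms.idx', size)
    for _, key, value in index.iter_entries_starts_with(('pref', 'pref')):
        print key
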
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-17 12:25:37 +0000
+++ b/tests/test_index.py	2008-06-18 13:09:15 +0000
@@ -520,4 +520,4 @@
             (query_index, ('pref', 'pref'), ''),
             (query_index, ('pref', 'preferential'), ''),
             ],
-            list(query_index.iter_entries_starts_with(('pref', 'pref'))))
+            sorted(query_index.iter_entries_starts_with(('pref', 'pref'))))

=== modified file 'transport.py'
--- a/transport.py	2008-06-12 01:30:02 +0000
+++ b/transport.py	2008-06-18 13:09:15 +0000
@@ -85,3 +85,7 @@
             data = data[lower_trim:upper_trim]
             offset = offset - base
             yield offset, data
+
+    def recommended_page_size(self):
+        """See Transport.recommended_page_size."""
+        return self._backing_transport.recommended_page_size()

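The recommended_page_size forward matters because the suggestion code above
sizes its probes from it (half a page per read), so an index stored inside a
pack behind FileView probes at the granularity of the real backing store. A
rough sketch, assuming bzrlib's usual defaults (local transports recommend
4096 bytes; remote transports such as HTTP recommend larger pages):

    from bzrlib.transport import get_transport

    def probe_size(transport):
        # Half a "page" per probe, mirroring iter_entries_starts_with above.
        return transport.recommended_page_size() // 2

    local = get_transport('.')
    print probe_size(local)   # typically 2048 for a local transport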