Rev 2896: Add support for key references to the index lookup_keys_via_location bisection interface. in http://people.ubuntu.com/~robertc/baz2.0/index

Sun Oct 7 23:52:15 BST 2007

At http://people.ubuntu.com/~robertc/baz2.0/index

------------------------------------------------------------
revno: 2896
revision-id: robertc at robertcollins.net-20071007225205-4ttygs5100xl33ac
parent: robertc at robertcollins.net-20071007220449-stt24xz9eaaj703l
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Mon 2007-10-08 08:52:05 +1000
message:
  Add support for key references to the index lookup_keys_via_location bisection interface.
modified:
  bzrlib/index.py                index.py-20070712131115-lolkarso50vjr64s-1
  bzrlib/tests/test_index.py     test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'

--- a/bzrlib/index.py	2007-10-07 22:04:49 +0000
+++ b/bzrlib/index.py	2007-10-07 22:52:05 +0000
@@ -245,6 +245,10 @@
         """
         self._transport = transport
         self._name = name
+        # becomes a dict of key:(value, reference-list-byte-locations) 
+        # used by the bisection interface to store parsed but not resolved
+        # keys.
+        self._bisect_nodes = None
         self._nodes = None
         # a sorted list of slice-addresses for the parsed bytes of the file.
         # e.g. (0,1) would mean that byte 0 is parsed.
@@ -374,6 +378,13 @@
         except ValueError:
             raise errors.BadIndexOptions(self)
 
+    def _resolve_references(self, references):
+        """Return the resolved key references for references."""
+        node_refs = []
+        for ref_list in references:
+            node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
+        return tuple(node_refs)
+
     def _parsed_byte_index(self, offset):
         """Return the index of the entry immediately before offset.
 
@@ -439,6 +450,11 @@
         'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
         only the former key is returned.
 
+        WARNING: Note that this method currently causes a full index parse
+        unconditionally (which is reasonably appropriate as it is a means for
+        thunking many small indices into one larger one and still supplies
+        iter_all_entries at the thunk layer).
+
         :param keys: An iterable providing the key prefixes to be retrieved.
             Each key prefix takes the form of a tuple the length of a key, but
             with the last N elements 'None' rather than a regular bytestring.
@@ -516,6 +532,11 @@
     def lookup_keys_via_location(self, location_keys):
         """Public interface for implementing bisection.
 
+        If _buffer_all has been called, then all the data for the index is in
+        memory, and this method should not be called, as it uses a separate
+        cache because it cannot pre-resolve all indices, which buffer_all does
+        for performance.
+
         :param location_keys: A list of location, key tuples.
         :return: A list of (location_key, result) tuples as expected by
             bzrlib.bisect_multi.bisect_multi_bytes.
@@ -547,23 +568,16 @@
             if length > 0:
                 readv_ranges.append((location, length))
         # read the header if needed
-        if self._nodes is None:
+        if self._bisect_nodes is None:
             readv_ranges.append((0, 200))
-        if readv_ranges:
-            readv_data = self._transport.readv(self._name, readv_ranges, True,
-                self._size)
-            # parse
-            for offset, data in readv_data:
-                if self._nodes is None:
-                    # this must be the start
-                    assert offset == 0
-                    offset, data = self._parse_header_from_bytes(data)
-                self._parse_region(offset, data)
-                # print offset, len(data), data
+        self._read_and_parse(readv_ranges)
         # generate results:
         #  - figure out <, >, missing, present
         #  - result present references so we can return them.
         result = []
+        # keys that we cannot answer until we resolve references
+        pending_references = []
+        pending_locations = set()
         for location, key in location_keys:
             # can we answer from cache?
             index = self._parsed_key_index(key)
@@ -572,12 +586,24 @@
                  # end of the file has been parsed
                  self._parsed_byte_map[index][1] == self._size)):
                 # the key has been parsed, so no lookup is needed
-                if key in self._nodes:
+                if key in self._bisect_nodes:
                     if self.node_ref_lists:
-                        value, refs = self._nodes[key]
-                        result.append(((location, key), (self, key, value, refs)))
+                        # the references may not have been all parsed.
+                        value, refs = self._bisect_nodes[key]
+                        wanted_locations = []
+                        for ref_list in refs:
+                            for ref in ref_list:
+                                if ref not in self._keys_by_offset:
+                                    wanted_locations.append(ref)
+                        if wanted_locations:
+                            pending_locations.update(wanted_locations)
+                            pending_references.append((location, key))
+                            continue
+                        result.append(((location, key), (self, key,
+                            value, self._resolve_references(refs))))
                     else:
-                        result.append(((location, key), (self, key, self._nodes[key])))
+                        result.append(((location, key),
+                            (self, key, self._bisect_nodes[key])))
                 else:
                     result.append(((location, key), False))
                 continue
@@ -590,6 +616,24 @@
             else:
                 direction = +1
             result.append(((location, key), direction))
+        readv_ranges = []
+        # lookup data to resolve references
+        for location in pending_locations:
+            length = 800
+            if location + length > self._size:
+                length = self._size - location
+            # TODO: trim out parsed locations (e.g. if the 800 is into the
+            # parsed region trim it, and dont use the ajust_for_latency
+            # facility)
+            if length > 0:
+                readv_ranges.append((location, length))
+        self._read_and_parse(readv_ranges)
+        for location, key in pending_references:
+            # answer key references we had to look-up-late.
+            index = self._parsed_key_index(key)
+            value, refs = self._bisect_nodes[key]
+            result.append(((location, key), (self, key,
+                value, self._resolve_references(refs))))
         return result
 
     def _parse_header_from_bytes(self, bytes):
@@ -632,9 +676,8 @@
         self._expected_elements = 3 + self._key_length
         # raw data keyed by offset
         self._keys_by_offset = {}
-        # ready-to-return key:value or key:value, node_ref_lists
-        self._nodes = {}
-        self._nodes_by_key = {}
+        # keys with the value and node references
+        self._bisect_nodes = {}
         return header_end, bytes[header_end:]
 
     def _parse_region(self, offset, data):
@@ -748,34 +791,15 @@
             ref_lists = tuple(ref_lists)
             self._keys_by_offset[pos] = (key, absent, ref_lists, value)
             pos += len(line) + 1 # +1 for the \n
+            if absent:
+                continue
+            if self.node_ref_lists:
+                node_value = (value, ref_lists)
+            else:
+                node_value = value
+            self._bisect_nodes[key] = node_value
+            # print "parsed ", key
         self._parsed_bytes(offset, first_key, offset + len(data), key)
-        # XXXXXX repeated work here.
-        for key, absent, references, value in self._keys_by_offset.itervalues():
-            if absent:
-                continue
-            # resolve references:
-            if self.node_ref_lists:
-                node_refs = []
-                for ref_list in references:
-                    node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
-                node_value = (value, tuple(node_refs))
-            else:
-                node_value = value
-            self._nodes[key] = node_value
-            if self._key_length > 1:
-                subkey = list(reversed(key[:-1]))
-                key_dict = self._nodes_by_key
-                if self.node_ref_lists:
-                    key_value = key, node_value[0], node_value[1]
-                else:
-                    key_value = key, node_value
-                # possibly should do this on-demand, but it seems likely it is 
-                # always wanted
-                # For a key of (foo, bar, baz) create
-                # _nodes_by_key[foo][bar][baz] = key_value
-                for subkey in key[:-1]:
-                    key_dict = key_dict.setdefault(subkey, {})
-                key_dict[key[-1]] = key_value
 
     def _parsed_bytes(self, start, start_key, end, end_key):
         """Mark the bytes from start to end as parsed.
@@ -818,6 +842,23 @@
                 self._parsed_byte_map.insert(index, new_value)
                 self._parsed_key_map.insert(index, new_key)
 
+    def _read_and_parse(self, readv_ranges):
+        """Read the the ranges and parse the resulting data.
+
+        :param readv_ranges: A prepared readv range list.
+        """
+        if readv_ranges:
+            readv_data = self._transport.readv(self._name, readv_ranges, True,
+                self._size)
+            # parse
+            for offset, data in readv_data:
+                if self._bisect_nodes is None:
+                    # this must be the start
+                    assert offset == 0
+                    offset, data = self._parse_header_from_bytes(data)
+                self._parse_region(offset, data)
+                # print offset, len(data), data
+
     def _signature(self):
         """The file signature for this index type."""
         return _SIGNATURE

=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2007-10-07 22:04:49 +0000
+++ b/bzrlib/tests/test_index.py	2007-10-07 22:52:05 +0000
@@ -546,6 +546,44 @@
         self.assertEqual([((index._size // 2, ('50', )), +1)],
             result)
 
+    def test_lookup_key_resolves_references(self):
+        # generate a big enough index that we only read some of it on a typical
+        # bisection lookup.
+        nodes = []
+        def make_key(number):
+            return (str(number) + 'X'*100,)
+        def make_value(number):
+            return str(number) + 'Y'*100
+        for counter in range(64):
+            nodes.append((make_key(counter), make_value(counter),
+                ((make_key(counter + 20),),)  ))
+        index = self.make_index(ref_lists=1, nodes=nodes)
+        # lookup a key in the middle that does not exist, so that when we can
+        # check that the referred-to-keys are not accessed automatically.
+        result =index.lookup_keys_via_location(
+            [(index._size // 2, ('40', ))])
+        # check the parse map - only the start and middle should have been
+        # parsed.
+        self.assertEqual([(0, 3890), (6444, 10274)], index._parsed_byte_map)
+        self.assertEqual([(None, make_key(25)), (make_key(37), make_key(52))],
+            index._parsed_key_map)
+        # and check the transport activity likewise.
+        self.assertEqual(
+            [('readv', 'index', [(7906, 800), (0, 200)], True, 15813)],
+            index._transport._activity)
+        # reset the transport log for testing the reference lookup
+        del index._transport._activity[:]
+        # now looking up a key in the portion of the file already parsed should
+        # only perform IO to resolve its key references.
+        result = index.lookup_keys_via_location([(4000, make_key(40))])
+        self.assertEqual(
+            [((4000, make_key(40)),
+              (index, make_key(40), make_value(40), ((make_key(60),),)))],
+            result)
+        self.assertEqual([('readv', 'index', [(11976, 800)], True, 15813)],
+            index._transport._activity)
+
+
     def test_iter_all_entries_empty(self):
         index = self.make_index()
         self.assertEqual([], list(index.iter_all_entries()))