[RFC] pack index lookup performance

Robert Collins robertc at robertcollins.net
Tue Apr 15 23:16:44 BST 2008


On Wed, 2008-04-16 at 07:19 +1000, Robert Collins wrote:
> On Tue, 2008-04-15 at 17:02 -0400, Aaron Bentley wrote:
> 
> 
> > It seems we should be examining the hit caches of all indices before
> > doing more expensive operations.  Are we doing that?
> 
> One way we could do this is add a 'cached_only' flag to the index
> iteration, and then in the CombinedIndex add another loop using that
> flag.

I put together a quick, crufty version of this; it made no measurable
difference for me.

=== modified file 'bzrlib/index.py'
--- bzrlib/index.py	2007-12-18 19:42:10 +0000
+++ bzrlib/index.py	2008-04-15 22:14:57 +0000
@@ -272,6 +272,7 @@
         self._keys_by_offset = None
         self._nodes_by_key = None
         self._size = size
+        self._cached_only = False
 
     def __eq__(self, other):
         """Equal when self and other were created with the same parameters."""
@@ -456,10 +457,13 @@
             for key in keys:
                 yield self, key, self._nodes[key]
 
-    def iter_entries(self, keys):
+    def iter_entries(self, keys, cached_only=False):
         """Iterate over keys within the index.
 
         :param keys: An iterable providing the keys to be retrieved.
+        :param cached_only: If True only answer keys from the cache (skip
+            bisecting for unknown keys). Does not preclude doing IO to resolve
+            references.
         :return: An iterable as per iter_all_entries, but restricted to the
             keys supplied. No additional keys will be returned, and every
             key supplied that is in the index will be returned.
@@ -467,6 +471,9 @@
         # PERFORMANCE TODO: parse and bisect all remaining data at some
         # threshold of total-index processing/get calling layers that expect to
         # read the entire index to use the iter_all_entries  method instead.
+        # kludge: probably much easier in the range_map branch, but it will do
+        # the job for testing.
+        self._cached_only = cached_only
         keys = set(keys)
         if not keys:
             return []
@@ -590,7 +597,8 @@
         readv_ranges = []
         for location, key in location_keys:
             # can we answer from cache?
-            if self._bisect_nodes and key in self._bisect_nodes:
+            if self._cached_only or (self._bisect_nodes and key in
+                self._bisect_nodes):
                 # We have the key parsed.
                 continue
             index = self._parsed_key_index(key)
@@ -616,7 +624,7 @@
             if length > 0:
                 readv_ranges.append((location, length))
         # read the header if needed
-        if self._bisect_nodes is None:
+        if not self._cached_only and self._bisect_nodes is None:
             readv_ranges.append(_HEADER_READV)
         self._read_and_parse(readv_ranges)
         # generate results:
@@ -628,7 +636,7 @@
         pending_locations = set()
         for location, key in location_keys:
             # can we answer from cache?
-            if key in self._bisect_nodes:
+            if self._bisect_nodes and key in self._bisect_nodes:
                 # the key has been parsed, so no lookup is needed
                 if self.node_ref_lists:
                     # the references may not have been all parsed.
@@ -648,6 +656,9 @@
                     result.append(((location, key),
                         (self, key, self._bisect_nodes[key])))
                 continue
+            elif self._cached_only:
+                result.append(((location, key), False))
+                continue
             else:
                 # has the region the key should be in, been parsed?
                 index = self._parsed_key_index(key)
@@ -1068,6 +1079,15 @@
             efficient order for the index.
         """
         keys = set(keys)
+        # First pass - cached in memory keys, which avoids doing bisection for key
+        # misses in each index.
+        for index in self._indices:
+            if not keys:
+                return
+            for node in index.iter_entries(keys, cached_only=True):
+                keys.remove(node[1])
+                yield node
+        # Second pass - actually look for each key.
         for index in self._indices:
             if not keys:
                 return

=== modified file 'bzrlib/tests/test_index.py'
--- bzrlib/tests/test_index.py	2007-10-15 07:56:04 +0000
+++ bzrlib/tests/test_index.py	2008-04-15 22:11:59 +0000
@@ -682,6 +682,19 @@
             (index, ('ref', ), 'refdata', ((), ))]),
             set(index.iter_entries([('name', ), ('ref', )])))
 
+    def test_iter_cached_only_unread_keys_skipped(self):
+        index = self.make_index(1, nodes=[
+            (('name', ), 'data', ([('ref', )], )),
+            (('ref', ), 'refdata', ([], ))])
+        self.assertEqual(set(),
+            set(index.iter_entries([('name', ), ('ref', )], cached_only=True)))
+        # looking for anything should parse the keys in this index
+        list(index.iter_entries([('foo', )]))
+        # and now the cached contents are readable
+        self.assertEqual(set([(index, ('name', ), 'data', ((('ref',),),)),
+            (index, ('ref', ), 'refdata', ((), ))]),
+            set(index.iter_entries([('name', ), ('ref', )], cached_only=True)))
+
     def test_iter_nothing_empty(self):
         index = self.make_index()
         self.assertEqual([], list(index.iter_entries([])))



-- 
GPG key available at: <http://www.robertcollins.net/keys.txt>.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: This is a digitally signed message part
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20080416/cfc5eee4/attachment.pgp 


More information about the bazaar mailing list