[RFC] pack index lookup performance
Robert Collins
robertc at robertcollins.net
Tue Apr 15 23:16:44 BST 2008
On Wed, 2008-04-16 at 07:19 +1000, Robert Collins wrote:
> On Tue, 2008-04-15 at 17:02 -0400, Aaron Bentley wrote:
>
>
> > It seems we should be examining the hit caches of all indices before
> > doing more expensive operations. Are we doing that?
>
> One way we could do this is add a 'cached_only' flag to the index
> iteration, and then in the CombinedIndex add another loop using that
> flag.
I did a quick crufty version of this up; it made no measurable
difference for me.
=== modified file 'bzrlib/index.py'
--- bzrlib/index.py 2007-12-18 19:42:10 +0000
+++ bzrlib/index.py 2008-04-15 22:14:57 +0000
@@ -272,6 +272,7 @@
self._keys_by_offset = None
self._nodes_by_key = None
self._size = size
+ self._cached_only = False
def __eq__(self, other):
"""Equal when self and other were created with the same parameters."""
@@ -456,10 +457,13 @@
for key in keys:
yield self, key, self._nodes[key]
- def iter_entries(self, keys):
+ def iter_entries(self, keys, cached_only=False):
"""Iterate over keys within the index.
:param keys: An iterable providing the keys to be retrieved.
+ :param cached_only: If True only answer keys from the cache (skip
+ bisecting for unknown keys). Does not preclude doing IO to resolve
+ references.
:return: An iterable as per iter_all_entries, but restricted to the
keys supplied. No additional keys will be returned, and every
key supplied that is in the index will be returned.
@@ -467,6 +471,9 @@
# PERFORMANCE TODO: parse and bisect all remaining data at some
# threshold of total-index processing/get calling layers that expect to
# read the entire index to use the iter_all_entries method instead.
+ # kludge: probably much easier in the range_map branch, but it will do
+ # the job for testing.
+ self._cached_only = cached_only
keys = set(keys)
if not keys:
return []
@@ -590,7 +597,8 @@
readv_ranges = []
for location, key in location_keys:
# can we answer from cache?
- if self._bisect_nodes and key in self._bisect_nodes:
+ if self._cached_only or (self._bisect_nodes and key in
+ self._bisect_nodes):
# We have the key parsed.
continue
index = self._parsed_key_index(key)
@@ -616,7 +624,7 @@
if length > 0:
readv_ranges.append((location, length))
# read the header if needed
- if self._bisect_nodes is None:
+ if not self._cached_only and self._bisect_nodes is None:
readv_ranges.append(_HEADER_READV)
self._read_and_parse(readv_ranges)
# generate results:
@@ -628,7 +636,7 @@
pending_locations = set()
for location, key in location_keys:
# can we answer from cache?
- if key in self._bisect_nodes:
+ if self._bisect_nodes and key in self._bisect_nodes:
# the key has been parsed, so no lookup is needed
if self.node_ref_lists:
# the references may not have been all parsed.
@@ -648,6 +656,9 @@
result.append(((location, key),
(self, key, self._bisect_nodes[key])))
continue
+ elif self._cached_only:
+ result.append(((location, key), False))
+ continue
else:
# has the region the key should be in, been parsed?
index = self._parsed_key_index(key)
@@ -1068,6 +1079,15 @@
efficient order for the index.
"""
keys = set(keys)
+ # First pass - cached in memory keys, which avoids doing bisection for key
+ # misses in each index.
+ for index in self._indices:
+ if not keys:
+ return
+ for node in index.iter_entries(keys, cached_only=True):
+ keys.remove(node[1])
+ yield node
+ # Second pass - actually look for each key.
for index in self._indices:
if not keys:
return
=== modified file 'bzrlib/tests/test_index.py'
--- bzrlib/tests/test_index.py 2007-10-15 07:56:04 +0000
+++ bzrlib/tests/test_index.py 2008-04-15 22:11:59 +0000
@@ -682,6 +682,19 @@
(index, ('ref', ), 'refdata', ((), ))]),
set(index.iter_entries([('name', ), ('ref', )])))
+ def test_iter_cached_only_unread_keys_skipped(self):
+ index = self.make_index(1, nodes=[
+ (('name', ), 'data', ([('ref', )], )),
+ (('ref', ), 'refdata', ([], ))])
+ self.assertEqual(set(),
+ set(index.iter_entries([('name', ), ('ref', )], cached_only=True)))
+ # looking for anything should parse the keys in this index
+ list(index.iter_entries([('foo', )]))
+ # and now the cached contents are readable
+ self.assertEqual(set([(index, ('name', ), 'data', ((('ref',),),)),
+ (index, ('ref', ), 'refdata', ((), ))]),
+ set(index.iter_entries([('name', ), ('ref', )], cached_only=True)))
+
def test_iter_nothing_empty(self):
index = self.make_index()
self.assertEqual([], list(index.iter_entries([])))
--
GPG key available at: <http://www.robertcollins.net/keys.txt>.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: This is a digitally signed message part
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20080416/cfc5eee4/attachment.pgp
More information about the bazaar
mailing list