Rev 2896: Add support for key references to the index lookup_keys_via_location bisection interface. in http://people.ubuntu.com/~robertc/baz2.0/index
Robert Collins
robertc at robertcollins.net
Sun Oct 7 23:52:15 BST 2007
At http://people.ubuntu.com/~robertc/baz2.0/index
------------------------------------------------------------
revno: 2896
revision-id: robertc at robertcollins.net-20071007225205-4ttygs5100xl33ac
parent: robertc at robertcollins.net-20071007220449-stt24xz9eaaj703l
committer: Robert Collins <robertc at robertcollins.net>
branch nick: index
timestamp: Mon 2007-10-08 08:52:05 +1000
message:
Add support for key references to the index lookup_keys_via_location bisection interface.
modified:
bzrlib/index.py index.py-20070712131115-lolkarso50vjr64s-1
bzrlib/tests/test_index.py test_index.py-20070712131115-lolkarso50vjr64s-2
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2007-10-07 22:04:49 +0000
+++ b/bzrlib/index.py 2007-10-07 22:52:05 +0000
@@ -245,6 +245,10 @@
"""
self._transport = transport
self._name = name
+ # becomes a dict of key:(value, reference-list-byte-locations)
+ # used by the bisection interface to store parsed but not resolved
+ # keys.
+ self._bisect_nodes = None
self._nodes = None
# a sorted list of slice-addresses for the parsed bytes of the file.
# e.g. (0,1) would mean that byte 0 is parsed.
@@ -374,6 +378,13 @@
except ValueError:
raise errors.BadIndexOptions(self)
+ def _resolve_references(self, references):
+ """Return the resolved key references for references."""
+ node_refs = []
+ for ref_list in references:
+ node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
+ return tuple(node_refs)
+
def _parsed_byte_index(self, offset):
"""Return the index of the entry immediately before offset.
@@ -439,6 +450,11 @@
'bar'), ('foobar', 'gam') and do a prefix search for ('foo', None) then
only the former key is returned.
+ WARNING: Note that this method currently causes a full index parse
+ unconditionally (which is reasonably appropriate as it is a means for
+ thunking many small indices into one larger one and still supplies
+ iter_all_entries at the thunk layer).
+
:param keys: An iterable providing the key prefixes to be retrieved.
Each key prefix takes the form of a tuple the length of a key, but
with the last N elements 'None' rather than a regular bytestring.
@@ -516,6 +532,11 @@
def lookup_keys_via_location(self, location_keys):
"""Public interface for implementing bisection.
+ If _buffer_all has been called, then all the data for the index is in
+ memory, and this method should not be called, as it uses a separate
+ cache because it cannot pre-resolve all indices, which buffer_all does
+ for performance.
+
:param location_keys: A list of location, key tuples.
:return: A list of (location_key, result) tuples as expected by
bzrlib.bisect_multi.bisect_multi_bytes.
@@ -547,23 +568,16 @@
if length > 0:
readv_ranges.append((location, length))
# read the header if needed
- if self._nodes is None:
+ if self._bisect_nodes is None:
readv_ranges.append((0, 200))
- if readv_ranges:
- readv_data = self._transport.readv(self._name, readv_ranges, True,
- self._size)
- # parse
- for offset, data in readv_data:
- if self._nodes is None:
- # this must be the start
- assert offset == 0
- offset, data = self._parse_header_from_bytes(data)
- self._parse_region(offset, data)
- # print offset, len(data), data
+ self._read_and_parse(readv_ranges)
# generate results:
# - figure out <, >, missing, present
# - result present references so we can return them.
result = []
+ # keys that we cannot answer until we resolve references
+ pending_references = []
+ pending_locations = set()
for location, key in location_keys:
# can we answer from cache?
index = self._parsed_key_index(key)
@@ -572,12 +586,24 @@
# end of the file has been parsed
self._parsed_byte_map[index][1] == self._size)):
# the key has been parsed, so no lookup is needed
- if key in self._nodes:
+ if key in self._bisect_nodes:
if self.node_ref_lists:
- value, refs = self._nodes[key]
- result.append(((location, key), (self, key, value, refs)))
+ # the references may not have been all parsed.
+ value, refs = self._bisect_nodes[key]
+ wanted_locations = []
+ for ref_list in refs:
+ for ref in ref_list:
+ if ref not in self._keys_by_offset:
+ wanted_locations.append(ref)
+ if wanted_locations:
+ pending_locations.update(wanted_locations)
+ pending_references.append((location, key))
+ continue
+ result.append(((location, key), (self, key,
+ value, self._resolve_references(refs))))
else:
- result.append(((location, key), (self, key, self._nodes[key])))
+ result.append(((location, key),
+ (self, key, self._bisect_nodes[key])))
else:
result.append(((location, key), False))
continue
@@ -590,6 +616,24 @@
else:
direction = +1
result.append(((location, key), direction))
+ readv_ranges = []
+ # lookup data to resolve references
+ for location in pending_locations:
+ length = 800
+ if location + length > self._size:
+ length = self._size - location
+ # TODO: trim out parsed locations (e.g. if the 800 is into the
+ # parsed region trim it, and dont use the ajust_for_latency
+ # facility)
+ if length > 0:
+ readv_ranges.append((location, length))
+ self._read_and_parse(readv_ranges)
+ for location, key in pending_references:
+ # answer key references we had to look-up-late.
+ index = self._parsed_key_index(key)
+ value, refs = self._bisect_nodes[key]
+ result.append(((location, key), (self, key,
+ value, self._resolve_references(refs))))
return result
def _parse_header_from_bytes(self, bytes):
@@ -632,9 +676,8 @@
self._expected_elements = 3 + self._key_length
# raw data keyed by offset
self._keys_by_offset = {}
- # ready-to-return key:value or key:value, node_ref_lists
- self._nodes = {}
- self._nodes_by_key = {}
+ # keys with the value and node references
+ self._bisect_nodes = {}
return header_end, bytes[header_end:]
def _parse_region(self, offset, data):
@@ -748,34 +791,15 @@
ref_lists = tuple(ref_lists)
self._keys_by_offset[pos] = (key, absent, ref_lists, value)
pos += len(line) + 1 # +1 for the \n
+ if absent:
+ continue
+ if self.node_ref_lists:
+ node_value = (value, ref_lists)
+ else:
+ node_value = value
+ self._bisect_nodes[key] = node_value
+ # print "parsed ", key
self._parsed_bytes(offset, first_key, offset + len(data), key)
- # XXXXXX repeated work here.
- for key, absent, references, value in self._keys_by_offset.itervalues():
- if absent:
- continue
- # resolve references:
- if self.node_ref_lists:
- node_refs = []
- for ref_list in references:
- node_refs.append(tuple([self._keys_by_offset[ref][0] for ref in ref_list]))
- node_value = (value, tuple(node_refs))
- else:
- node_value = value
- self._nodes[key] = node_value
- if self._key_length > 1:
- subkey = list(reversed(key[:-1]))
- key_dict = self._nodes_by_key
- if self.node_ref_lists:
- key_value = key, node_value[0], node_value[1]
- else:
- key_value = key, node_value
- # possibly should do this on-demand, but it seems likely it is
- # always wanted
- # For a key of (foo, bar, baz) create
- # _nodes_by_key[foo][bar][baz] = key_value
- for subkey in key[:-1]:
- key_dict = key_dict.setdefault(subkey, {})
- key_dict[key[-1]] = key_value
def _parsed_bytes(self, start, start_key, end, end_key):
"""Mark the bytes from start to end as parsed.
@@ -818,6 +842,23 @@
self._parsed_byte_map.insert(index, new_value)
self._parsed_key_map.insert(index, new_key)
+ def _read_and_parse(self, readv_ranges):
+ """Read the the ranges and parse the resulting data.
+
+ :param readv_ranges: A prepared readv range list.
+ """
+ if readv_ranges:
+ readv_data = self._transport.readv(self._name, readv_ranges, True,
+ self._size)
+ # parse
+ for offset, data in readv_data:
+ if self._bisect_nodes is None:
+ # this must be the start
+ assert offset == 0
+ offset, data = self._parse_header_from_bytes(data)
+ self._parse_region(offset, data)
+ # print offset, len(data), data
+
def _signature(self):
"""The file signature for this index type."""
return _SIGNATURE
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2007-10-07 22:04:49 +0000
+++ b/bzrlib/tests/test_index.py 2007-10-07 22:52:05 +0000
@@ -546,6 +546,44 @@
self.assertEqual([((index._size // 2, ('50', )), +1)],
result)
+ def test_lookup_key_resolves_references(self):
+ # generate a big enough index that we only read some of it on a typical
+ # bisection lookup.
+ nodes = []
+ def make_key(number):
+ return (str(number) + 'X'*100,)
+ def make_value(number):
+ return str(number) + 'Y'*100
+ for counter in range(64):
+ nodes.append((make_key(counter), make_value(counter),
+ ((make_key(counter + 20),),) ))
+ index = self.make_index(ref_lists=1, nodes=nodes)
+ # lookup a key in the middle that does not exist, so that when we can
+ # check that the referred-to-keys are not accessed automatically.
+ result =index.lookup_keys_via_location(
+ [(index._size // 2, ('40', ))])
+ # check the parse map - only the start and middle should have been
+ # parsed.
+ self.assertEqual([(0, 3890), (6444, 10274)], index._parsed_byte_map)
+ self.assertEqual([(None, make_key(25)), (make_key(37), make_key(52))],
+ index._parsed_key_map)
+ # and check the transport activity likewise.
+ self.assertEqual(
+ [('readv', 'index', [(7906, 800), (0, 200)], True, 15813)],
+ index._transport._activity)
+ # reset the transport log for testing the reference lookup
+ del index._transport._activity[:]
+ # now looking up a key in the portion of the file already parsed should
+ # only perform IO to resolve its key references.
+ result = index.lookup_keys_via_location([(4000, make_key(40))])
+ self.assertEqual(
+ [((4000, make_key(40)),
+ (index, make_key(40), make_value(40), ((make_key(60),),)))],
+ result)
+ self.assertEqual([('readv', 'index', [(11976, 800)], True, 15813)],
+ index._transport._activity)
+
+
def test_iter_all_entries_empty(self):
index = self.make_index()
self.assertEqual([], list(index.iter_all_entries()))
More information about the bazaar-commits
mailing list