Rev 4799: Implement the code to actually read and parse the entries. in http://bazaar.launchpad.net/~jameinel/bzr/chk-index
John Arbash Meinel
john at arbash-meinel.com
Wed Oct 28 19:33:31 GMT 2009
At http://bazaar.launchpad.net/~jameinel/bzr/chk-index
------------------------------------------------------------
revno: 4799
revision-id: john at arbash-meinel.com-20091028193322-psg0tagb5xubpq51
parent: john at arbash-meinel.com-20091028182419-isq5q9qja7jeifmo
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Wed 2009-10-28 14:33:22 -0500
message:
Implement the code to actually read and parse the entries.
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py 2009-10-28 18:24:19 +0000
+++ b/bzrlib/chk_index.py 2009-10-28 19:33:22 +0000
@@ -602,7 +602,18 @@
# It is probably more efficient to have one large dict than many small
# ones. Especially given that dicts < 50k entries have a resize-factor
# of 4:1, rather than 2:1.
+ # So for now, _mini_index = [(offset, size, loaded?)...]
self._mini_index = None
+ # TODO: Should we map from key => data, or bit_key => data
+ # bit_key is going to be smaller in memory, but does that
+ # trade-off well against the cost of casting between object
+ # representations? Plus (key,) may already be interned since we
+ # requested it...
+ # Note that if we do prefix-only keys, then we have to use
+ # 'bit_key', since we don't *have* the full key to compare
+ # against. So for now, this is
+ # {bit_key: (group_tuple, internal_start, internal_length)}
+ self._entries = {}
def _ensure_header(self):
if self._header is not None:
@@ -621,6 +632,9 @@
_, bytes = self._transport.readv(self._name, [(0, read_size)]).next()
header_bytes = bytes[:CHKIndexHeader._RESERVED_HEADER_BYTES]
self._header = CHKIndexHeader.deserialize(header_bytes)
+ self._header._build_group_coder()
+ self._header._build_mini_index_coder()
+ self._header._build_entry_coder()
# The above checks ensure that our first read always includes the mini
# index and groups.
self._parse_mini_index(bytes)
@@ -634,17 +648,28 @@
:param bytes: Bytes of the index starting at the byte 0.
"""
stuple = static_tuple.StaticTuple
+ end_of_index = self._header.entry_offset + self._header.num_entries * (
+ self._header.num_hash_bytes
+ + self._header.entry_group_offset_bytes
+ + self._header.entry_group_start_bytes
+ + self._header.entry_group_length_bytes)
if self._header.num_mini_index_entries == 0:
# (offset, loaded?)
- self._mini_index = [stuple(self._header.entry_offset, False)]
+ size = end_of_index - self._header.entry_offset
+ self._mini_index = [stuple(self._header.entry_offset, size, False)]
return
- self._header._build_mini_index_coder()
start = self._header.mini_index_offset
end = self._header.group_index_offset
format = self._header.mini_index_coder.format[1:]
format = '>' + (format * self._header.num_mini_index_entries)
- as_ints = struct.unpack(format, bytes[start:end])
- self._mini_index = [stuple(i, False) for i in as_ints]
+ start_pos = struct.unpack(format, bytes[start:end])
+ mini_index = []
+ for idx in xrange(self._header.num_mini_index_entries - 1):
+ pos = start_pos[idx]
+ next_pos = start_pos[idx+1]
+ mini_index.append(stuple(pos, next_pos-pos, False))
+ mini_index.append(stuple(next_pos, end_of_index-next_pos, False))
+ self._mini_index = mini_index
def _parse_groups(self, bytes):
"""Parse the group information from bytes.
@@ -654,7 +679,6 @@
"""
start = self._header.group_index_offset
end = self._header.entry_offset
- self._header._build_group_coder()
format = self._header.group_coder.format[1:]
format = '>' + (format * self._header.num_groups)
as_ints = struct.unpack(format, bytes[start:end])
@@ -666,3 +690,48 @@
"""How many entries are in this index."""
self._ensure_header()
return self._header.num_entries
+
+ def _bit_key_to_key(self, bit_key):
+ # TODO: intern()?
+ # TODO: Handle when/if we support less-than-full-width keys
+ # In which case we should probably return a 'psha1:' for 'partial
+ # sha1' or something along those lines
+ return static_tuple.StaticTuple('sha1:' + binascii.b2a_hex(bit_key))
+
+ def _key_to_bit_key(self, key):
+ # TODO: Handle when/if we support less-than-full-width keys and 'sha1:'
+ # may be a 'psha1:' key
+ assert len(key) == 1
+ assert key[0].startswith('sha1:')
+ return binascii.a2b_hex(key[0][5:])
+
+ def keys(self):
+ """Return the list of all keys stored in this index."""
+ self._ensure_header()
+ self._read_all()
+ to_key = self._bit_key_to_key
+ return [to_key(bit_key) for bit_key in self._entries]
+
+ def _read_all(self):
+ """Read in all the entry records and store them in self._entries"""
+ offsets = [idx for idx, info in enumerate(self._mini_index)
+ if not info[2]]
+ self._read_entries(offsets)
+
+ def _read_entries(self, indexes):
+ """Read in the entries corresponding to the given mini_index pages."""
+ stuple = static_tuple.StaticTuple
+ mi = self._mini_index
+ pages = [mi[idx][:2] for idx in indexes]
+ unpack = self._header.entry_coder.unpack
+ size = self._header.entry_coder.size
+ for offset, content in self._transport.readv(self._name, pages):
+ for start in xrange(0, len(content), size):
+ (bit_key, group_offset, inner_start,
+ inner_len) = unpack(content[start:start+size])
+ group = self._groups[group_offset]
+ self._entries[bit_key] = stuple(group, inner_start, inner_len)
+ # Mark these locations as having been read
+ for idx in indexes:
+ val = self._mini_index[idx]
+ self._mini_index[idx] = stuple(val[0], val[1], True)
=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py 2009-10-28 18:24:19 +0000
+++ b/bzrlib/tests/test_chk_index.py 2009-10-28 19:33:22 +0000
@@ -757,7 +757,7 @@
self.assertEqual(1, index._header.num_groups)
self.assertIsNot(None, index._groups)
self.assertEqual([(0, 0)], index._groups)
- self.assertEqual([(122, False)], index._mini_index)
+ self.assertEqual([(122, 0, False)], index._mini_index)
def test__ensure_header_tiny(self):
index = self.make_index([(k1, 0, 1000, 0, 10),
@@ -772,7 +772,7 @@
self.assertEqual([(0, 0), (0, 1000)], index._groups)
# TODO: the value of _mini_index when there isn't an index present is
# currently up for grabs.
- self.assertEqual([(126, False)], index._mini_index)
+ self.assertEqual([(126, 69, False)], index._mini_index)
def test__ensure_header_small(self):
nodes = make_chk_nodes(400, 20)
@@ -786,6 +786,26 @@
groups = sorted(set([(node[1], node[2]) for node in nodes]))
groups.insert(0, (0, 0))
self.assertEqual(groups, index._groups)
- mini_index = [(x, False) for x in [278, 734, 1382, 1958, 2582, 3038,
- 3854, 4454, 5198, 5798, 6398, 7190, 7814, 8294, 8918, 9398]]
+ mini_index = []
+ # TODO: Would be better if this was not hard-coded
+ start_positions = [278, 734, 1382, 1958, 2582, 3038, 3854, 4454, 5198,
+ 5798, 6398, 7190, 7814, 8294, 8918, 9398,
+ 9878] # this is the end of index
+ for idx in range(len(start_positions)-1):
+ pos = start_positions[idx]
+ mini_index.append((pos, start_positions[idx+1] - pos, False))
self.assertEqual(mini_index, index._mini_index)
+
+ def test_key_count(self):
+ index = self.make_index([(k1, 0, 1000, 0, 10),
+ (k2, 0, 1000, 10, 9),
+ (k3, 0, 1000, 19, 9),
+ ])
+ self.assertEqual(3, index.key_count())
+
+ def test_keys(self):
+ index = self.make_index([(k1, 0, 1000, 0, 10),
+ (k2, 0, 1000, 10, 9),
+ (k3, 0, 1000, 19, 9),
+ ])
+ self.assertEqual(sorted([k1, k2, k3]), sorted(index.keys()))
More information about the bazaar-commits
mailing list