Rev 4799: Implement the code to actually read and parse the entries, in http://bazaar.launchpad.net/~jameinel/bzr/chk-index

John Arbash Meinel john at arbash-meinel.com
Wed Oct 28 19:33:31 GMT 2009


At http://bazaar.launchpad.net/~jameinel/bzr/chk-index

------------------------------------------------------------
revno: 4799
revision-id: john at arbash-meinel.com-20091028193322-psg0tagb5xubpq51
parent: john at arbash-meinel.com-20091028182419-isq5q9qja7jeifmo
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Wed 2009-10-28 14:33:22 -0500
message:
  Implement the code to actually read and parse the entries.
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py	2009-10-28 18:24:19 +0000
+++ b/bzrlib/chk_index.py	2009-10-28 19:33:22 +0000
@@ -602,7 +602,18 @@
         # It is probably more efficient to have one large dict than many small
         # ones. Especially given that dicts < 50k entries have a resize-factor
         # of 4:1, rather than 2:1.
+        # So for now, _mini_index = [(offset, size, loaded?)...]
         self._mini_index = None
+        # TODO: Should we map from key => data, or bit_key => data
+        #       bit_key is going to be smaller in memory, but does that
+        #       trade-off well against the cost of casting between object
+        #       representations? Plus (key,) may already be interned since we
+        #       requested it...
+        #       Note that if we do prefix-only keys, then we have to use
+        #       'bit_key', since we don't *have* the full key to compare
+        #       against. So for now, this is
+        #       {bit_key: (group_tuple, internal_start, internal_length)}
+        self._entries = {}
 
     def _ensure_header(self):
         if self._header is not None:
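
For a sense of the memory question in the TODO above, here is an
illustrative comparison of the two candidate key representations (not part
of the patch; a plain tuple stands in for StaticTuple, and exact sizes vary
by interpreter build):

    import sys, binascii

    hex_digest = '0123456789abcdef0123456789abcdef01234567'
    bit_key = binascii.a2b_hex(hex_digest)      # the 20-byte raw digest
    full_key = ('sha1:' + hex_digest,)          # the full ('sha1:<hex>',) key

    print sys.getsizeof(bit_key)
    print sys.getsizeof(full_key) + sys.getsizeof(full_key[0])
    # The raw digest needs one short str per entry, versus a tuple plus a
    # longer str, before interning or StaticTuple savings are considered.
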
@@ -621,6 +632,9 @@
         _, bytes = self._transport.readv(self._name, [(0, read_size)]).next()
         header_bytes = bytes[:CHKIndexHeader._RESERVED_HEADER_BYTES]
         self._header = CHKIndexHeader.deserialize(header_bytes)
+        self._header._build_group_coder()
+        self._header._build_mini_index_coder()
+        self._header._build_entry_coder()
         # The above checks ensure that our first read always includes the mini
         # index and groups.
         self._parse_mini_index(bytes)
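
The coder objects built here are only used below through their .format,
.size and .unpack attributes, which matches the interface of struct.Struct.
Purely as an illustration (the real builders live on CHKIndexHeader and pick
their field widths from the header values), a group coder with 4-byte start
and length fields could look like:

    import struct

    group_coder = struct.Struct('>LL')  # 4-byte group start, 4-byte length
    print group_coder.size              # 8
    print group_coder.unpack('\x00\x00\x03\xe8\x00\x00\x00\x0a')  # (1000, 10)
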
@@ -634,17 +648,28 @@
         :param bytes: Bytes of the index starting at the byte 0.
         """
         stuple = static_tuple.StaticTuple
+        end_of_index = self._header.entry_offset + self._header.num_entries * (
+            self._header.num_hash_bytes
+            + self._header.entry_group_offset_bytes
+            + self._header.entry_group_start_bytes
+            + self._header.entry_group_length_bytes)
         if self._header.num_mini_index_entries == 0:
             # (offset, loaded?)
-            self._mini_index = [stuple(self._header.entry_offset, False)]
+            size = end_of_index - self._header.entry_offset
+            self._mini_index = [stuple(self._header.entry_offset, size, False)]
             return
-        self._header._build_mini_index_coder()
         start = self._header.mini_index_offset
         end = self._header.group_index_offset
         format = self._header.mini_index_coder.format[1:]
         format = '>' + (format * self._header.num_mini_index_entries)
-        as_ints = struct.unpack(format, bytes[start:end])
-        self._mini_index = [stuple(i, False) for i in as_ints]
+        start_pos = struct.unpack(format, bytes[start:end])
+        mini_index = []
+        for idx in xrange(self._header.num_mini_index_entries - 1):
+            pos = start_pos[idx]
+            next_pos = start_pos[idx+1]
+            mini_index.append(stuple(pos, next_pos-pos, False))
+        mini_index.append(stuple(next_pos, end_of_index-next_pos, False))
+        self._mini_index = mini_index
 
     def _parse_groups(self, bytes):
         """Parse the group information from bytes.
@@ -654,7 +679,6 @@
         """
         start = self._header.group_index_offset
         end = self._header.entry_offset
-        self._header._build_group_coder()
         format = self._header.group_coder.format[1:]
         format = '>' + (format * self._header.num_groups)
         as_ints = struct.unpack(format, bytes[start:end])
@@ -666,3 +690,48 @@
         """How many entries are in this index."""
         self._ensure_header()
         return self._header.num_entries
+
+    def _bit_key_to_key(self, bit_key):
+        # TODO: intern()?
+        # TODO: Handle when/if we support less-than-full-width keys
+        #       In which case we should probably return a 'psha1:' for 'partial
+        #       sha1' or something along those lines
+        return static_tuple.StaticTuple('sha1:' + binascii.b2a_hex(bit_key))
+
+    def _key_to_bit_key(self, key):
+        # TODO: Handle when/if we support less-than-full-width keys and 'sha1:'
+        #       may be a 'psha1:' key
+        assert len(key) == 1
+        assert key[0].startswith('sha1:')
+        return binascii.a2b_hex(key[0][5:])
+
+    def keys(self):
+        """Return the list of all keys stored in this index."""
+        self._ensure_header()
+        self._read_all()
+        to_key = self._bit_key_to_key
+        return [to_key(bit_key) for bit_key in self._entries]
+
+    def _read_all(self):
+        """Read in all the entry records and store them in self._entries"""
+        offsets = [idx for idx, info in enumerate(self._mini_index)
+                        if not info[2]]
+        self._read_entries(offsets)
+
+    def _read_entries(self, indexes):
+        """Read in the entries corresponding to the given mini_index pages."""
+        stuple = static_tuple.StaticTuple
+        mi = self._mini_index
+        pages = [mi[idx][:2] for idx in indexes]
+        unpack = self._header.entry_coder.unpack
+        size = self._header.entry_coder.size
+        for offset, content in self._transport.readv(self._name, pages):
+            for start in xrange(0, len(content), size):
+                (bit_key, group_offset, inner_start,
+                 inner_len) = unpack(content[start:start+size])
+                group = self._groups[group_offset]
+                self._entries[bit_key] = stuple(group, inner_start, inner_len)
+        # Mark these locations as having been read
+        for idx in indexes:
+            val = self._mini_index[idx]
+            self._mini_index[idx] = stuple(val[0], val[1], True)
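
For reference, the key <-> bit_key conversion added above round-trips as
follows (a standalone sketch; plain tuples and module-level functions stand
in for StaticTuple and the instance methods):

    import binascii

    def key_to_bit_key(key):
        # ('sha1:<40 hex digits>',) -> 20-byte raw digest
        return binascii.a2b_hex(key[0][len('sha1:'):])

    def bit_key_to_key(bit_key):
        # 20-byte raw digest -> ('sha1:<40 hex digits>',)
        return ('sha1:' + binascii.b2a_hex(bit_key),)

    key = ('sha1:0123456789abcdef0123456789abcdef01234567',)
    assert len(key_to_bit_key(key)) == 20
    assert bit_key_to_key(key_to_bit_key(key)) == key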

=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py	2009-10-28 18:24:19 +0000
+++ b/bzrlib/tests/test_chk_index.py	2009-10-28 19:33:22 +0000
@@ -757,7 +757,7 @@
         self.assertEqual(1, index._header.num_groups)
         self.assertIsNot(None, index._groups)
         self.assertEqual([(0, 0)], index._groups)
-        self.assertEqual([(122, False)], index._mini_index)
+        self.assertEqual([(122, 0, False)], index._mini_index)
 
     def test__ensure_header_tiny(self):
         index = self.make_index([(k1, 0, 1000, 0, 10),
@@ -772,7 +772,7 @@
         self.assertEqual([(0, 0), (0, 1000)], index._groups)
         # TODO: the value of _mini_index when there isn't an index present is
         #       currently up for grabs.
-        self.assertEqual([(126, False)], index._mini_index)
+        self.assertEqual([(126, 69, False)], index._mini_index)
 
     def test__ensure_header_small(self):
         nodes = make_chk_nodes(400, 20)
@@ -786,6 +786,26 @@
         groups = sorted(set([(node[1], node[2]) for node in nodes]))
         groups.insert(0, (0, 0))
         self.assertEqual(groups, index._groups)
-        mini_index = [(x, False) for x in [278, 734, 1382, 1958, 2582, 3038,
-            3854, 4454, 5198, 5798, 6398, 7190, 7814, 8294, 8918, 9398]]
+        mini_index = []
+        # TODO: Would be better if this was not hard-coded
+        start_positions = [278, 734, 1382, 1958, 2582, 3038, 3854, 4454, 5198,
+                           5798, 6398, 7190, 7814, 8294, 8918, 9398,
+                           9878] # this is the end of index
+        for idx in range(len(start_positions)-1):
+            pos = start_positions[idx]
+            mini_index.append((pos, start_positions[idx+1] - pos, False))
         self.assertEqual(mini_index, index._mini_index)
+
+    def test_key_count(self):
+        index = self.make_index([(k1, 0, 1000, 0, 10),
+                                 (k2, 0, 1000, 10, 9),
+                                 (k3, 0, 1000, 19, 9),
+                                ])
+        self.assertEqual(3, index.key_count())
+
+    def test_keys(self):
+        index = self.make_index([(k1, 0, 1000, 0, 10),
+                                 (k2, 0, 1000, 10, 9),
+                                 (k3, 0, 1000, 19, 9),
+                                ])
+        self.assertEqual(sorted([k1, k2, k3]), sorted(index.keys()))
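
The new keys() and key_count() tests drive the whole read path:
_ensure_header, then _read_all over the not-yet-loaded mini-index pages,
then the fixed-size record scan in _read_entries. That scan, restated as a
standalone sketch with an assumed struct.Struct-style entry coder and a
made-up record layout:

    import struct

    def iter_entry_records(content, entry_coder):
        """Yield (bit_key, group_offset, inner_start, inner_len) per record."""
        size = entry_coder.size
        for start in xrange(0, len(content), size):
            yield entry_coder.unpack(content[start:start + size])

    # e.g. a 20-byte hash followed by three 4-byte fields:
    coder = struct.Struct('>20sLLL')
    page = coder.pack('\xaa' * 20, 1, 0, 10) + coder.pack('\xbb' * 20, 1, 10, 9)
    print list(iter_entry_records(page, coder))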


