Rev 5523: (a little broken) More internal restructuring. in http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

John Arbash Meinel john at arbash-meinel.com
Fri Nov 19 23:11:52 GMT 2010


At http://bazaar.launchpad.net/~jameinel/bzr/2.3-gcb-peak-mem

------------------------------------------------------------
revno: 5523
revision-id: john at arbash-meinel.com-20101119231136-7nvqe5yn3am0v196
parent: john at arbash-meinel.com-20101119222956-kklvotp0bxojgvia
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.3-gcb-peak-mem
timestamp: Fri 2010-11-19 17:11:36 -0600
message:
  (a little broken) More internal restructuring.
  
  We don't need a pointer to the source object, what we need is the final offset.
  We could switch to just a source pointer, and get rid of the match_tail and
  match_pre. Or we could use a source_idx only, and compute the others on the fly.
  We would lose the ability to know that we were in a limited insert range, which
  allows us to sort the entries logically, but it would let us get rid of a
  lot more data structure.
  Note that you can bisect for the source object if you know the global offset,
  because the sources are stored in order.
  
  Not all tests pass, but the basic matching seems to be correct; we just need
  to update the tests to match.
-------------- next part --------------
=== modified file 'bzrlib/_delta_index_pyx.pyx'
--- a/bzrlib/_delta_index_pyx.pyx	2010-11-19 22:29:56 +0000
+++ b/bzrlib/_delta_index_pyx.pyx	2010-11-19 23:11:36 +0000
@@ -71,8 +71,8 @@
     const_data ptr
     # The RABIN_VAL associated with the matching bytes
     unsigned int rabin_val
-    # The index for the actual SourceInfo object
-    unsigned short source_idx
+    # The number of bytes since the start of all sources
+    unsigned int global_offset
     # The maximum number of bytes that can be matched around this pointer.
     # Note that this is limited to 64kB, and a theoretical match could be
     # longer. However, that will be handled by a later match
@@ -307,7 +307,7 @@
         search.start(rabin_val)
         return search
 
-    cdef rabin_entry *add(self, const_data ptr, unsigned short source_idx,
+    cdef rabin_entry *add(self, const_data ptr, unsigned int global_offset,
                           unsigned int val, Py_ssize_t match_pre,
                           Py_ssize_t match_tail) except NULL:
         """Add this entry to the hash map."""
@@ -328,11 +328,11 @@
         entry.match_pre = match_pre
         entry.match_tail = match_tail
         entry.ptr = ptr
-        entry.source_idx = source_idx
+        entry.global_offset = global_offset
         self.entry_count += 1
         return entry
 
-    def _py_add(self, val, pre_bytes, post_bytes):
+    def _py_add(self, val, global_offset, pre_bytes, post_bytes):
         """Add a new entry to the map. Used by the test suite.
 
         :param val: The RABIN_HASH value for this entry
@@ -341,7 +341,7 @@
         """
         assert pre_bytes < 256
         assert post_bytes < 65536
-        self.add(NULL, 0, val, pre_bytes, post_bytes)
+        self.add(NULL, global_offset, val, pre_bytes, post_bytes)
 
     def _py_find_all(self, val):
         """Find all entries that match the given rabin_val.
@@ -365,6 +365,7 @@
                     # debugging
                     continue
                 # We don't care about match_count or search steps here
+                # TODO: Add the global_offset to this list
                 matches.append((slot.rabin_val, i,
                                 slot.match_pre, slot.match_tail,
                                 0, 0))
@@ -485,9 +486,7 @@
         self.max_bucket_size = MAX_HASH_BUCKET_ENTRIES
 
     def _matches_as_dict(self):
-        cdef SourceInfo source
         cdef unsigned int i
-        cdef Py_ssize_t local_offset
         cdef rabin_entry *slot
 
         matches = {}
@@ -496,14 +495,12 @@
             if (slot.rabin_val == 0
                 and (slot.ptr == NULL or slot.ptr == dummy_ptr)):
                 continue
-            source = self.sources[slot.source_idx]
-            local_offset = slot.ptr - PyString_AS_STRING(source.buf)
-            matches[slot.rabin_val] = source.start_offset + local_offset
+            matches[slot.rabin_val] = slot.global_offset
         return matches
 
-    def _compute_offsets(self, source):
+    def _compute_offsets(self, source, start_offset):
         """Add the hash entries for this data as a 'source' string."""
-        cdef unsigned short source_counter
+        cdef unsigned int global_offset
         cdef const_data ptr
         cdef const_data start
         cdef const_data end
@@ -518,15 +515,12 @@
 
         if not PyString_CheckExact(source):
             raise TypeError('source must be a str')
-        source_counter = <unsigned short>(len(self.sources) - 1)
+        global_offset = start_offset
         size = PyString_GET_SIZE(source)
         if size == 0:
             # TODO: Test that empty strings don't screw things up
             # Nothing to add
             return
-        # I don't fully understand why the first byte is skipped, but that was
-        # the choice made by the old code.
-        num_entries = (size - 1) / (RABIN_WINDOW)
         # We walk backwards through the data, so that early matches are the
         # first entries in the hash buckets.
         start = (<const_data>PyString_AS_STRING(source))
@@ -536,23 +530,24 @@
         #       1/4 chance of having both source and target aligned.
         ptr = start
         last == NULL
-        while ptr < end - RABIN_WINDOW - 1:
+        while ptr < end - RABIN_WINDOW:
             val = 0
             for i from 1 <= i <= RABIN_WINDOW:
                 RABIN_ADD(val, ptr[i])
+            ptr += RABIN_WINDOW
             match_pre = ptr - start
             match_tail = end - ptr
-            ptr += RABIN_WINDOW
+            global_offset += RABIN_WINDOW
             if (last != NULL and val == last.rabin_val):
                 # We have a repeated entry. We only store the first of
                 # consecutive entries, so we just skip this one
                 pass
             else:
-                last = self._hash_map.add(ptr, source_counter,
+                last = self._hash_map.add(ptr, global_offset,
                                           val, match_pre, match_tail)
 
-    def _compute_offsets_from_delta(self, delta_source):
-        cdef unsigned short source_counter
+    def _compute_offsets_from_delta(self, delta_source, start_offset):
+        cdef unsigned int global_offset
         cdef const_data ptr
         cdef const_data start
         cdef const_data end
@@ -569,7 +564,7 @@
 
         if not PyString_CheckExact(delta_source):
             raise TypeError('delta_source must be a str')
-        source_counter = <unsigned short>(len(self.sources) - 1)
+        global_offset = start_offset
         size = PyString_GET_SIZE(delta_source)
         if size == 0:
             # TODO: Test that empty strings don't screw things up
@@ -620,8 +615,8 @@
                     if val == prev_val:
                         # Keep only the first of matching sequences
                         continue
-                    self._hash_map.add(ptr, source_counter, val, match_pre,
-                                       match_tail)
+                    self._hash_map.add(ptr, global_offset + (ptr - start), val,
+                                       match_pre, match_tail)
                     num_entries += 1
                 ptr += c
             else:
@@ -676,7 +671,7 @@
         new_info = SourceInfo(source, start_offset)
         self.sources.append(new_info)
         self._hash_map.reserve(len(source) / RABIN_WINDOW)
-        self._compute_offsets(source)
+        self._compute_offsets(source, start_offset)
         self._limit_hash_buckets()
         self.total_source_bytes += len(source) + extra_offset
 
@@ -690,7 +685,7 @@
         self.sources.append(new_info)
         # This is an upper bound of the number of entries we'll find
         self._hash_map.reserve(len(delta_source) / RABIN_WINDOW)
-        self._compute_offsets_from_delta(delta_source)
+        self._compute_offsets_from_delta(delta_source, start_offset)
         self._limit_hash_buckets()
         self.total_source_bytes += len(delta_source) + extra_offset
 
@@ -718,6 +713,7 @@
     cdef unsigned int insert_count
 
     cdef Py_ssize_t match_size
+    cdef unsigned int match_offset
     cdef rabin_entry *match_entry
 
     cdef int noisy
@@ -832,7 +828,6 @@
     cdef const_data _expand_match(self, const_data data) except NULL:
         """Check if we can expand the current match by data we've output."""
         cdef const_data ref_data
-        cdef SourceInfo ref_source
         cdef Py_ssize_t match_pre
 
         if not self.insert_count:
@@ -840,11 +835,12 @@
             return data
         ref_data = self.match_entry.ptr
         match_pre = self.match_entry.match_pre
-        while (match_pre > 0
-               and ref_data[match_pre - 1] == data[-1]):
+        while (match_pre > 0 and ref_data[-1] == data[-1]):
             self.match_size += 1
             match_pre -= 1
             data -= 1
+            ref_data -= 1
+            self.match_offset -= 1
             self.out_pos -= 1
             self.insert_count -= 1
             if self.insert_count > 0:
@@ -897,23 +893,17 @@
 
     cdef const_data _output_match(self, const_data data) except NULL:
         """We have a match that we like, insert it into the output stream"""
-        cdef SourceInfo source
         cdef Py_ssize_t match_size
         cdef Py_ssize_t local_offset
 
+        self.match_offset = self.match_entry.global_offset
         data = self._expand_match(data)
         self._finalize_pending_insert()
         match_size = self.match_size
         # Matches are limited to 64kB in the current version
         if match_size > 0x10000:
             match_size = 0x10000
-        source = self.index.sources[self.match_entry.source_idx]
-        local_offset = data - PyString_AS_STRING(source.buf)
-        if local_offset < 0 or local_offset > PyString_GET_SIZE(source.buf):
-            raise RuntimeError("match start after source content len")
-        if local_offset + self.match_size > PyString_GET_SIZE(source.buf):
-            raise RuntimeError("match end after than source content")
-        self._output_copy(local_offset + source.start_offset, match_size)
+        self._output_copy(self.match_offset, match_size)
 
         data += match_size
         if self.match_size > match_size:

=== modified file 'bzrlib/tests/test__delta_index.py'
--- a/bzrlib/tests/test__delta_index.py	2010-11-19 20:05:10 +0000
+++ b/bzrlib/tests/test__delta_index.py	2010-11-19 23:11:36 +0000
@@ -111,6 +111,11 @@
         super(TestCaseWithRabinIndex, self).setUp()
         self.index = self._module.RabinIndex()
 
+    def assertHashMatches(self, expected, rabin_val):
+        """Assert that we get the expected matches."""
+        matches = self.index._hash_map._py_find_all(rabin_val)
+        self.assertEqual(expected, matches)
+
 
 class TestRabinHashMap(tests.TestCase):
 
@@ -238,18 +243,11 @@
         val = self._module._py_rabin(source[1:])
         self.index.add_source(source)
         self.assertEqual(1, self.index.num_entries)
-        self.assertEqual(0x0f, self.index.hash_mask)
-        bucket = self.index.buckets[val & self.index.hash_mask]
         self.assertEqual(1, len(self.index.sources))
         self.assertEqual(source, self.index.sources[0].buf)
         self.assertEqual(0, self.index.sources[0].start_offset)
-        self.assertIsNot(None, bucket)
-        offset = bucket.first
-        self.assertIsNot(None, offset)
-        self.assertEqual(val, offset.val)
-        self.assertIs(None, offset.next)
-        # The pointer points to the end of the matching section
-        self.assertEqual(16, self.index._offset_into_source(offset))
+        # rabin_val, offset, match_pre, match_tail, count, step
+        self.assertHashMatches([(val, val & 1023, 16, 1, 1, 1)], val)
 
     def test_add_two_sources(self):
         source = '01234567890123456'
@@ -257,22 +255,15 @@
         self.index.add_source(source)
         self.index.add_source(source, 2)
         self.assertEqual(2, self.index.num_entries)
-        self.assertEqual(0x0f, self.index.hash_mask)
         self.assertEqual(2, len(self.index.sources))
         self.assertEqual(source, self.index.sources[0].buf)
         self.assertEqual(source, self.index.sources[1].buf)
         self.assertEqual(0, self.index.sources[0].start_offset)
         self.assertEqual(19, self.index.sources[1].start_offset)
-        bucket = self.index.buckets[val & self.index.hash_mask]
-        self.assertIsNot(None, bucket)
-        self.assertEqual(2, bucket.count)
-        offset = bucket.first
-        self.assertIsNot(None, offset)
-        self.assertEqual(val, offset.val)
-        offset = offset.next
-        self.assertIsNot(None, offset)
-        self.assertEqual(val, offset.val)
-        self.assertIs(None, offset.next)
+        # rabin_val, offset, match_pre, match_tail, count, step
+        self.assertHashMatches([(val, val & 1023, 16, 1, 1, 1),
+                                (val, (val & 1023) + 1, 16, 1, 2, 2),
+                               ], val)
 
     def test_add_source_continuous_repeated_content(self):
         source = 'x' + ('1234567890123456' * 4)



More information about the bazaar-commits mailing list