Rev 57: We now basically have full support for using diff-delta as the compressor, in http://bzr.arbash-meinel.com/plugins/groupcompress_rabin

John Arbash Meinel john at arbash-meinel.com
Fri Feb 27 19:54:28 GMT 2009


At http://bzr.arbash-meinel.com/plugins/groupcompress_rabin

------------------------------------------------------------
revno: 57
revision-id: john at arbash-meinel.com-20090227195427-5rw3pjlgkssido0d
parent: john at arbash-meinel.com-20090227184307-h8zgtnf217omdw1h
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: groupcompress_rabin
timestamp: Fri 2009-02-27 13:54:27 -0600
message:
  We now basically have full support for using diff-delta as the compressor.
  
  Will still need some tuning/tweaking to see how we want to proceed.
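
For reference, each record the compressor now emits is a small text header
followed by a payload: an action line ('fulltext' or 'delta'), a 'label: '
line, and a 'sha1: ' line, then either the raw text or a diff-delta made
against the bytes already in the group. A minimal sketch of that layout and
of the round trip through the parse() helper added below (the label and
text here are invented for illustration):

    from bzrlib.osutils import sha_string

    # Invented example record; real labels are '\x00'-joined key tuples.
    text = 'hello world\n'
    record = ('fulltext\n'                # 'delta\n' when a delta was made
              'label: file-id\x00rev-id\n'
              'sha1: %s\n' % sha_string(text)) + text

    # Mirrors parse() in groupcompress.py below.
    action, label_line, sha1_line, payload = record.split('\n', 3)
    assert action == 'fulltext'
    assert tuple(label_line[7:].split('\x00')) == ('file-id', 'rev-id')
    assert sha1_line[6:] == sha_string(text)
    assert payload == text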
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-02-27 17:36:23 +0000
+++ b/groupcompress.py	2009-02-27 19:54:27 +0000
@@ -53,6 +53,17 @@
     )
 
 
+def parse(bytes):
+    action, label_line, sha1_line, delta_bytes = bytes.split('\n', 3)
+    if (action not in ('fulltext', 'delta')
+        or not label_line.startswith('label: ')
+        or not sha1_line.startswith('sha1: ')):
+        raise AssertionError("bad text record %r" % (bytes,))
+    label = tuple(label_line[7:].split('\x00'))
+    sha1 = sha1_line[6:]
+    return action, label, sha1, delta_bytes
+
+
 def sort_gc_optimal(parent_map):
     """Sort and group the keys in parent_map into gc-optimal order.
 
@@ -102,7 +113,7 @@
 
         :param delta: If False, do not compress records.
         """
-        self._compressed = []
+        self.lines = []
         self.endpoint = 0
         self.input_bytes = 0
         self.labels_deltas = {}
@@ -127,14 +138,21 @@
         if key[-1] is None:
             key = key[:-1] + ('sha1:' + sha1,)
         label = '\x00'.join(key)
-        ## new_lines = []
-        input_len = sum(map(len, chunks))
-        new_lines = ['label: %s\n' % label,
-                     'sha1: %s\n' % sha1,
-                     'len: %s\n' % input_len,
-                    ]
+        target_text = ''.join(chunks)
+        input_len = len(target_text)
+        new_chunks = ['label: %s\nsha1: %s\n' % (label, sha1)]
+        source_text = ''.join(self.lines)
+        delta = _groupcompress_c.make_delta(source_text, target_text)
+        if delta is None:
+            # We can't delta (perhaps source_text is empty)
+            # so mark this as an insert
+            new_chunks.insert(0, 'fulltext\n')
+            new_chunks.extend(chunks)
+        else:
+            new_chunks.insert(0, 'delta\n')
+            new_chunks.append(delta)
         delta_start = (self.endpoint, len(self.lines))
-        self.output_lines(new_lines, index_lines)
+        self.output_chunks(new_chunks)
         self.input_bytes += input_len
         delta_end = (self.endpoint, len(self.lines))
         self.labels_deltas[key] = (delta_start, delta_end)
@@ -147,73 +165,26 @@
         :return: An iterable over bytes and the sha1.
         """
         delta_details = self.labels_deltas[key]
-        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
-        label, sha1, delta = parse(delta_lines)
-        ## delta = parse(delta_lines)
+        delta_chunks = self.lines[delta_details[0][1]:delta_details[1][1]]
+        action, label, sha1, delta = parse(''.join(delta_chunks))
         if label != key:
             raise AssertionError("wrong key: %r, wanted %r" % (label, key))
-        # Perhaps we want to keep the line offsets too in memory at least?
-        chunks = apply_delta(''.join(self.lines), delta)
-        sha1 = sha_strings(chunks)
-        return chunks, sha1
-
-    def flush_multi(self, instructions, lines, new_lines, index_lines):
-        """Flush a bunch of different ranges out.
-
-        This should only be called with data that are "pure" copies.
-        """
-        flush_range = self.flush_range
-        if len(instructions) > 2:
-            # This is the number of lines to be copied
-            total_copy_range = sum(i[2] for i in instructions)
-            if len(instructions) > 0.5 * total_copy_range:
-                # We are copying N lines, but taking more than N/2
-                # copy instructions to do so. We will go ahead and expand this
-                # text so that other code is able to match against it
-                flush_range(instructions[0][0], None, total_copy_range,
-                            lines, new_lines, index_lines)
-                return
-        for ns, os, rl in instructions:
-            flush_range(ns, os, rl, lines, new_lines, index_lines)
-
-    def flush_range(self, range_start, copy_start, range_len, lines, new_lines, index_lines):
-        insert_instruction = "i,%d\n" % range_len
-        if copy_start is not None:
-            # range stops, flush and start a new copy range
-            stop_byte = self.line_offsets[copy_start + range_len - 1]
-            if copy_start == 0:
-                start_byte = 0
-            else:
-                start_byte = self.line_offsets[copy_start - 1]
-            bytes = stop_byte - start_byte
-            copy_control_instruction = "c,%d,%d\n" % (start_byte, bytes)
-            if (bytes + len(insert_instruction) >
-                len(copy_control_instruction)):
-                new_lines.append(copy_control_instruction)
-                index_lines.append(False)
-                return
-        # not copying, or inserting is shorter than copying, so insert.
-        new_lines.append(insert_instruction)
-        new_lines.extend(lines[range_start:range_start+range_len])
-        index_lines.append(False)
-        index_lines.extend([copy_start is None]*range_len)
-
-    def output_lines(self, new_lines, index_lines):
-        """Output some lines.
-
-        :param new_lines: The lines to output.
-        :param index_lines: A boolean flag for each line - when True, index
-            that line.
-        """
-        # indexed_newlines = [idx for idx, val in enumerate(index_lines)
-        #                          if val and new_lines[idx] == '\n']
-        # if indexed_newlines:
-        #     import pdb; pdb.set_trace()
+        if action == 'fulltext':
+            bytes = delta
+        else:
+            source = ''.join(self.lines[:delta_details[0][1]])
+            bytes = _groupcompress_c.apply_delta(source, delta)
+        sha1 = sha_string(bytes)
+        return [bytes], sha1
+
+    def output_chunks(self, new_chunks):
+        """Output some chunks.
+
+        :param new_chunks: The chunks to output.
+        """
         endpoint = self.endpoint
-        self.line_locations.extend_lines(new_lines, index_lines)
-        for line in new_lines:
-            endpoint += len(line)
-            self.line_offsets.append(endpoint)
+        self.lines.extend(new_chunks)
+        endpoint += sum(map(len, new_chunks))
         self.endpoint = endpoint
 
     def ratio(self):
@@ -397,18 +368,7 @@
                     result[key] = self._unadded_refs[key]
         return result
 
-    def _get_delta_lines(self, key):
-        """Helper function for manual debugging.
-
-        This is a convenience function that shouldn't be used in production
-        code.
-        """
-        build_details = self._index.get_build_details([key])[key]
-        index_memo = build_details[0]
-        group, delta_lines = self._get_group_and_delta_lines(index_memo)
-        return delta_lines
-
-    def _get_group_and_delta_lines(self, index_memo):
+    def _get_group_and_delta_bytes(self, index_memo):
         read_memo = index_memo[0:3]
         # get the group:
         try:
@@ -427,7 +387,7 @@
         # parse - requires split_lines, better to have byte offsets
         # here (but not by much - we only split the region for the
         # recipe, and we often want to end up with lines anyway).
-        return plain, split_lines(plain[index_memo[3]:index_memo[4]])
+        return plain, plain[index_memo[3]:index_memo[4]]
 
     def get_missing_compression_parent_keys(self):
         """Return the keys of missing compression parents.
@@ -506,13 +466,17 @@
                 parents = self._unadded_refs[key]
             else:
                 index_memo, _, parents, (method, _) = locations[key]
-                plain, delta_lines = self._get_group_and_delta_lines(index_memo)
-                ## delta = parse(delta_lines)
-                label, sha1, delta = parse(delta_lines)
+                plain, delta_bytes = self._get_group_and_delta_bytes(index_memo)
+                action, label, sha1, delta = parse(delta_bytes)
                 if label != key:
                     raise AssertionError("wrong key: %r, wanted %r" % (label, key))
-                chunks = apply_delta(plain, delta)
-                ## sha1 = sha_strings(lines)
+                if action == 'fulltext':
+                    chunks = [delta]
+                else:
+                    # TODO: relax apply_delta so that it can allow source to be
+                    #       longer than expected
+                    chunks = [_groupcompress_c.apply_delta(
+                                plain[0:index_memo[3]], delta)]
                 if sha_strings(chunks) != sha1:
                     raise AssertionError('sha1 sum did not match')
             yield ChunkedContentFactory(key, parents, sha1, chunks)
@@ -889,9 +853,6 @@
 
 
 try:
-    from bzrlib.plugins.groupcompress import _groupcompress_c
+    from bzrlib.plugins.groupcompress_rabin import _groupcompress_c
 except ImportError:
     pass
-else:
-    GroupCompressor._equivalence_table_class = _groupcompress_c.EquivalenceTable
-    _get_longest_match = _groupcompress_c._get_longest_match

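The compress/extract round trip above now reduces to two calls into the
compiled extension. A rough sketch of the control flow, assuming only the
signatures visible in this diff (make_delta(source, target) returns delta
bytes or None; apply_delta(source, delta) returns the target bytes); the
helper names are invented:

    from bzrlib.plugins.groupcompress_rabin import _groupcompress_c

    def compress_text(source_text, target_text):
        # Try to delta against every byte already stored in the group.
        delta = _groupcompress_c.make_delta(source_text, target_text)
        if delta is None:
            # No usable delta (e.g. the group is still empty), so store
            # the full text, as compress() does above.
            return 'fulltext', target_text
        return 'delta', delta

    def expand_text(source_text, action, payload):
        # Inverse of compress_text(); source_text must be exactly the
        # bytes the delta was made against (hence the TODO above about
        # relaxing apply_delta).
        if action == 'fulltext':
            return payload
        return _groupcompress_c.apply_delta(source_text, payload)
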
=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py	2009-02-27 03:28:10 +0000
+++ b/tests/test_groupcompress.py	2009-02-27 19:54:27 +0000
@@ -21,7 +21,7 @@
 
 from bzrlib import tests
 from bzrlib.osutils import sha_strings
-from bzrlib.plugins.groupcompress import errors, groupcompress
+from bzrlib.plugins.groupcompress_rabin import errors, groupcompress
 from bzrlib.tests import (
     TestCaseWithTransport,
     TestScenarioApplier,
@@ -35,7 +35,7 @@
     vf_interface_tests = loader.loadTestsFromTestCase(TestVersionedFiles)
     cleanup_pack_group = groupcompress.cleanup_pack_group
     make_pack_factory = groupcompress.make_pack_factory
-    group_scenario = ('groupcompress-nograph', {
+    group_scenario = ('groupcompressrabin-nograph', {
             'cleanup':cleanup_pack_group,
             'factory':make_pack_factory(False, False, 1),
             'graph': False,
@@ -84,6 +84,7 @@
         self.assertEqual(sha_strings(['common long line\n', 'different\n']),
                          sha1_2)
         expected_lines.extend([
+            'delta\n',
             'label: newlabel\n',
             'sha1: %s\n' % sha1_2,
             # copy the line common

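To tie the new expectation back to the format: the second text's record now
begins with an action line, so a group interleaves headers and payloads. A
sketch with invented texts (the copy/insert instructions the test goes on
to check are elided here, as they are in the hunk above):

    from bzrlib.osutils import sha_string
    from bzrlib.plugins.groupcompress_rabin import _groupcompress_c

    text1 = 'common long line\n'
    text2 = 'common long line\ndifferent\n'

    # First record: there is no source yet, so it is stored as a fulltext.
    group = ('fulltext\nlabel: label\nsha1: %s\n' % sha_string(text1)) + text1

    # Second record: delta against *all* bytes already in the group,
    # headers included.
    source = group
    delta = _groupcompress_c.make_delta(source, text2)
    if delta is None:
        # make_delta may decline on tiny inputs; the compressor then
        # falls back to another fulltext record.
        group += ('fulltext\nlabel: newlabel\nsha1: %s\n'
                  % sha_string(text2)) + text2
    else:
        group += ('delta\nlabel: newlabel\nsha1: %s\n'
                  % sha_string(text2)) + delta
        # Expansion applies the delta to the bytes before the record.
        assert _groupcompress_c.apply_delta(source, delta) == text2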

