Rev 106: fix up the failing tests. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
John Arbash Meinel
john at arbash-meinel.com
Wed Mar 4 15:36:11 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
------------------------------------------------------------
revno: 106
revision-id: john at arbash-meinel.com-20090304152748-iqp4zqlzvnq5pm23
parent: john at arbash-meinel.com-20090304150015-b6o2fru8grx5ubpm
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Wed 2009-03-04 09:27:48 -0600
message:
fix up the failing tests.
The new delta code needs a 16-byte window to match, so to *know* that there will
be a match, you need ~32 bytes in common (this guarantees that some 16-byte run
within that 32-byte range will match).
Also, when setting 'max_delta', it is possible that we run out of bytes before
we actually find the last match -- a match that would have made things compress
better. This is rare in practice, because real texts are longer than 40 bytes,
but it happens in testing.
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2009-03-03 22:50:27 +0000
+++ b/groupcompress.py 2009-03-04 15:27:48 +0000
@@ -134,14 +134,14 @@
self.labels_deltas = {}
self._delta_index = _groupcompress_pyx.DeltaIndex()
- def compress(self, key, chunks, expected_sha, soft=False):
+ def compress(self, key, bytes, expected_sha, soft=False):
"""Compress lines with label key.
:param key: A key tuple. It is stored in the output
for identification of the text during decompression. If the last
element is 'None' it is replaced with the sha1 of the text -
e.g. sha1:xxxxxxx.
- :param chunks: The chunks to be compressed
+ :param bytes: The bytes to be compressed
:param expected_sha: If non-None, the sha the lines are believed to
have. During compression the sha is calculated; a mismatch will
cause an error.
@@ -150,9 +150,6 @@
:return: The sha1 of lines, and the number of bytes accumulated in
the group output so far.
"""
- # TODO: Change this to a bytes interface, since the output is now a
- # bytes interface anyway.
- bytes = ''.join(chunks)
if not _FAST or expected_sha is None:
sha1 = sha_string(bytes)
else:
@@ -629,7 +626,7 @@
groups += 1
last_prefix = prefix
found_sha1, end_point = self._compressor.compress(record.key,
- [bytes], record.sha1, soft=soft)
+ bytes, record.sha1, soft=soft)
if record.key[-1] is None:
key = record.key[:-1] + ('sha1:' + found_sha1,)
else:
=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py 2009-03-02 19:43:37 +0000
+++ b/tests/test_groupcompress.py 2009-03-04 15:27:48 +0000
@@ -20,7 +20,7 @@
import zlib
from bzrlib import tests
-from bzrlib.osutils import sha_strings
+from bzrlib.osutils import sha_string
from bzrlib.plugins.groupcompress_rabin import errors, groupcompress
from bzrlib.tests import (
TestCaseWithTransport,
@@ -60,8 +60,8 @@
# diff against NUKK
compressor = groupcompress.GroupCompressor(True)
sha1, end_point = compressor.compress(('label',),
- ['strange\n', 'common\n'], None)
- self.assertEqual(sha_strings(['strange\n', 'common\n']), sha1)
+ 'strange\ncommon\n', None)
+ self.assertEqual(sha_string('strange\ncommon\n'), sha1)
expected_lines = [
'fulltext\n',
'label:label\nsha1:%s\n' % sha1,
@@ -71,27 +71,44 @@
self.assertEqual(expected_lines, compressor.lines)
self.assertEqual(sum(map(len, expected_lines)), end_point)
+ def _chunks_to_repr_lines(self, chunks):
+ return '\n'.join(map(repr, ''.join(chunks).split('\n')))
+
+ def assertEqualDiffEncoded(self, expected, actual):
+ """Compare the actual content to the expected content.
+
+ :param expected: A group of chunks that we expect to see
+ :param actual: The measured 'chunks'
+
+ We will transform the chunks back into lines, and then run 'repr()'
+ over them to handle non-ascii characters.
+ """
+ self.assertEqualDiff(self._chunks_to_repr_lines(expected),
+ self._chunks_to_repr_lines(actual))
+
def test_two_nosha_delta(self):
compressor = groupcompress.GroupCompressor(True)
sha1_1, _ = compressor.compress(('label',),
- ['strange\n', 'common very very very long line\n'], None)
+ 'strange\ncommon long line\nthat needs a 16 byte match\n', None)
expected_lines = list(compressor.lines)
sha1_2, end_point = compressor.compress(('newlabel',),
- ['common very very very long line\n', 'different\n'], None)
- self.assertEqual(sha_strings(['common very very very long line\n',
- 'different\n']), sha1_2)
+ 'common long line\nthat needs a 16 byte match\ndifferent\n', None)
+ self.assertEqual(sha_string('common long line\n'
+ 'that needs a 16 byte match\n'
+ 'different\n'), sha1_2)
expected_lines.extend([
'delta\n'
'label:newlabel\n',
'sha1:%s\n' % sha1_2,
'len:16\n',
+ # source and target length
+ '\x7e\x36',
# copy the line common
- 'c,72,17\n',
+ '\x91\x52\x2c', #copy, offset 0x52, len 0x2c
# add the line different, and the trailing newline
- 'i,2\n',
- 'different\n',
+ '\x0adifferent\n', # insert 10 bytes
])
- self.assertEqualDiff(''.join(expected_lines), ''.join(compressor.lines))
+ self.assertEqualDiffEncoded(expected_lines, compressor.lines)
self.assertEqual(sum(map(len, expected_lines)), end_point)
def test_three_nosha_delta(self):
@@ -99,52 +116,50 @@
# both parents.
compressor = groupcompress.GroupCompressor(True)
sha1_1, end_point = compressor.compress(('label',),
- ['strange\n', 'common long line\n'], None)
+ 'strange\ncommon very very long line\nwith some extra text\n', None)
sha1_2, _ = compressor.compress(('newlabel',),
- ['common long line\n', 'different\n', 'moredifferent\n'], None)
+ 'different\nmoredifferent\nand then some more\n', None)
expected_lines = list(compressor.lines)
sha1_3, end_point = compressor.compress(('label3',),
- ['new\n', 'common long line\n', 'different\n', 'moredifferent\n'],
+ 'new\ncommon very very long line\nwith some extra text\n'
+ 'different\nmoredifferent\nand then some more\n',
None)
self.assertEqual(
- sha_strings(['new\n', 'common long line\n', 'different\n',
- 'moredifferent\n']),
+ sha_string('new\ncommon very very long line\nwith some extra text\n'
+ 'different\nmoredifferent\nand then some more\n'),
sha1_3)
expected_lines.extend([
'delta\n',
'label:label3\n',
'sha1:%s\n' % sha1_3,
- 'len:11\n',
+ 'len:13\n',
+ '\xfa\x01\x5f' # source and target length
# insert new
- 'i,1\n',
- 'new\n',
- # copy the line common
- 'c,72,17\n',
- # copy the lines different, moredifferent and trailing newline
- 'c,165,25\n',
+ '\x03new',
+ # Copy of first parent 'common' range
+ '\x91\x51\x31' # copy, offset 0x51, 0x31 bytes
+ # Copy of second parent 'different' range
+ '\x91\xcf\x2b' # copy, offset 0xcf, 0x2b bytes
])
- self.assertEqualDiff(''.join(expected_lines),
- ''.join(compressor.lines))
+ self.assertEqualDiffEncoded(expected_lines, compressor.lines)
self.assertEqual(sum(map(len, expected_lines)), end_point)
def test_stats(self):
compressor = groupcompress.GroupCompressor(True)
- compressor.compress(('label',),
- ['strange\n', 'common\n'], None)
+ compressor.compress(('label',), 'strange\ncommon\n', None)
compressor.compress(('newlabel',),
- ['common\n', 'different\n', 'moredifferent\n'], None)
+ 'common\ndifferent\nmoredifferent\n', None)
compressor.compress(('label3',),
- ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
+ 'new\ncommon\ndifferent\nmoredifferent\n', None)
self.assertAlmostEqual(0.3, compressor.ratio(), 1)
def test_extract_from_compressor(self):
# Knit fetching will try to reconstruct texts locally which results in
# reading something that is in the compressor stream already.
compressor = groupcompress.GroupCompressor(True)
- sha_1, _ = compressor.compress(('label',),
- ['strange\n', 'common\n'], None)
+ sha_1, _ = compressor.compress(('label',), 'strange\ncommon\n', None)
sha_2, _ = compressor.compress(('newlabel',),
- ['common\n', 'different\n', 'moredifferent\n'], None)
+ 'common\ndifferent\nmoredifferent\n', None)
# get the first out
self.assertEqual((['strange\ncommon\n'], sha_1),
compressor.extract(('label',)))
More information about the bazaar-commits
mailing list