Rev 3898: Change GroupCompressor.compress() to return the start_point. in http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core

John Arbash Meinel john at arbash-meinel.com
Mon Mar 23 20:37:11 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core

------------------------------------------------------------
revno: 3898
revision-id: john at arbash-meinel.com-20090323203538-kj155dzrd5epzaf5
parent: john at arbash-meinel.com-20090323201046-ek580vnq69i270lp
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: brisbane-core
timestamp: Mon 2009-03-23 15:35:38 -0500
message:
  Change GroupCompressor.compress() to return the start_point.
  
  Also, mark empty content with start=end=0.
  This also gives us a good starting point for handling duplicate entries
  (if we find that makes a difference).
  From experimentation, using 0,0 for empty entries makes a big difference
  in the text index, mostly because about half of all entries have no
  content (all of the directory records, for example), so it lets the
  compression shrink the index a bit.
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-23 20:04:42 +0000
+++ b/bzrlib/groupcompress.py	2009-03-23 20:35:38 +0000
@@ -59,6 +59,10 @@
 _NO_LABELS = True
 _FAST = False
 
+# osutils.sha_string('')
+_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+
+
 def encode_base128_int(val):
     """Convert an integer into a 7-bit lsb encoding."""
     bytes = []
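
The _null_sha1 constant added above is just the SHA-1 of the empty
string, per the osutils.sha_string('') comment; the stdlib spelling
verifies it:

    import hashlib
    assert (hashlib.sha1('').hexdigest() ==
            'da39a3ee5e6b4b0d3255bfef95601890afd80709')
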
@@ -558,17 +562,16 @@
         compressor = GroupCompressor()
         tstart = time.time()
         old_length = self._block._content_length
-        cur_endpoint = 0
+        end_point = 0
         for factory in self._factories:
             bytes = factory.get_bytes_as('fulltext')
-            (found_sha1, end_point, type,
+            (found_sha1, start_point, end_point, type,
              length) = compressor.compress(factory.key, bytes, factory.sha1)
             # Now update this factory with the new offsets, etc
             factory.sha1 = found_sha1
-            factory._start = cur_endpoint
+            factory._start = start_point
             factory._end = end_point
-            cur_endpoint = end_point
-        self._last_byte = cur_endpoint
+        self._last_byte = end_point
         new_block = compressor.flush()
         # TODO: Should we check that new_block really *is* smaller than the old
         #       block? It seems hard to come up with a method that it would
@@ -770,12 +773,21 @@
             the group output so far.
         :seealso VersionedFiles.add_lines:
         """
-        if not _FAST or expected_sha is None:
-            sha1 = sha_string(bytes)
-        else:
+        if not bytes: # empty, like a dir entry, etc
+            if nostore_sha == _null_sha1:
+                raise errors.ExistingContent()
+            self._block.add_entry(key, type='empty',
+                                  sha1=None, start=0,
+                                  length=0)
+            return _null_sha1, 0, 0, 'fulltext', 0
+        # we assume someone knew what they were doing when they passed it in
+        if expected_sha is not None:
             sha1 = expected_sha
-        if sha1 == nostore_sha:
-            raise errors.ExistingContent()
+        else:
+            sha1 = osutils.sha_string(bytes)
+        if nostore_sha is not None:
+            if sha1 == nostore_sha:
+                raise errors.ExistingContent()
         if key[-1] is None:
             key = key[:-1] + ('sha1:' + sha1,)
         input_len = len(bytes)
@@ -813,6 +825,7 @@
                 self._delta_index.add_delta_source(delta, len_mini_header)
         self._block.add_entry(key, type=type, sha1=sha1,
                               start=self.endpoint, length=length)
+        start = self.endpoint
         delta_start = (self.endpoint, len(self.lines))
         self.num_keys += 1
         self.output_chunks(new_chunks)
@@ -823,7 +836,7 @@
             raise AssertionError('the delta index is out of sync '
                 'with the output lines %s != %s'
                 % (self._delta_index._source_offset, self.endpoint))
-        return sha1, self.endpoint, type, length
+        return sha1, start, self.endpoint, type, length
 
     def extract(self, key):
         """Extract a key previously added to the compressor.
@@ -1451,7 +1464,7 @@
             if max_fulltext_len < len(bytes):
                 max_fulltext_len = len(bytes)
                 max_fulltext_prefix = prefix
-            (found_sha1, end_point, type,
+            (found_sha1, start_point, end_point, type,
              length) = self._compressor.compress(record.key,
                 bytes, record.sha1, soft=soft,
                 nostore_sha=nostore_sha)
@@ -1499,9 +1512,8 @@
             if start_new_block:
                 self._compressor.pop_last()
                 flush()
-                basis_end = 0
                 max_fulltext_len = len(bytes)
-                (found_sha1, end_point, type,
+                (found_sha1, start_point, end_point, type,
                  length) = self._compressor.compress(record.key,
                     bytes, record.sha1)
                 last_fulltext_len = length
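
Taken together, the compress() hunks above give empty texts a fast
path: nothing is written to the group, the entry is recorded at 0,0,
and nostore_sha still short-circuits.  A sketch of the observable
behaviour (mirroring the new test below):

    from bzrlib import errors, groupcompress

    compressor = groupcompress.GroupCompressor()
    sha1, start, end, kind, length = compressor.compress(
        ('a-dir-entry',), '', None)
    # -> (_null_sha1, 0, 0, 'fulltext', 0); the output is untouched
    assert (start, end, length) == (0, 0, 0)
    assert compressor.lines == []
    # a nostore_sha matching the empty sha raises ExistingContent
    try:
        compressor.compress(('another',), '', None,
                            nostore_sha=groupcompress._null_sha1)
    except errors.ExistingContent:
        pass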

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-20 15:43:10 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-23 20:35:38 +0000
@@ -42,15 +42,37 @@
     def test_one_nosha_delta(self):
        # diff against NULL
         compressor = groupcompress.GroupCompressor()
-        sha1, end_point, _, _ = compressor.compress(('label',),
+        sha1, start_point, end_point, _, _ = compressor.compress(('label',),
             'strange\ncommon\n', None)
         self.assertEqual(sha_string('strange\ncommon\n'), sha1)
         expected_lines = [
             'f', '\x0f', 'strange\ncommon\n',
             ]
         self.assertEqual(expected_lines, compressor.lines)
+        self.assertEqual(0, start_point)
         self.assertEqual(sum(map(len, expected_lines)), end_point)
 
+    def test_empty_content(self):
+        compressor = groupcompress.GroupCompressor()
+        # Adding empty bytes should return the 'null' record
+        sha1, start_point, end_point, kind, _ = compressor.compress(('empty',),
+            '', None)
+        self.assertEqual(0, start_point)
+        self.assertEqual(0, end_point)
+        self.assertEqual('fulltext', kind)
+        self.assertEqual(groupcompress._null_sha1, sha1)
+        self.assertEqual(0, compressor.endpoint)
+        self.assertEqual([], compressor.lines)
+        # Even after adding some content
+        compressor.compress(('content',), 'some\nbytes\n', None)
+        self.assertTrue(compressor.endpoint > 0)
+        sha1, start_point, end_point, kind, _ = compressor.compress(('empty2',),
+            '', None)
+        self.assertEqual(0, start_point)
+        self.assertEqual(0, end_point)
+        self.assertEqual('fulltext', kind)
+        self.assertEqual(groupcompress._null_sha1, sha1)
+
     def _chunks_to_repr_lines(self, chunks):
         return '\n'.join(map(repr, ''.join(chunks).split('\n')))
 
@@ -68,10 +90,10 @@
 
     def test_two_nosha_delta(self):
         compressor = groupcompress.GroupCompressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
         expected_lines = list(compressor.lines)
-        sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
+        sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         self.assertEqual(sha_string('common long line\n'
                                     'that needs a 16 byte match\n'
@@ -93,12 +115,12 @@
         # The first interesting test: make a change that should use lines from
         # both parents.
         compressor = groupcompress.GroupCompressor()
-        sha1_1, end_point, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _, _ = compressor.compress(('label',),
             'strange\ncommon very very long line\nwith some extra text\n', None)
-        sha1_2, _, _, _ = compressor.compress(('newlabel',),
+        sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
             'different\nmoredifferent\nand then some more\n', None)
         expected_lines = list(compressor.lines)
-        sha1_3, end_point, _, _ = compressor.compress(('label3',),
+        sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
             'new\ncommon very very long line\nwith some extra text\n'
             'different\nmoredifferent\nand then some more\n',
             None)
@@ -137,10 +159,10 @@
         # Knit fetching will try to reconstruct texts locally which results in
         # reading something that is in the compressor stream already.
         compressor = groupcompress.GroupCompressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
+        sha1_1, _, _, _, _ = compressor.compress(('label',),
             'strange\ncommon long line\nthat needs a 16 byte match\n', None)
         expected_lines = list(compressor.lines)
-        sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
+        sha1_2, _, end_point, _, _ = compressor.compress(('newlabel',),
             'common long line\nthat needs a 16 byte match\ndifferent\n', None)
         # get the first out
         self.assertEqual(('strange\ncommon long line\n'
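
The index-shrinking claim in the commit message comes down to value
compression: with start=end=0, roughly half of the text-index rows
carry an identical value, which the index's zlib compression squeezes
down far better than distinct offsets would.  A rough standalone
illustration of the effect (not the actual index code):

    import zlib

    distinct = ''.join('%d %d\n' % (i, i + 10) for i in range(5000))
    repeated = '0 0\n' * 5000
    # identical rows compress far better than distinct offset pairs
    print len(zlib.compress(distinct)), len(zlib.compress(repeated))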