Rev 3903: groupcompress now copies the blocks exactly as they were given. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/lazy_gc_stream
John Arbash Meinel
john at arbash-meinel.com
Tue Mar 17 17:46:26 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/lazy_gc_stream
------------------------------------------------------------
revno: 3903
revision-id: john at arbash-meinel.com-20090317174617-osa5ia09no26xm1w
parent: john at arbash-meinel.com-20090317161231-nzb4dk8t35ucw84u
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: lazy_gc_stream
timestamp: Tue 2009-03-17 12:46:17 -0500
message:
groupcompress now copies the blocks exactly as they were given.
One major concern here is that 'topo_sort' is not particularly stable. For example,
given a history of a=>b=>c=>d and e=>f=>g=>h, it can easily order the contents as
h,a,b,c,d,e,f,g, which interleaves unrelated histories (the sketch after this
message illustrates why).
This will actually cause the e-h group to be transmitted twice, effectively
'bloating' the stream.
We can still tell 'get_record_stream' to filter out some of this duplication.
Also, autopack still needs to be told to *not* re-use blocks.
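To illustrate the instability concern, here is a minimal sketch (a plain
Kahn's algorithm, not bzrlib's actual topo_sort): for two independent chains,
every interleaving is a valid topological order, so nothing forces related
keys to stay adjacent.

def kahn_topo_sort(parents):
    # parents maps each key to the keys it depends on
    children = {}
    pending = {}
    for key, deps in parents.items():
        pending[key] = len(deps)
        for dep in deps:
            children.setdefault(dep, []).append(key)
    # The pop() order of this set is arbitrary -- that arbitrariness is
    # exactly what lets unrelated histories interleave.
    ready = set(k for k, n in pending.items() if n == 0)
    order = []
    while ready:
        key = ready.pop()
        order.append(key)
        for child in children.get(key, []):
            pending[child] -= 1
            if pending[child] == 0:
                ready.add(child)
    return order

graph = {'a': [], 'b': ['a'], 'c': ['b'], 'd': ['c'],
         'e': [], 'f': ['e'], 'g': ['f'], 'h': ['g']}
print(kahn_topo_sort(graph))
# a,e,b,f,c,g,d,h is just as valid an answer as a,b,c,d,e,f,g,h; a stable
# sort would need an explicit tie-break to keep each chain's keys together.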
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-17 16:12:31 +0000
+++ b/bzrlib/groupcompress.py 2009-03-17 17:46:17 +0000
@@ -1339,10 +1339,31 @@
last_fulltext_len = None
max_fulltext_len = 0
max_fulltext_prefix = None
+ insert_manager = None
+ block_start = None
+ block_length = None
for record in stream:
# Raise an error when a record is missing.
if record.storage_kind == 'absent':
raise errors.RevisionNotPresent(record.key, self)
+ if record.storage_kind == 'groupcompress-block':
+ # Insert the raw block into the target repo
+ insert_manager = record._manager
+ bytes = record._manager._block.to_bytes()
+ _, start, length = self._access.add_raw_records(
+ [(None, len(bytes))], bytes)[0]
+ del bytes
+ block_start = start
+ block_length = length
+ if record.storage_kind in ('groupcompress-block',
+ 'groupcompress-block-ref'):
+ assert insert_manager is not None
+ assert record._manager is insert_manager
+ value = "%d %d %d %d" % (block_start, block_length,
+ record._start, record._end)
+ nodes = [(record.key, value, (record.parents,))]
+ self._index.add_records(nodes, random_id=random_id)
+ continue
try:
bytes = record.get_bytes_as('fulltext')
except errors.UnavailableRepresentation:
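For reference, the '%d %d %d %d' value stored in the index above packs four
integers. A hypothetical decoder follows (the helper name is illustrative,
and reading start/end as the record's span within the block is an assumption
based on the code above, not confirmed bzrlib API):

def decode_block_entry(value):
    # Assumed layout, mirroring the format string in insert_record_stream:
    #   block_start, block_length -- where the raw group block sits in the pack
    #   start, end -- the record's slice within the block
    block_start, block_length, start, end = map(int, value.split())
    return block_start, block_length, start, end

# e.g. decode_block_entry('4096 2048 0 117') says: fetch the block at pack
# offset 4096 (2048 bytes long); the record occupies bytes [0, 117) of it.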
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-17 16:12:31 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-17 17:46:17 +0000
@@ -23,6 +23,7 @@
errors,
osutils,
tests,
+ versionedfile,
)
from bzrlib.osutils import sha_string
from bzrlib.tests import (
@@ -429,8 +430,10 @@
class TestCaseWithGroupCompressVersionedFiles(tests.TestCaseWithTransport):
- def make_test_vf(self, create_graph, keylength=1, do_cleanup=True):
- t = self.get_transport()
+ def make_test_vf(self, create_graph, keylength=1, do_cleanup=True,
+ dir='.'):
+ t = self.get_transport(dir)
+ t.ensure_base()
vf = groupcompress.make_pack_factory(graph=create_graph,
delta=False, keylength=keylength)(t)
if do_cleanup:
@@ -443,7 +446,8 @@
def test_get_record_stream_as_requested(self):
# Consider promoting 'as-requested' to general availability, and
# make this a VF interface test
- vf = self.make_test_vf(False, do_cleanup=False)
+ vf = self.make_test_vf(False, do_cleanup=False,
+ dir='source')
vf.add_lines(('a',), (), ['lines\n'])
vf.add_lines(('b',), (), ['lines\n'])
vf.add_lines(('c',), (), ['lines\n'])
@@ -461,7 +465,7 @@
groupcompress.cleanup_pack_group(vf)
# It should work even after being repacked into another VF
- vf2 = self.make_test_vf(False)
+ vf2 = self.make_test_vf(False, dir='target')
vf2.insert_record_stream(vf.get_record_stream(
[('b',), ('a',), ('d',), ('c',)], 'as-requested', False))
vf2.writer.end()
@@ -475,6 +479,56 @@
'as-requested', False)]
self.assertEqual([('b',), ('a',), ('d',), ('c',)], keys)
+ def test_get_record_stream_block(self):
+ vf = self.make_test_vf(True, do_cleanup=False, dir='source')
+ def grouped_stream(revision_ids, first_parents=()):
+ parents = first_parents
+ for revision_id in revision_ids:
+ key = (revision_id,)
+ record = versionedfile.FulltextContentFactory(
+ key, parents, None,
+ 'some content that is\n'
+ 'identical except for\n'
+ 'revision_id:%s\n' % (revision_id,))
+ yield record
+ parents = (key,)
+ # One group, a-d
+ vf.insert_record_stream(grouped_stream(['a', 'b', 'c', 'd']))
+ # Second group, e-h
+ vf.insert_record_stream(grouped_stream(['e', 'f', 'g', 'h'],
+ first_parents=(('d',),)))
+ block_bytes = {}
+ stream = vf.get_record_stream([(r,) for r in 'abcdefgh'],
+ 'unordered', False)
+ for record in stream:
+ if record.key in [('a',), ('e',)]:
+ self.assertEqual('groupcompress-block', record.storage_kind)
+ else:
+ self.assertEqual('groupcompress-block-ref',
+ record.storage_kind)
+ block_bytes[record.key] = record._manager._block._z_content
+ for r in 'abcd':
+ key = (r,)
+ self.assertIs(block_bytes[key], block_bytes[('a',)])
+ self.assertNotEqual(block_bytes[key], block_bytes[('e',)])
+ for r in 'efgh':
+ key = (r,)
+ self.assertIs(block_bytes[key], block_bytes[('e',)])
+ self.assertNotEqual(block_bytes[key], block_bytes[('a',)])
+ # Now copy the blocks into another vf, and ensure that the blocks are
+ # preserved without creating new entries
+ vf2 = self.make_test_vf(True, dir='target')
+ # ordering in 'groupcompress' order, should actually swap the groups in
+ # the target vf, but the groups themselves should not be disturbed.
+ vf2.insert_record_stream(vf.get_record_stream(
+ [(r,) for r in 'abcdefgh'], 'groupcompress', False))
+ groupcompress.cleanup_pack_group(vf)
+ stream = vf2.get_record_stream([(r,) for r in 'abcdefgh'],
+ 'groupcompress', False)
+ vf2.writer.end()
+ for record in stream:
+ self.assertEqual(block_bytes[record.key],
+ record._manager._block._z_content)
class TestLazyGroupCompress(tests.TestCaseWithTransport):