Rev 4219: Merge bbc at 3908 in file:///home/vila/src/bzr/experimental/gc-py-bbc/

Vincent Ladeuil v.ladeuil+lp at free.fr
Tue Mar 31 08:43:43 BST 2009


At file:///home/vila/src/bzr/experimental/gc-py-bbc/

------------------------------------------------------------
revno: 4219 [merge]
revision-id: v.ladeuil+lp at free.fr-20090331074343-wghocs28bnzbjlh2
parent: v.ladeuil+lp at free.fr-20090331074216-yxfi6sra7nh61v09
parent: v.ladeuil+lp at free.fr-20090331073419-89g1md60o2slg5t4
committer: Vincent Ladeuil <v.ladeuil+lp at free.fr>
branch nick: bbc
timestamp: Tue 2009-03-31 09:43:43 +0200
message:
  Merge bbc at 3908
modified:
  bzrlib/chk_map.py              chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
  bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  bzrlib/repofmt/pack_repo.py    pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
  bzrlib/tests/per_repository/test_commit_builder.py test_commit_builder.py-20060606110838-76e3ra5slucqus81-1
  bzrlib/tests/test_chk_map.py   test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2009-03-27 16:36:50 +0000
+++ b/bzrlib/chk_map.py	2009-03-30 21:13:24 +0000
@@ -1289,8 +1289,7 @@
     next_interesting = set()
     uninteresting_items = set()
     interesting_items = set()
-    interesting_records = []
-    # records_read = set()
+    interesting_to_yield = []
     for record in store.get_record_stream(chks_to_read, 'unordered', True):
         # records_read.add(record.key())
         if pb is not None:
@@ -1307,20 +1306,17 @@
                 # store
                 uninteresting_items.update(node.iteritems(None))
         else:
-            interesting_records.append(record)
+            interesting_to_yield.append(record.key)
             if type(node) is InternalNode:
                 next_interesting.update(node.refs())
             else:
                 interesting_items.update(node.iteritems(None))
-    # TODO: Filter out records that have already been read, as node splitting
-    #       can cause us to reference the same nodes via shorter and longer
-    #       paths
     return (next_uninteresting, uninteresting_items,
-            next_interesting, interesting_records, interesting_items)
+            next_interesting, interesting_to_yield, interesting_items)
 
 
 def _find_all_uninteresting(store, interesting_root_keys,
-                            uninteresting_root_keys, adapter, pb):
+                            uninteresting_root_keys, pb):
     """Determine the full set of uninteresting keys."""
     # What about duplicates between interesting_root_keys and
     # uninteresting_root_keys?
@@ -1336,7 +1332,7 @@
     # First step, find the direct children of both the interesting and
     # uninteresting set
     (uninteresting_keys, uninteresting_items,
-     interesting_keys, interesting_records,
+     interesting_keys, interesting_to_yield,
      interesting_items) = _find_children_info(store, interesting_root_keys,
                                               uninteresting_root_keys,
                                               pb=pb)
@@ -1357,10 +1353,7 @@
             # TODO: Handle 'absent'
             if pb is not None:
                 pb.tick()
-            try:
-                bytes = record.get_bytes_as('fulltext')
-            except errors.UnavailableRepresentation:
-                bytes = adapter.get_bytes(record)
+            bytes = record.get_bytes_as('fulltext')
             # We don't care about search_key_func for this code, because we
             # only care about external references.
             node = _deserialise(bytes, record.key, search_key_func=None)
@@ -1375,7 +1368,7 @@
                 all_uninteresting_items.update(node._items.iteritems())
         chks_to_read = next_chks
     return (all_uninteresting_chks, all_uninteresting_items,
-            interesting_keys, interesting_records, interesting_items)
+            interesting_keys, interesting_to_yield, interesting_items)
 
 
 def iter_interesting_nodes(store, interesting_root_keys,
@@ -1390,7 +1383,7 @@
     :param uninteresting_root_keys: keys which should be filtered out of the
         result set.
     :return: Yield
-        (interesting records, interesting chk's, interesting key:values)
+        (interesting record, {interesting key:values})
     """
     # TODO: consider that it may be more memory efficient to use the 20-byte
     #       sha1 string, rather than tuples of hexidecimal sha1 strings.
@@ -1399,33 +1392,24 @@
     #       able to use nodes from the _page_cache as well as actually
     #       requesting bytes from the store.
 
-    # A way to adapt from the compressed texts back into fulltexts
-    # In a way, this seems like a layering inversion to have CHKMap know the
-    # details of versionedfile
-    adapter_class = versionedfile.adapter_registry.get(
-        ('knit-ft-gz', 'fulltext'))
-    adapter = adapter_class(store)
-
     (all_uninteresting_chks, all_uninteresting_items, interesting_keys,
-     interesting_records, interesting_items) = _find_all_uninteresting(store,
-        interesting_root_keys, uninteresting_root_keys, adapter, pb)
+     interesting_to_yield, interesting_items) = _find_all_uninteresting(store,
+        interesting_root_keys, uninteresting_root_keys, pb)
 
     # Now that we know everything uninteresting, we can yield information from
     # our first request
     interesting_items.difference_update(all_uninteresting_items)
-    records = dict((record.key, record) for record in interesting_records
-                    if record.key not in all_uninteresting_chks)
-    if records or interesting_items:
-        yield records, interesting_items
+    interesting_to_yield = set(interesting_to_yield) - all_uninteresting_chks
+    if interesting_items:
+        yield None, interesting_items
+    if interesting_to_yield:
+        # We request these records again, rather than buffering the root
+        # records, most likely they are still in the _group_cache anyway.
+        for record in store.get_record_stream(interesting_to_yield,
+                                              'unordered', False):
+            yield record, []
+    all_uninteresting_chks.update(interesting_to_yield)
     interesting_keys.difference_update(all_uninteresting_chks)
-    # TODO: We need a test for this
-    #       This handles the case where after a split, one of the child trees
-    #       is identical to one of the interesting root keys. Like if you had a
-    #       leaf node, with "aa" "ab", that then overflowed at "bb". You would
-    #       get a new internal node, but it would have one leaf node with
-    #       ("aa", "ab") and another leaf node with "bb". And you don't want to
-    #       re-transmit that ("aa", "ab") node again
-    all_uninteresting_chks.update(interesting_root_keys)
 
     chks_to_read = interesting_keys
     counter = 0
@@ -1436,10 +1420,7 @@
             if pb is not None:
                 pb.update('find chk pages', counter)
             # TODO: Handle 'absent'?
-            try:
-                bytes = record.get_bytes_as('fulltext')
-            except errors.UnavailableRepresentation:
-                bytes = adapter.get_bytes(record)
+            bytes = record.get_bytes_as('fulltext')
             # We don't care about search_key_func for this code, because we
             # only care about external references.
             node = _deserialise(bytes, record.key, search_key_func=None)
@@ -1468,7 +1449,7 @@
                 #       whole thing, but it does mean that callers need to
                 #       understand they may get duplicate values.
                 # all_uninteresting_items.update(interesting_items)
-            yield {record.key: record}, interesting_items
+            yield record, interesting_items
         chks_to_read = next_chks
 
 

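For readers following the API change above: iter_interesting_nodes() now yields (record, items) tuples one record at a time, with a possible leading (None, items) tuple for key/value pairs discovered before any record is streamed. A minimal caller sketch, illustrative only and not part of the patch; the helper name and local variables are made up:

    from bzrlib import chk_map

    def collect_interesting(store, interesting_root_keys, uninteresting_root_keys):
        # Hypothetical helper, not part of bzrlib; shows the new yield shape.
        records = []
        items = []
        for record, record_items in chk_map.iter_interesting_nodes(
                store, interesting_root_keys, uninteresting_root_keys):
            if record is not None:
                # One record per tuple now, instead of a dict of key -> record.
                records.append(record)
            # The first tuple may carry root-level items with record=None;
            # later tuples carry the items found in that specific record.
            items.extend(record_items)
        return records, items
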
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-27 16:36:50 +0000
+++ b/bzrlib/groupcompress.py	2009-03-30 21:30:33 +0000
@@ -506,12 +506,9 @@
                 self._manager._prepare_for_extract()
                 block = self._manager._block
                 self._bytes = block.extract(self.key, self._start, self._end)
-                # XXX: It seems the smart fetch extracts inventories and chk
-                #      pages as fulltexts to find the next chk pages, but then
-                #      passes them down to be inserted as a
-                #      groupcompress-block, so this is not safe to do. Perhaps
-                #      we could just change the storage kind to "fulltext" at
-                #      that point?
+                # There are code paths that first extract as fulltext, and then
+                # extract as storage_kind (smart fetch). So we don't break the
+                # refcycle here, but instead in manager.get_record_stream()
                 # self._manager = None
             if storage_kind == 'fulltext':
                 return self._bytes
@@ -551,13 +548,7 @@
             yield factory
             # Break the ref-cycle
             factory._bytes = None
-            # XXX: this is not safe, the smart fetch code requests the content
-            #      as both a 'fulltext', and then later on as a
-            #      groupcompress-block. The iter_interesting_nodes code also is
-            #      still buffering multiple records and returning them later.
-            #      So that code would need to be updated to either re-fetch the
-            #      original object, or buffer it somehow.
-            # factory._manager = None
+            factory._manager = None
         # TODO: Consider setting self._factories = None after the above loop,
         #       as it will break the reference cycle
 
@@ -667,6 +658,8 @@
             record_header = '%s\n%s\n%d\n%d\n' % (
                 key_bytes, parent_bytes, factory._start, factory._end)
             header_lines.append(record_header)
+            # TODO: Can we break the refcycle at this point and set
+            #       factory._manager = None?
         header_bytes = ''.join(header_lines)
         del header_lines
         header_bytes_len = len(header_bytes)

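The groupcompress.py change moves the ref-cycle break out of the factory's extract path and into the manager's get_record_stream() loop, because smart fetch may extract the same record first as a fulltext and only later as a groupcompress-block. A simplified sketch of the pattern; the class names below are stand-ins, not the real bzrlib classes:

    class DemoFactory(object):
        # Stand-in for the content factory; only the attributes relevant to
        # the reference cycle are shown.
        def __init__(self, key):
            self.key = key
            self._bytes = None
            self._manager = None

    class DemoManager(object):
        # Stand-in for the group content manager.
        def __init__(self, factories):
            self._factories = factories
            for factory in factories:
                factory._manager = self   # manager <-> factory reference cycle

        def get_record_stream(self):
            for factory in self._factories:
                yield factory
                # Once the caller has consumed the record, drop both sides of
                # the cycle so the block's decompressed bytes can be freed.
                factory._bytes = None
                factory._manager = None
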
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py	2009-03-27 16:36:50 +0000
+++ b/bzrlib/repofmt/pack_repo.py	2009-03-30 21:13:24 +0000
@@ -2471,7 +2471,7 @@
         revision_ids = frozenset(revision_ids)
         file_id_revisions = {}
         bytes_to_info = CHKInventory._bytes_to_utf8name_key
-        for records, items in chk_map.iter_interesting_nodes(self.chk_bytes,
+        for record, items in chk_map.iter_interesting_nodes(self.chk_bytes,
                     interesting_root_keys, uninteresting_root_keys,
                     pb=pb):
             # This is cheating a bit to use the last grabbed 'inv', but it

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2009-03-30 12:14:49 +0000
+++ b/bzrlib/repository.py	2009-03-31 07:43:43 +0000
@@ -4287,12 +4287,12 @@
         def to_stream_adapter():
             """Adapt the iter_interesting_nodes result to a single stream.
 
-            iter_interesting_nodes returns records as it processes them, which
-            can be in batches. But we only want a single stream to be inserted.
+            iter_interesting_nodes returns records as it processes them, along
+            with keys. However, we only want to return the records themselves.
             """
             for record, items in interesting:
-                for value in record.itervalues():
-                    yield value
+                if record is not None:
+                    yield record
         # XXX: We could instead call get_record_stream(records.keys())
         #      ATM, this will always insert the records as fulltexts, and
         #      requires that you can hang on to records once you have gone

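In repository.py, to_stream_adapter() now passes single records straight through and skips the leading (None, items) tuple. A hedged sketch of how such an adapter is typically wired up; the variable names and the insert_record_stream() call site are assumptions for illustration, not part of this patch:

    def to_stream_adapter(interesting):
        # Mirrors the adapter in the patch: keep only real records, drop the
        # leading (None, items) tuple that carries key/value pairs.
        for record, items in interesting:
            if record is not None:
                yield record

    # Hypothetical wiring; 'source', 'target_chk_bytes' and the root key
    # lists stand in for objects the real fetch code already holds.
    # interesting = chk_map.iter_interesting_nodes(
    #     source.chk_bytes, interesting_root_keys, uninteresting_root_keys)
    # target_chk_bytes.insert_record_stream(to_stream_adapter(interesting))
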
=== modified file 'bzrlib/tests/per_repository/test_commit_builder.py'
--- a/bzrlib/tests/per_repository/test_commit_builder.py	2009-03-30 12:14:49 +0000
+++ b/bzrlib/tests/per_repository/test_commit_builder.py	2009-03-31 07:43:43 +0000
@@ -185,7 +185,6 @@
             except:
                 builder.abort()
                 raise
-            builder.finish_inventory()
             self.assertEqual(revision_id, builder.commit('foo bar'))
         finally:
             tree.unlock()

=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py	2009-03-25 07:54:11 +0000
+++ b/bzrlib/tests/test_chk_map.py	2009-03-30 21:13:24 +0000
@@ -1846,32 +1846,38 @@
         store = self.get_chk_bytes()
         iter_nodes = chk_map.iter_interesting_nodes(store, interesting_keys,
                                                     uninteresting_keys)
-        for count, (exp, act) in enumerate(izip(expected, iter_nodes)):
-            exp_record_keys, exp_items = exp
-            records, items = act
-            exp_tuple = (sorted(exp_record_keys), sorted(exp_items))
-            act_tuple = (sorted(records.keys()), sorted(items))
-            self.assertEqual(exp_tuple, act_tuple)
-        self.assertEqual(len(expected), count + 1)
+        nodes = list(iter_nodes)
+        for count, (exp, act) in enumerate(izip(expected, nodes)):
+            exp_record, exp_items = exp
+            record, items = act
+            exp_tuple = (exp_record, sorted(exp_items))
+            if record is None:
+                act_tuple = (None, sorted(items))
+            else:
+                act_tuple = (record.key, sorted(items))
+            self.assertEqual(exp_tuple, act_tuple,
+                             'entry %d did not match expected' % count)
+        self.assertEqual(len(expected), len(nodes))
 
     def test_empty_to_one_keys(self):
         target = self.get_map_key({('a',): 'content'})
         self.assertIterInteresting(
-            [([target], [(('a',), 'content')])],
-            [target], [])
+            [(target, [(('a',), 'content')]),
+            ], [target], [])
 
     def test_none_to_one_key(self):
         basis = self.get_map_key({})
         target = self.get_map_key({('a',): 'content'})
         self.assertIterInteresting(
-            [([target], [(('a',), 'content')])],
-            [target], [basis])
+            [(None, [(('a',), 'content')]),
+             (target, []),
+            ], [target], [basis])
 
     def test_one_to_none_key(self):
         basis = self.get_map_key({('a',): 'content'})
         target = self.get_map_key({})
         self.assertIterInteresting(
-            [([target], [])],
+            [(target, [])],
             [target], [basis])
 
     def test_common_pages(self):
@@ -1896,8 +1902,8 @@
         b_key = target_map._root_node._items['b'].key()
         # This should return the root node, and the node for the 'b' key
         self.assertIterInteresting(
-            [([target], []),
-             ([b_key], [(('b',), 'other content')])],
+            [(target, []),
+             (b_key, [(('b',), 'other content')])],
             [target], [basis])
 
     def test_common_sub_page(self):
@@ -1924,11 +1930,77 @@
         # The key for the leaf aab node
         aab_key = target_map._root_node._items['a']._items['aab'].key()
         self.assertIterInteresting(
-            [([target], []),
-             ([a_key], []),
-             ([aab_key], [(('aab',), 'new')])],
+            [(target, []),
+             (a_key, []),
+             (aab_key, [(('aab',), 'new')])],
             [target], [basis])
 
+    def test_common_leaf(self):
+        basis = self.get_map_key({})
+        target1 = self.get_map_key({('aaa',): 'common'})
+        target2 = self.get_map_key({('aaa',): 'common',
+                                    ('bbb',): 'new',
+                                   })
+        target3 = self.get_map_key({('aaa',): 'common',
+                                    ('aac',): 'other',
+                                    ('bbb',): 'new',
+                                   })
+        # The LeafNode containing 'aaa': 'common' occurs at 3 different levels.
+        # Once as a root node, once as a second layer, and once as a third
+        # layer. It should only be returned one time regardless
+        target1_map = CHKMap(self.get_chk_bytes(), target1)
+        self.assertEqualDiff(
+            "'' LeafNode\n"
+            "      ('aaa',) 'common'\n",
+            target1_map._dump_tree())
+        target2_map = CHKMap(self.get_chk_bytes(), target2)
+        self.assertEqualDiff(
+            "'' InternalNode\n"
+            "  'a' LeafNode\n"
+            "      ('aaa',) 'common'\n"
+            "  'b' LeafNode\n"
+            "      ('bbb',) 'new'\n",
+            target2_map._dump_tree())
+        target3_map = CHKMap(self.get_chk_bytes(), target3)
+        self.assertEqualDiff(
+            "'' InternalNode\n"
+            "  'a' InternalNode\n"
+            "    'aaa' LeafNode\n"
+            "      ('aaa',) 'common'\n"
+            "    'aac' LeafNode\n"
+            "      ('aac',) 'other'\n"
+            "  'b' LeafNode\n"
+            "      ('bbb',) 'new'\n",
+            target3_map._dump_tree())
+        aaa_key = target1_map._root_node.key()
+        b_key = target2_map._root_node._items['b'].key()
+        a_key = target3_map._root_node._items['a'].key()
+        aac_key = target3_map._root_node._items['a']._items['aac'].key()
+        self.assertIterInteresting(
+            [(None, [(('aaa',), 'common')]),
+             (target1, []),
+             (target2, []),
+             (target3, []),
+             (b_key, [(('bbb',), 'new')]),
+             (a_key, []),
+             (aac_key, [(('aac',), 'other')]),
+            ], [target1, target2, target3], [basis])
+
+        self.assertIterInteresting(
+            [(target2, []),
+             (target3, []),
+             (b_key, [(('bbb',), 'new')]),
+             (a_key, []),
+             (aac_key, [(('aac',), 'other')]),
+            ], [target2, target3], [target1])
+
+        # This may be a case that we relax. A root node is a deep child of the
+        # excluded set. The cost is buffering root nodes until we have
+        # determined all possible exclusions. (Because a prefix of '', cannot
+        # be excluded.)
+        self.assertIterInteresting(
+            [], [target1], [target3])
+
     def test_multiple_maps(self):
         basis1 = self.get_map_key({('aaa',): 'common',
                                    ('aab',): 'basis1',
@@ -1976,9 +2048,10 @@
         # The key for the leaf bba node
         bba_key = target2_map._root_node._items['b']._items['bba'].key()
         self.assertIterInteresting(
-            [([target1, target2], []),
-             ([a_key], []),
-             ([b_key], []),
-             ([aac_key], [(('aac',), 'target1')]),
-             ([bba_key], [(('bba',), 'target2')]),
+            [(target1, []),
+             (target2, []),
+             (a_key, []),
+             (b_key, []),
+             (aac_key, [(('aac',), 'target1')]),
+             (bba_key, [(('bba',), 'target2')]),
             ], [target1, target2], [basis1, basis2])


