Rev 4835: (jam) Improve conversion efficiency by using a better heads() in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Mon Nov 30 04:49:35 GMT 2009

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 4835 [merge]
revision-id: pqm@pqm.ubuntu.com-20091130044931-b5rjfh24zq1d3lju
parent: pqm@pqm.ubuntu.com-20091128071329-pb89sfsgsvxcyob9
parent: john@arbash-meinel.com-20091130033903-2n14h8xr1ebp9qna
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2009-11-30 04:49:31 +0000
message:
  (jam) Improve conversion efficiency by using a better heads()
  	implementation.
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/fetch.py                fetch.py-20050818234941-26fea6105696365d
  bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
  bzrlib/repository.py           rev_storage.py-20051111201905-119e9401e46257e3
  bzrlib/tests/test_graph.py     test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
=== modified file 'NEWS'

--- a/NEWS	2009-11-27 10:43:47 +0000
+++ b/NEWS	2009-11-30 04:49:31 +0000
@@ -53,6 +53,10 @@
   etc. Now only change slashes if there is something being glob expanded.
   (John Arbash Meinel, #485771)
 
+* Use our faster ``KnownGraph.heads()`` functionality when computing the
+  new rich-root heads. This can cut a conversion time in half (mysql from
+  13.5h => 6.2h) (John Arbash Meinel, #487632)
+
 Improvements
 ************
 

=== modified file 'bzrlib/fetch.py'
--- a/bzrlib/fetch.py	2009-08-07 04:29:36 +0000
+++ b/bzrlib/fetch.py	2009-11-30 03:34:09 +0000
@@ -28,6 +28,8 @@
 from bzrlib.lazy_import import lazy_import
 lazy_import(globals(), """
 from bzrlib import (
+    graph as _mod_graph,
+    static_tuple,
     tsort,
     versionedfile,
     )
@@ -36,10 +38,10 @@
 from bzrlib import (
     errors,
     symbol_versioning,
+    ui,
     )
 from bzrlib.revision import NULL_REVISION
 from bzrlib.trace import mutter
-import bzrlib.ui
 
 
 class RepoFetcher(object):
@@ -96,7 +98,7 @@
         # assert not missing
         self.count_total = 0
         self.file_ids_names = {}
-        pb = bzrlib.ui.ui_factory.nested_progress_bar()
+        pb = ui.ui_factory.nested_progress_bar()
         pb.show_pct = pb.show_count = False
         try:
             pb.update("Finding revisions", 0, 2)
@@ -123,7 +125,7 @@
             raise errors.IncompatibleRepositories(
                 self.from_repository, self.to_repository,
                 "different rich-root support")
-        pb = bzrlib.ui.ui_factory.nested_progress_bar()
+        pb = ui.ui_factory.nested_progress_bar()
         try:
             pb.update("Get stream source")
             source = self.from_repository._get_source(
@@ -251,13 +253,22 @@
         # yet, and are unlikely to in non-rich-root environments anyway.
         root_id_order.sort(key=operator.itemgetter(0))
         # Create a record stream containing the roots to create.
-        from bzrlib.graph import FrozenHeadsCache
-        graph = FrozenHeadsCache(graph)
+        if len(revs) > 100:
+            graph = _get_rich_root_heads_graph(self.source, revs)
         new_roots_stream = _new_root_data_stream(
             root_id_order, rev_id_to_root_id, parent_map, self.source, graph)
         return [('texts', new_roots_stream)]
 
 
+def _get_rich_root_heads_graph(source_repo, revision_ids):
+    """Get a Graph object suitable for asking heads() for new rich roots."""
+    # The known graph is keyed by (revision_id,) tuples; callers use bare ids.
+    st = static_tuple.StaticTuple
+    revision_keys = [st(r_id).intern() for r_id in revision_ids]
+    known_graph = source_repo.revisions.get_known_graph_ancestry(revision_keys)
+    return _mod_graph.GraphThunkIdsToKeys(known_graph)
+
+
 def _new_root_data_stream(
     root_keys_to_create, rev_id_to_root_id_map, parent_map, repo, graph=None):
     """Generate a texts substream of synthesised root entries.

=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2009-09-14 01:48:28 +0000
+++ b/bzrlib/graph.py	2009-11-30 03:16:22 +0000
@@ -1679,6 +1679,19 @@
     return result
 
 
+class GraphThunkIdsToKeys(object):
+    """Let callers that speak ids query a graph that speaks keys."""
+
+    def __init__(self, graph):
+        self._graph = graph
+
+    def heads(self, ids):
+        """Return the heads of ids as a set of ids; see Graph.heads()."""
+        wrapped = [(an_id,) for an_id in ids]
+        key_heads = self._graph.heads(wrapped)
+        return set([key[0] for key in key_heads])
+
+
 _counters = [0,0,0,0,0,0,0]
 try:
     from bzrlib._known_graph_pyx import KnownGraph

=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py	2009-11-19 15:06:47 +0000
+++ b/bzrlib/repository.py	2009-11-30 03:34:09 +0000
@@ -26,6 +26,7 @@
     chk_map,
     debug,
     errors,
+    fetch as _mod_fetch,
     fifo_cache,
     generate_ids,
     gpg,
@@ -2668,8 +2669,8 @@
         for ((revision_id,), parent_keys) in \
                 self.revisions.get_parent_map(query_keys).iteritems():
             if parent_keys:
-                result[revision_id] = tuple(parent_revid
-                    for (parent_revid,) in parent_keys)
+                result[revision_id] = tuple([parent_revid
+                    for (parent_revid,) in parent_keys])
             else:
                 result[revision_id] = (_mod_revision.NULL_REVISION,)
         return result
@@ -3412,8 +3413,7 @@
                    provided a default one will be created.
         :return: None.
         """
-        from bzrlib.fetch import RepoFetcher
-        f = RepoFetcher(to_repository=self.target,
+        f = _mod_fetch.RepoFetcher(to_repository=self.target,
                                from_repository=self.source,
                                last_revision=revision_id,
                                fetch_spec=fetch_spec,
@@ -3819,13 +3819,15 @@
                 basis_id, delta, current_revision_id, parents_parents)
             cache[current_revision_id] = parent_tree
 
-    def _fetch_batch(self, revision_ids, basis_id, cache):
+    def _fetch_batch(self, revision_ids, basis_id, cache, a_graph=None):
         """Fetch across a few revisions.
 
         :param revision_ids: The revisions to copy
         :param basis_id: The revision_id of a tree that must be in cache, used
             as a basis for delta when no other base is available
         :param cache: A cache of RevisionTrees that we can use.
+        :param a_graph: A Graph object to determine the heads() of the
+            rich-root data stream.
         :return: The revision_id of the last converted tree. The RevisionTree
             for it will be in cache
         """
@@ -3895,10 +3897,9 @@
         from_texts = self.source.texts
         to_texts = self.target.texts
         if root_keys_to_create:
-            from bzrlib.fetch import _new_root_data_stream
-            root_stream = _new_root_data_stream(
+            root_stream = _mod_fetch._new_root_data_stream(
                 root_keys_to_create, self._revision_id_to_root_id, parent_map,
-                self.source)
+                self.source, graph=a_graph)
             to_texts.insert_record_stream(root_stream)
         to_texts.insert_record_stream(from_texts.get_record_stream(
             text_keys, self.target._format._fetch_order,
@@ -3961,13 +3962,20 @@
         cache[basis_id] = basis_tree
         del basis_tree # We don't want to hang on to it here
         hints = []
+        if self._converting_to_rich_root and len(revision_ids) > 100:
+            a_graph = _mod_fetch._get_rich_root_heads_graph(self.source,
+                                                            revision_ids)
+        else:
+            a_graph = None
+
         for offset in range(0, len(revision_ids), batch_size):
             self.target.start_write_group()
             try:
                 pb.update('Transferring revisions', offset,
                           len(revision_ids))
                 batch = revision_ids[offset:offset+batch_size]
-                basis_id = self._fetch_batch(batch, basis_id, cache)
+                basis_id = self._fetch_batch(batch, basis_id, cache,
+                                             a_graph=a_graph)
             except:
                 self.target.abort_write_group()
                 raise
@@ -4446,8 +4454,7 @@
         fetching the inventory weave.
         """
         if self._rich_root_upgrade():
-            import bzrlib.fetch
-            return bzrlib.fetch.Inter1and2Helper(
+            return _mod_fetch.Inter1and2Helper(
                 self.from_repository).generate_root_texts(revs)
         else:
             return []

=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py	2009-08-04 04:36:34 +0000
+++ b/bzrlib/tests/test_graph.py	2009-11-30 03:16:22 +0000
@@ -1582,6 +1582,24 @@
         self.assertCollapsed(d, d)
 
 
+class TestGraphThunkIdsToKeys(tests.TestCase):
+
+    def test_heads(self):
+        # Ancestry used below (parents above children):
+        # A
+        # |\
+        # B C
+        # |/
+        # D
+        ancestry = {('D',): [('B',), ('C',)], ('C',): [('A',)],
+                    ('B',): [('A',)], ('A',): []}
+        key_graph = _mod_graph.Graph(_mod_graph.DictParentsProvider(ancestry))
+        thunk = _mod_graph.GraphThunkIdsToKeys(key_graph)
+        for ancestor in ['A', 'B', 'C']:
+            self.assertEqual(['D'], sorted(thunk.heads(['D', ancestor])))
+        self.assertEqual(['B', 'C'], sorted(thunk.heads(['B', 'C'])))
+
+
 class TestPendingAncestryResultGetKeys(TestCaseWithMemoryTransport):
     """Tests for bzrlib.graph.PendingAncestryResult."""