Rev 6021: When the overall graph gets too big to replicate on the server, in http://bazaar.launchpad.net/~jameinel/bzr/2.4-too-much-walking-388269

John Arbash Meinel john at arbash-meinel.com
Thu Jul 21 14:21:17 UTC 2011


At http://bazaar.launchpad.net/~jameinel/bzr/2.4-too-much-walking-388269

------------------------------------------------------------
revno: 6021
revision-id: john at arbash-meinel.com-20110721142102-z333hn366uvpntn3
parent: pqm at pqm.ubuntu.com-20110718144335-8v4u0gbnwzm0vzmt
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 2.4-too-much-walking-388269
timestamp: Thu 2011-07-21 16:21:02 +0200
message:
  When the overall graph gets too big to replicate on the server,
  fall back to doing a *very* local search.
  This avoids making the server iterate over most of history at each step.
  When timing against gcc-linaro, this reduces the walk-all-history time
  from 5.5min down to 1m15s.
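
For reference, a small worked example of the 'manual' search recipe the
client sends (toy revision ids, not taken from the patch; the missing-keys
pruning is ignored for brevity):

    # The client has walked C -> B; A is referenced but not yet fetched.
    parents_map = {'C': ('B',), 'B': ('A',)}
    start_set = set(parents_map)              # set(['C', 'B'])
    result_parents = set()
    for parents in parents_map.itervalues():
        result_parents.update(parents)        # set(['B', 'A'])
    stop_keys = result_parents - start_set    # set(['A'])
    start_set -= start_set & result_parents   # set(['C'])
    recipe = ('manual', start_set, stop_keys, len(parents_map))
    # => ('manual', set(['C']), set(['A']), 2)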
-------------- next part --------------
=== modified file 'bzrlib/remote.py'
--- a/bzrlib/remote.py	2011-06-18 13:57:17 +0000
+++ b/bzrlib/remote.py	2011-07-21 14:21:02 +0000
@@ -1744,27 +1744,60 @@
         if parents_map is None:
             # Repository is not locked, so there's no cache.
             parents_map = {}
-        # start_set is all the keys in the cache
-        start_set = set(parents_map)
-        # result set is all the references to keys in the cache
-        result_parents = set()
-        for parents in parents_map.itervalues():
-            result_parents.update(parents)
-        stop_keys = result_parents.difference(start_set)
-        # We don't need to send ghosts back to the server as a position to
-        # stop either.
-        stop_keys.difference_update(self._unstacked_provider.missing_keys)
-        key_count = len(parents_map)
-        if (NULL_REVISION in result_parents
-            and NULL_REVISION in self._unstacked_provider.missing_keys):
-            # If we pruned NULL_REVISION from the stop_keys because it's also
-            # in our cache of "missing" keys we need to increment our key count
-            # by 1, because the reconstituted SearchResult on the server will
-            # still consider NULL_REVISION to be an included key.
-            key_count += 1
-        included_keys = start_set.intersection(result_parents)
-        start_set.difference_update(included_keys)
-        recipe = ('manual', start_set, stop_keys, key_count)
+        if len(parents_map) < 10000:
+            # start_set is all the keys in the cache
+            start_set = set(parents_map)
+            # result set is all the references to keys in the cache
+            result_parents = set()
+            for parents in parents_map.itervalues():
+                result_parents.update(parents)
+            stop_keys = result_parents.difference(start_set)
+            # We don't need to send ghosts back to the server as a position to
+            # stop either.
+            stop_keys.difference_update(self._unstacked_provider.missing_keys)
+            key_count = len(parents_map)
+            if (NULL_REVISION in result_parents
+                and NULL_REVISION in self._unstacked_provider.missing_keys):
+                # If we pruned NULL_REVISION from the stop_keys because it's also
+                # in our cache of "missing" keys we need to increment our key count
+                # by 1, because the reconstituted SearchResult on the server will
+                # still consider NULL_REVISION to be an included key.
+                key_count += 1
+            included_keys = start_set.intersection(result_parents)
+            start_set.difference_update(included_keys)
+            mutter('Doing _get_parent_map_rpc,'
+                         ' %d cache size, %d start_set, %d result_parents,'
+                         ' %d stop_keys, %d included_keys, %d keys,'
+                         ' %d keys matching stop keys',
+                         len(parents_map), len(start_set), len(result_parents),
+                         len(stop_keys), len(included_keys), len(keys),
+                         len(keys.intersection(stop_keys)))
+            recipe = ('manual', start_set, stop_keys, key_count)
+        else:
+            # We've searched too many revisions for it to be efficient to
+            # recreate our search on the server. So just create a 'minimal'
+            # search pattern
+            parent_to_children_map = {}
+            for child, parents in parents_map.iteritems():
+                for p in parents:
+                    # Any given parent is likely to have only a small handful
+                    # of children, many will have only one.
+                    # TODO: Use static_tuple
+                    if p not in parent_to_children_map:
+                        parent_to_children_map[p] = (child,)
+                    else:
+                        parent_to_children_map[p] = parent_to_children_map[p] + (child,)
+            stop_keys = set(keys)
+            # Just look at immediate children
+            child_keys = set()
+            for k in keys:
+                child_keys.update(parent_to_children_map.get(k, ()))
+            mutter('Faking search set _get_parent_map_rpc,'
+                         ' %d cache size, %d start keys,'
+                         ' %d included_keys, %d stop_keys',
+                         len(parents_map), len(child_keys), len(child_keys),
+                         len(stop_keys))
+            recipe = ('manual', child_keys, stop_keys, len(child_keys))
         body = self._serialise_search_recipe(recipe)
         path = self.bzrdir._path_for_remote_call(self._client)
         for key in keys:
@@ -1773,6 +1806,7 @@
                     "key %r not a plain string" % (key,))
         verb = 'Repository.get_parent_map'
         args = (path, 'include-missing:') + tuple(keys)
+        mutter('keys: %r body: %r', keys, body)
         try:
             response = self._call_with_body_bytes_expecting_body(
                 verb, args, body)

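For readers following the new fallback branch in remote.py: it inverts the
cached parent map so the immediate children of each requested key can be
looked up directly. A standalone sketch of that inversion (illustrative
Python 2 matching bzrlib's iteritems() usage; not bzrlib API):

    def invert_parent_map(parents_map):
        """Turn {child: (parents,)} into {parent: (children,)}."""
        parent_to_children = {}
        for child, parents in parents_map.iteritems():
            for p in parents:
                # Most parents have very few children, so tuples stay tiny.
                parent_to_children[p] = parent_to_children.get(p, ()) + (child,)
        return parent_to_children

    pm = {'B': ('A',), 'C': ('B',), 'D': ('B',)}
    print invert_parent_map(pm)
    # -> {'A': ('B',), 'B': ('C', 'D')}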
=== modified file 'bzrlib/vf_repository.py'
--- a/bzrlib/vf_repository.py	2011-06-28 17:25:26 +0000
+++ b/bzrlib/vf_repository.py	2011-07-21 14:21:02 +0000
@@ -2514,12 +2514,21 @@
         searcher = source_graph._make_breadth_first_searcher(all_wanted_revs)
         null_set = frozenset([_mod_revision.NULL_REVISION])
         searcher_exhausted = False
+        search_step = 0
+        gpm = searcher._parents_provider.get_parent_map
+        def get_parent_map_logging(revisions):
+            res = gpm(revisions)
+            mutter('step %d, requested %d returned %d',
+                   search_step, len(revisions), len(res))
+            return res
+        searcher._parents_provider.get_parent_map = get_parent_map_logging
         while True:
             next_revs = set()
             ghosts = set()
             # Iterate the searcher until we have enough next_revs
             while len(next_revs) < self._walk_to_common_revisions_batch_size:
                 try:
+                    search_step += 1
                     next_revs_part, ghosts_part = searcher.next_with_ghosts()
                     next_revs.update(next_revs_part)
                     ghosts.update(ghosts_part)

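The vf_repository.py hunk above instruments the search by wrapping the
parents provider's get_parent_map so each batch is logged via mutter. The
same pattern in isolation (hypothetical helper name; only
bzrlib.trace.mutter is real API):

    from bzrlib.trace import mutter

    def add_get_parent_map_logging(parents_provider):
        """Log request/response sizes for every get_parent_map call."""
        real_gpm = parents_provider.get_parent_map
        state = {'step': 0}  # a dict so the closure can mutate it in Python 2
        def get_parent_map_logging(revisions):
            state['step'] += 1
            res = real_gpm(revisions)
            mutter('step %d, requested %d returned %d',
                   state['step'], len(revisions), len(res))
            return res
        parents_provider.get_parent_map = get_parent_map_logging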

