Rev 2929: (robertc) Prevent heads() calls from accessing all history unnecessarily. (Robert Collins) in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Tue Oct 23 09:21:14 BST 2007

At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 2929
revision-id: pqm at pqm.ubuntu.com-20071023082111-h6u34i4gvlb2nwch
parent: pqm at pqm.ubuntu.com-20071023030547-pjlo1thei6q984os
parent: robertc at robertcollins.net-20071023073514-r99iahf2lm257zrp
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Tue 2007-10-23 09:21:11 +0100
message:
  (robertc) Prevent heads() calls from accessing all history unnecessarily. (Robert Collins)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/graph.py                graph_walker.py-20070525030359-y852guab65d4wtn0-1
  bzrlib/tests/test_graph.py     test_graph_walker.py-20070525030405-enq4r60hhi9xrujc-1
    ------------------------------------------------------------
    revno: 2921.3.5
    merged: robertc at robertcollins.net-20071023073514-r99iahf2lm257zrp
    parent: robertc at robertcollins.net-20071022194931-ohs6yd08iy7deqlr
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: graph
    timestamp: Tue 2007-10-23 17:35:14 +1000
    message:
      Used fixed rather than bugfixed at Ian's request in NEWS.
    ------------------------------------------------------------
    revno: 2921.3.4
    merged: robertc at robertcollins.net-20071022194931-ohs6yd08iy7deqlr
    parent: robertc at robertcollins.net-20071022054449-v4kdk8bue5a7ckym
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: graph
    timestamp: Tue 2007-10-23 05:49:31 +1000
    message:
      Review feedback.
    ------------------------------------------------------------
    revno: 2921.3.3
    merged: robertc at robertcollins.net-20071022054449-v4kdk8bue5a7ckym
    parent: robertc at robertcollins.net-20071022054315-b7eddcxqlvy3q58j
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: graph
    timestamp: Mon 2007-10-22 15:44:49 +1000
    message:
      Review feedback.
    ------------------------------------------------------------
    revno: 2921.3.2
    merged: robertc at robertcollins.net-20071022054315-b7eddcxqlvy3q58j
    parent: robertc at robertcollins.net-20071022004427-x1pn4ywiuubx354f
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: graph
    timestamp: Mon 2007-10-22 15:43:15 +1000
    message:
      Fix mistaken pdb import.
    ------------------------------------------------------------
    revno: 2921.3.1
    merged: robertc at robertcollins.net-20071022004427-x1pn4ywiuubx354f
    parent: pqm at pqm.ubuntu.com-20071019201226-6z006xotgfe7zmu8
    committer: Robert Collins <robertc at robertcollins.net>
    branch nick: graph
    timestamp: Mon 2007-10-22 10:44:27 +1000
    message:
      * Graph ``heads()`` queries have been bugfixed to no longer access all
        history unnecessarily. (Robert Collins)
=== modified file 'NEWS'

--- a/NEWS	2007-10-22 21:25:20 +0000
+++ b/NEWS	2007-10-23 08:21:11 +0000
@@ -69,6 +69,9 @@
    * XML inventory serialisation takes 20% less time while being stricter about
      the contents. (Robert Collins)
 
+   * Graph ``heads()`` queries have been fixed to no longer access all history
+     unnecessarily. (Robert Collins)
+
   IMPROVEMENTS:
 
    * ``bzr+https://`` smart server across https now supported. 

=== modified file 'bzrlib/graph.py'
--- a/bzrlib/graph.py	2007-10-19 03:45:02 +0000
+++ b/bzrlib/graph.py	2007-10-22 19:49:31 +0000
@@ -91,7 +91,7 @@
         specialized implementations for particular repository types.  See
         Repository.get_graph()
 
-        :param parents_func: an object providing a get_parents call
+        :param parents_provider: An object providing a get_parents call
             conforming to the behavior of StackedParentsProvider.get_parents
         """
         self.get_parents = parents_provider.get_parents
@@ -239,7 +239,21 @@
         # skip over the actual candidate for each searcher
         for searcher in active_searchers.itervalues():
             searcher.next()
+        # The common walker finds nodes that are common to two or more of the
+        # input keys, so that we don't access all history when a currently
+        # uncommon search point actually meets up with something behind a
+        # common search point. Common search points do not keep searches
+        # active; they just allow us to make searches inactive without
+        # accessing all history.
+        common_walker = self._make_breadth_first_searcher([])
         while len(active_searchers) > 0:
+            ancestors = set()
+            # advance searches
+            try:
+                common_walker.next()
+            except StopIteration:
+                # No common points being searched at this time.
+                pass
             for candidate in active_searchers.keys():
                 try:
                     searcher = active_searchers[candidate]
@@ -249,27 +263,40 @@
                     # a descendant of another candidate.
                     continue
                 try:
-                    ancestors = searcher.next()
+                    ancestors.update(searcher.next())
                 except StopIteration:
                     del active_searchers[candidate]
                     continue
-                for ancestor in ancestors:
-                    if ancestor in candidate_heads:
-                        candidate_heads.remove(ancestor)
-                        del searchers[ancestor]
-                        if ancestor in active_searchers:
-                            del active_searchers[ancestor]
+            # process found nodes
+            new_common = set()
+            for ancestor in ancestors:
+                if ancestor in candidate_heads:
+                    candidate_heads.remove(ancestor)
+                    del searchers[ancestor]
+                    if ancestor in active_searchers:
+                        del active_searchers[ancestor]
+                # it may meet up with a known common node
+                if ancestor in common_walker.seen:
+                    # some searcher has encountered our known common nodes:
+                    # just stop it
+                    ancestor_set = set([ancestor])
+                    for searcher in searchers.itervalues():
+                        searcher.stop_searching_any(ancestor_set)
+                else:
+                    # or it may have been just reached by all the searchers:
                     for searcher in searchers.itervalues():
                         if ancestor not in searcher.seen:
                             break
                     else:
-                        # if this revision was seen by all searchers, then it
-                        # is a descendant of all candidates, so we can stop
-                        # searching it, and any seen ancestors
+                        # The final active searcher has just reached this node,
+                        # making it be known as a descendant of all candidates,
+                        # so we can stop searching it, and any seen ancestors
+                        new_common.add(ancestor)
                         for searcher in searchers.itervalues():
                             seen_ancestors =\
                                 searcher.find_seen_ancestors(ancestor)
                             searcher.stop_searching_any(seen_ancestors)
+            common_walker.start_searching(new_common)
         return candidate_heads
 
     def find_unique_lca(self, left_revision, right_revision):
@@ -306,69 +333,12 @@
     def is_ancestor(self, candidate_ancestor, candidate_descendant):
         """Determine whether a revision is an ancestor of another.
 
-        There are two possible outcomes: True and False, but there are three
-        possible relationships:
-
-        a) candidate_ancestor is an ancestor of candidate_descendant
-        b) candidate_ancestor is an descendant of candidate_descendant
-        c) candidate_ancestor is an sibling of candidate_descendant
-
-        To check for a, we walk from candidate_descendant, looking for
-        candidate_ancestor.
-
-        To check for b, we walk from candidate_ancestor, looking for
-        candidate_descendant.
-
-        To make a and b more efficient, we can stop any searches that hit
-        common ancestors.
-
-        If we exhaust our searches, but neither a or b is true, then c is true.
-
-        In order to find c efficiently, we must avoid searching from
-        candidate_descendant or candidate_ancestor into common ancestors.  But
-        if we don't search common ancestors at all, we won't know if we hit
-        common ancestors.  So we have a walker for common ancestors.  Note that
-        its searches are not required to terminate in order to determine c to
-        be true.
+        We answer this using heads() as heads() has the logic to perform the
+        smallest number of parent looksup to determine the ancestral
+        relationship between N revisions.
         """
-        ancestor_walker = self._make_breadth_first_searcher(
-            [candidate_ancestor])
-        descendant_walker = self._make_breadth_first_searcher(
-            [candidate_descendant])
-        common_walker = self._make_breadth_first_searcher([])
-        active_ancestor = True
-        active_descendant = True
-        while (active_ancestor or active_descendant):
-            new_common = set()
-            if active_descendant:
-                try:
-                    nodes = descendant_walker.next()
-                except StopIteration:
-                    active_descendant = False
-                else:
-                    if candidate_ancestor in nodes:
-                        return True
-                    new_common.update(nodes.intersection(ancestor_walker.seen))
-            if active_ancestor:
-                try:
-                    nodes = ancestor_walker.next()
-                except StopIteration:
-                    active_ancestor = False
-                else:
-                    if candidate_descendant in nodes:
-                        return False
-                    new_common.update(nodes.intersection(
-                        descendant_walker.seen))
-            try:
-                new_common.update(common_walker.next())
-            except StopIteration:
-                pass
-            for walker in (ancestor_walker, descendant_walker):
-                for node in new_common:
-                    c_ancestors = walker.find_seen_ancestors(node)
-                    walker.stop_searching_any(c_ancestors)
-                common_walker.start_searching(new_common)
-        return False
+        return set([candidate_descendant]) == self.heads(
+            [candidate_ancestor, candidate_descendant])
 
 
 class HeadsCache(object):
@@ -400,7 +370,7 @@
 
 
 class _BreadthFirstSearcher(object):
-    """Parallel search the breadth-first the ancestry of revisions.
+    """Parallel search breadth-first the ancestry of revisions.
 
     This class implements the iterator protocol, but additionally
     1. provides a set of seen ancestors, and

=== modified file 'bzrlib/tests/test_graph.py'
--- a/bzrlib/tests/test_graph.py	2007-09-03 22:17:20 +0000
+++ b/bzrlib/tests/test_graph.py	2007-10-22 05:44:49 +0000
@@ -453,3 +453,57 @@
                          graph.heads(['rev2a', 'rev3b']))
         self.assertEqual(set(['rev2c', 'rev3a']),
                          graph.heads(['rev2c', 'rev3a']))
+
+    def _run_heads_break_deeper(self, graph_dict, search):
+        """Run heads on a graph-as-a-dict.
+        
+        If the search asks for the parents of 'deeper' the test will fail.
+        """
+        class stub(object):
+            pass
+        def get_parents(keys):
+            result = []
+            for key in keys:
+                if key == 'deeper':
+                    self.fail('key deeper was accessed')
+                result.append(graph_dict[key])
+            return result
+        an_obj = stub()
+        an_obj.get_parents = get_parents
+        graph = _mod_graph.Graph(an_obj)
+        return graph.heads(search)
+
+    def test_heads_limits_search(self):
+        # test that a heads query does not search all of history
+        graph_dict = {
+            'left':['common'],
+            'right':['common'],
+            'common':['deeper'],
+        }
+        self.assertEqual(set(['left', 'right']),
+            self._run_heads_break_deeper(graph_dict, ['left', 'right']))
+
+    def test_heads_limits_search_assymetric(self):
+        # test that a heads query does not search all of history
+        graph_dict = {
+            'left':['midleft'],
+            'midleft':['common'],
+            'right':['common'],
+            'common':['aftercommon'],
+            'aftercommon':['deeper'],
+        }
+        self.assertEqual(set(['left', 'right']),
+            self._run_heads_break_deeper(graph_dict, ['left', 'right']))
+
+    def test_heads_limits_search_common_search_must_continue(self):
+        # test that common nodes are still queried, preventing
+        # all-the-way-to-origin behaviour in the following graph:
+        graph_dict = {
+            'h1':['shortcut', 'common1'],
+            'h2':['common1'],
+            'shortcut':['common2'],
+            'common1':['common2'],
+            'common2':['deeper'],
+        }
+        self.assertEqual(set(['h1', 'h2']),
+            self._run_heads_break_deeper(graph_dict, ['h1', 'h2']))