Rev 52: Filter out tips we've already found interesting (in http://bzr.arbash-meinel.com/plugins/history_db)

Wed Apr 7 16:08:00 BST 2010

At http://bzr.arbash-meinel.com/plugins/history_db

------------------------------------------------------------
revno: 52
revision-id: john at arbash-meinel.com-20100407150745-2t3a9o66annnsxdo
parent: john at arbash-meinel.com-20100406215530-ellqwfhrk1n2qy55
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: history_db
timestamp: Wed 2010-04-07 10:07:45 -0500
message:
  Filter out tips we've already found interesting.
-------------- next part --------------
=== modified file 'history_db.py'

--- a/history_db.py	2010-04-06 21:55:30 +0000
+++ b/history_db.py	2010-04-07 15:07:45 +0000
@@ -642,12 +642,15 @@
             "SELECT parent, gdfo FROM parent, revision"
             " WHERE parent=db_id AND child IN (%s)",
             len(self._search_tips)), list(self._search_tips)).fetchall()
-        # XXX:  Filter out search tips that we've already searched via a
-        #       different path, either entries already in the old _search_tips,
-        #       or something in _interesting_ancestor_ids, etc. Note that by
-        #       construction, everything in _search_tips should be in
-        #       _interesting_ancestor_ids...
-        self._search_tips = set([r[0] for r in res])
+        # Filter out search tips that we've already searched via a different
+        # path. By construction, if we are stepping the search tips, we know
+        # that all previous search tips are either in
+        # self._imported_dotted_revno or in self._interesting_ancestor_ids.
+        # _imported_dotted_revno will be filtered in the first
+        # _split_search_tips_by_gdfo call, so we just filter out already
+        # interesting ones.
+        interesting = self._interesting_ancestor_ids
+        self._search_tips = set([r[0] for r in res if r[0] not in interesting])
         # TODO: For search tips we will be removing, we don't need to join
         #       against revision since we should already have them. There may
         #       be other ways that we already know gdfo. It may be cheaper to

=== modified file 'test_importer.py'
--- a/test_importer.py	2010-04-06 21:55:30 +0000
+++ b/test_importer.py	2010-04-07 15:07:45 +0000
@@ -235,7 +235,6 @@
         b._tip_revision = 'D' # Something older
         importer = history_db.Importer(':memory:', b, incremental=False)
         importer.do_import()
-        D_id = importer._rev_id_to_db_id['D']
         self.assertEqual(1, importer._cursor.execute(
             "SELECT count(*) FROM dotted_revno, revision"
             " WHERE tip_revision = merged_revision"
@@ -244,7 +243,6 @@
         # Now work on just importing G
         importer._update_ancestry('G')
         self.grab_interesting_ids(importer._rev_id_to_db_id)
-        G_id = importer._rev_id_to_db_id['G']
         inc_importer = history_db._IncrementalImporter(importer, self.G_id)
         inc_importer._find_needed_mainline()
         self.assertEqual([self.G_id], inc_importer._mainline_db_ids)
@@ -295,7 +293,6 @@
         inc_importer._step_mainline()
         self.assertEqual(self.A_id, inc_importer._imported_mainline_id)
         self.assertEqual(1, inc_importer._imported_gdfo)
-        self.C_id = importer._rev_id_to_db_id['C']
         self.assertEqual({self.D_id: ((2,), 0, 0), self.C_id: ((1,1,2), 0, 1),
                           self.B_id: ((1,1,1), 1, 1),
                          }, inc_importer._imported_dotted_revno)
@@ -342,3 +339,47 @@
                          }, inc_importer._imported_dotted_revno)
         self.assertEqual({(2,): self.D_id, (1,1,2): self.C_id,
                           (1,1,1): self.B_id}, inc_importer._dotted_to_db_id)
+
+    def test__incremental_step_skips_already_seen(self):
+        # Simpler graph:
+        # A
+        # |
+        # B
+        # |\
+        # | C
+        # | |\
+        # | D E
+        # |/|/
+        # F G
+        # |/
+        # H
+        # In this case, the first step should go to G & D. When stepping from
+        # there, G => D should not continue on D, since it has already been
+        # seen, but we should still include E.
+        ancestry = {'A': (),
+                    'B': ('A',),
+                    'C': ('B',),
+                    'D': ('C',),
+                    'E': ('C',),
+                    'F': ('B', 'D'),
+                    'G': ('D', 'E'),
+                    'H': ('F', 'G'),
+                    }
+        b = MockBranch(ancestry, 'B')
+        importer = history_db.Importer(':memory:', b, incremental=False)
+        importer.do_import()
+        importer._update_ancestry('H')
+        self.grab_interesting_ids(importer._rev_id_to_db_id)
+        inc_importer = history_db._IncrementalImporter(importer, self.H_id)
+        inc_importer._find_needed_mainline()
+        self.assertEqual([self.H_id, self.F_id], inc_importer._mainline_db_ids)
+        self.assertEqual(self.B_id, inc_importer._imported_mainline_id)
+        inc_importer._get_initial_search_tips()
+        self.assertEqual(set([self.D_id, self.G_id]), inc_importer._search_tips)
+        # Both have higher-than-mainline gdfos
+        self.assertEqual([],
+                 inc_importer._split_search_tips_by_gdfo([self.D_id, self.G_id]))
+        inc_importer._step_search_tips()
+        # It should want to include D_id, but it should know that we've already
+        # been there
+        self.assertEqual(set([self.C_id, self.E_id]), inc_importer._search_tips)