Rev 4508: A few more updates. in http://bazaar.launchpad.net/~jameinel/bzr/1.17-chk-multilevel
John Arbash Meinel
john at arbash-meinel.com
Tue Jun 30 23:28:27 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/1.17-chk-multilevel
------------------------------------------------------------
revno: 4508
revision-id: john at arbash-meinel.com-20090630222821-yq4yubbcw0r5b1lz
parent: john at arbash-meinel.com-20090630211033-7tm7nmaz2xwwhcg0
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.17-chk-multilevel
timestamp: Tue 2009-06-30 17:28:21 -0500
message:
A few more updates.
Split out the interesting chks into a separate set.
We will still filter based on what is in it, but
now we can also filter out uninteresting items without
running into them.
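
To make that concrete, here is a minimal sketch (illustrative names and
values only, not the bzrlib API) of what "filter out uninteresting items
without running into them" means at the leaf level: items seen on the
uninteresting side are collected into a set, and interesting leaf items
are yielded only if they are absent from it.

# Illustrative sketch only; keys and values are made up.
all_uninteresting_items = set([(('aab',), 'basis value')])
leaf_items = [(('aab',), 'basis value'), (('aac',), 'new value')]
unique_items = [item for item in leaf_items
                if item not in all_uninteresting_items]
# unique_items == [(('aac',), 'new value')]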
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-06-30 21:10:33 +0000
+++ b/bzrlib/chk_map.py 2009-06-30 22:28:21 +0000
@@ -1418,6 +1418,9 @@
# it out, yet.
self._all_uninteresting_chks = set(self._uninteresting_root_keys)
self._all_uninteresting_items = set()
+ # These are interesting items which were either read, or already in the
+ # interesting queue (so don't add these refs again)
+ self._processed_interesting_refs = set()
self._interesting_queued_refs = set()
# TODO: use this or delete it
self._interesting_queued_items = set()
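
Previously, interesting keys were folded into _all_uninteresting_chks just
to suppress re-queueing, which overloaded that set's meaning. With the
split, the guard behaves like this toy version (hypothetical names that
mirror the real ones):

import heapq

all_uninteresting_chks = set(['sha1:shared'])
processed_interesting_refs = set()
interesting_queue = []

def queue_interesting(prefix, ref):
    if (ref in all_uninteresting_chks
        or ref in processed_interesting_refs):
        # Either shared with the basis, or already queued by another root.
        return
    processed_interesting_refs.add(ref)
    heapq.heappush(interesting_queue, (prefix, None, ref))

queue_interesting('a', 'sha1:shared')  # dropped: shared with the basis
queue_interesting('a', 'sha1:new')     # queued
queue_interesting('a', 'sha1:new')     # dropped: already queued
# interesting_queue == [('a', None, 'sha1:new')]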
@@ -1488,17 +1491,17 @@
interesting_prefixes = set()
# We are about to yield all of these, so we don't want them getting
# added a second time
- self._all_uninteresting_chks.update(interesting_keys)
+ self._processed_interesting_refs.update(interesting_keys)
for record, node, prefix_refs, items in \
self._read_nodes_from_store(interesting_keys):
# At this level, we now know all the uninteresting references
# So we can go ahead and filter, and queue up whatever is remaining
for prefix, ref in prefix_refs:
if (ref in self._all_uninteresting_chks
- or ref in self._interesting_queued_refs):
+ or ref in self._processed_interesting_refs):
# Either in the uninteresting set, or added by another root
continue
- self._interesting_queued_refs.add(ref)
+ self._processed_interesting_refs.add(ref)
interesting_prefixes.update(
[prefix[:i+1] for i in xrange(len(prefix))])
heapq.heappush(self._interesting_queue,
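
(For reference, the interesting_prefixes update in this hunk collects
every leading prefix of a queued ref's search prefix, e.g.:)

prefix = 'abc'
prefixes = [prefix[:i+1] for i in xrange(len(prefix))]
# prefixes == ['a', 'ab', 'abc']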
@@ -1554,9 +1557,14 @@
refs = refs.difference(self._all_uninteresting_chks)
all_uninteresting_chks = self._all_uninteresting_chks
+ processed_interesting_refs = self._processed_interesting_refs
all_uninteresting_items = self._all_uninteresting_items
while refs:
- all_uninteresting_chks.update(refs)
+ # TODO: add a test for this
+ # The idea is that we saw an item in the queue (say during
+ # the root load), and since then we have seen that we
+ # shouldn't walk it after all.
+ # refs = refs.difference(all_uninteresting_chks)
next_refs = set()
next_refs_update = next_refs.update
# Inlining _read_nodes_from_store improves 'bzr branch bzr.dev'
@@ -1568,6 +1576,8 @@
yield record, items
next_refs_update([i[1] for i in prefix_refs])
next_refs = next_refs.difference(all_uninteresting_chks)
+ next_refs = next_refs.difference(processed_interesting_refs)
+ processed_interesting_refs.update(next_refs)
refs = next_refs
def _process_next_uninteresting(self):
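
A toy run of the two-stage filter added to the flush loop above: strip
refs shared with the basis, then refs already read or queued, and
remember the survivors so later nodes cannot re-queue them.

all_uninteresting_chks = set(['sha1:shared'])
processed_interesting_refs = set(['sha1:seen'])
next_refs = set(['sha1:shared', 'sha1:seen', 'sha1:fresh'])
next_refs = next_refs.difference(all_uninteresting_chks)
next_refs = next_refs.difference(processed_interesting_refs)
processed_interesting_refs.update(next_refs)
# next_refs == set(['sha1:fresh'])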
@@ -1576,17 +1586,19 @@
# out uninteresting nodes that are not referenced by interesting
# items (we *do* currently filter out uninteresting nodes
# referenced from the root.)
- prefix, ref = heapq.heappop(self._uninteresting_queue)
- for record, node, prefix_refs, items in \
- self._read_nodes_from_store([ref]):
+ # For now, since we don't ever abort from loading uninteresting items,
+ # just read the whole thing in at once, so that we get a single
+ # request, instead of multiple
+ refs = [pr[1] for pr in self._uninteresting_queue]
+ self._uninteresting_queue = []
+ all_uninteresting_chks = self._all_uninteresting_chks
+ for record, _, prefix_refs, items in self._read_nodes_from_store(refs):
self._all_uninteresting_items.update(items)
- for prefix, ref in prefix_refs:
- # TODO: Get a test written that exercises this, and then
- # uncomment
- # if ref in self._all_uninteresting_chks:
- # continue
- self._all_uninteresting_chks.add(ref)
- heapq.heappush(self._uninteresting_queue, (prefix, ref))
+ prefix_refs = [pr for pr in prefix_refs
+ if pr[1] not in all_uninteresting_chks]
+ self._uninteresting_queue.extend(prefix_refs)
+ all_uninteresting_chks.update([pr[1] for pr in prefix_refs])
+ heapq.heapify(self._uninteresting_queue)
def _process_queues(self):
# Finish processing all of the items in the queue, for simplicity, we
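
The shape of the batched _process_next_uninteresting above, as a
standalone sketch (read_nodes stands in for _read_nodes_from_store):
drain the whole queue, issue one batched read, filter and extend, then
heapify once.

import heapq

def process_all_uninteresting(queue, read_nodes, seen_chks, seen_items):
    refs = [pr[1] for pr in queue]     # queue holds (prefix, ref) tuples
    del queue[:]                       # drain the queue in one go
    for record, node, prefix_refs, items in read_nodes(refs):
        seen_items.update(items)
        prefix_refs = [pr for pr in prefix_refs
                       if pr[1] not in seen_chks]
        queue.extend(prefix_refs)
        seen_chks.update([pr[1] for pr in prefix_refs])
    heapq.heapify(queue)               # restore heap order once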
@@ -1617,22 +1629,32 @@
if item not in self._all_uninteresting_items:
yield None, [item]
else:
- # Node, value == ref
+ # Node => value == ref
# if value in self._all_uninteresting_chks:
# continue
for record, node, prefix_refs, items in \
self._read_nodes_from_store([value]):
yield record, []
# This record has been yielded, mark it uninteresting
- self._all_uninteresting_chks.add(record.key)
+ assert record.key in self._processed_interesting_refs
for prefix, ref in prefix_refs:
if (ref in self._all_uninteresting_chks
- or ref in self._interesting_queued_refs):
+ or ref in self._processed_interesting_refs):
continue
- self._interesting_queued_refs.add(ref)
heapq.heappush(self._interesting_queue,
(prefix, None, ref))
+ self._processed_interesting_refs.add(ref)
# TODO: do we know if these are truly interesting yet?
+ # Initial guess is that we do not.
+ # interesting may reference 'abc' from a Leaf at
+ # 'a', while uninteresting may reference it from
+ # a leaf at 'ab', and unfortunately 'ab' comes
+ # before 'abc' but after 'a'.
+ # One possibility, check to see if the first
+ # uninteresting node has a prefix which is after
+ # the possible prefix for item. So if you had
+ # uninteresting at 'b' then you are sure you
+ # could just yield all items that begin with 'a'
for item in items:
if item in self._all_uninteresting_items:
continue
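
The ordering hazard in the TODO above comes down to plain string
comparison of prefixes, which is what orders the heap:

assert 'a' < 'ab' < 'abc'
# An interesting leaf at 'a' may reference items under 'abc', while an
# unread uninteresting leaf at 'ab' can still cover the same items, so
# popping 'a' first proves nothing about those items being unique.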
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2009-06-30 15:19:52 +0000
+++ b/bzrlib/tests/test_chk_map.py 2009-06-30 22:28:21 +0000
@@ -2442,6 +2442,38 @@
iterator._uninteresting_queue)
self.assertEqual([('a', None, key3_a)], iterator._interesting_queue)
+ def test__process_next_uninteresting_batched_no_dupes(self):
+ c_map = self.make_two_deep_map()
+ key1 = c_map.key()
+ c_map._dump_tree() # load everything
+ key1_a = c_map._root_node._items['a'].key()
+ key1_aa = c_map._root_node._items['a']._items['aa'].key()
+ key1_ab = c_map._root_node._items['a']._items['ab'].key()
+ key1_ac = c_map._root_node._items['a']._items['ac'].key()
+ key1_ad = c_map._root_node._items['a']._items['ad'].key()
+ c_map.map(('aaa',), 'new aaa value')
+ key2 = c_map._save()
+ key2_a = c_map._root_node._items['a'].key()
+ key2_aa = c_map._root_node._items['a']._items['aa'].key()
+ c_map.map(('acc',), 'new acc content')
+ key3 = c_map._save()
+ key3_a = c_map._root_node._items['a'].key()
+ key3_ac = c_map._root_node._items['a']._items['ac'].key()
+ iterator = self.get_iterator([key3], [key1, key2],
+ chk_map._search_key_plain)
+ root_results = [record.key for record in iterator._read_all_roots()]
+ self.assertEqual([key3], root_results)
+ self.assertEqual(sorted([('a', key1_a), ('a', key2_a)]),
+ sorted(iterator._uninteresting_queue))
+ self.assertEqual([('a', None, key3_a)], iterator._interesting_queue)
+ iterator._process_next_uninteresting()
+ # All of the uninteresting records should be brought in and queued up,
+ # but we should not have any duplicates
+ self.assertEqual(sorted([('aa', key1_aa), ('ab', key1_ab),
+ ('ac', key1_ac), ('ad', key1_ad),
+ ('aa', key2_aa),
+ ]), sorted(iterator._uninteresting_queue))
+
class TestIterInterestingNodes(TestCaseWithExampleMaps):