Rev 3902: Fix InternalNode._iter_nodes to split the key filter based on matches. in http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core
John Arbash Meinel
john at arbash-meinel.com
Tue Mar 24 16:55:56 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core
------------------------------------------------------------
revno: 3902
revision-id: john at arbash-meinel.com-20090324165313-3sokh8vdzypsm7cj
parent: ian.clatworthy at internode.on.net-20090324091647-12lm127tpvhk6d5l
parent: john at arbash-meinel.com-20090324164828-l02cq5o9eyewy5g7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: brisbane-core
timestamp: Tue 2009-03-24 11:53:13 -0500
message:
Fix InternalNode._iter_nodes to split the key filter based on matches.
Update LeafNode.iteritems() to use a dict lookup when the key width is correct.
Shaves a reasonable amount of time off of stuff that wants to look at large
portions of the inventory.
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
------------------------------------------------------------
revno: 3899.1.3
revision-id: john at arbash-meinel.com-20090324164828-l02cq5o9eyewy5g7
parent: john at arbash-meinel.com-20090323230249-s07xna9l5l7cqez3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Tue 2009-03-24 11:48:28 -0500
message:
review comments from Ian.
modified:
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
------------------------------------------------------------
revno: 3899.1.2
revision-id: john at arbash-meinel.com-20090323230249-s07xna9l5l7cqez3
parent: john at arbash-meinel.com-20090323225113-kvefl5nlj1uataj3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Mon 2009-03-23 18:02:49 -0500
message:
change the LeafNode iteritems() code so that it directly returns
non-prefix matches.
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
------------------------------------------------------------
revno: 3899.1.1
revision-id: john at arbash-meinel.com-20090323225113-kvefl5nlj1uataj3
parent: john at arbash-meinel.com-20090323215037-oyxc3d940dw0rnv7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Mon 2009-03-23 17:51:13 -0500
message:
Update _iter_nodes so that it splits the key_filter into the ones that matched.
This should be a first step for preventing the LC^2 performance we saw for ls -l.
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-03-19 19:31:06 +0000
+++ b/bzrlib/chk_map.py 2009-03-23 23:02:49 +0000
@@ -654,19 +654,30 @@
list/set/dict or similar repeatedly iterable container.
"""
if key_filter is not None:
- # Adjust the filter - short elements go to a prefix filter. Would this
- # be cleaner explicitly? That would be no harder for InternalNode..
+ # Adjust the filter - short elements go to a prefix filter. All
+ # other items are looked up directly.
# XXX: perhaps defaultdict? Profiling<rinse and repeat>
filters = {}
for key in key_filter:
- length_filter = filters.setdefault(len(key), set())
- length_filter.add(key)
- filters = filters.items()
- for item in self._items.iteritems():
- for length, length_filter in filters:
- if item[0][:length] in length_filter:
- yield item
- break
+ if len(key) == self._key_width:
+ # This filter is meant to match exactly one key, yield it
+ # if we have it.
+ try:
+ yield key, self._items[key]
+ except KeyError:
+ # This key is not present in this map, continue
+ pass
+ else:
+ # Short items, we need to match based on a prefix
+ length_filter = filters.setdefault(len(key), set())
+ length_filter.add(key)
+ if filters:
+ filters = filters.items()
+ for item in self._items.iteritems():
+ for length, length_filter in filters:
+ if item[0][:length] in length_filter:
+ yield item
+ break
else:
for item in self._items.iteritems():
yield item
@@ -919,8 +930,8 @@
search_key_func=search_key_func)
def iteritems(self, store, key_filter=None):
- for node in self._iter_nodes(store, key_filter=key_filter):
- for item in node.iteritems(store, key_filter=key_filter):
+ for node, node_filter in self._iter_nodes(store, key_filter=key_filter):
+ for item in node.iteritems(store, key_filter=node_filter):
yield item
def _iter_nodes(self, store, key_filter=None, batch_size=None):
@@ -935,30 +946,38 @@
:return: An iterable of nodes. This function does not have to be fully
consumed. (There will be no pending I/O when items are being returned.)
"""
+ # Map from chk key ('sha1:...',) to (prefix, key_filter)
+ # prefix is the key in self._items to use, key_filter is the key_filter
+ # entries that would match this node
keys = {}
if key_filter is None:
for prefix, node in self._items.iteritems():
if type(node) == tuple:
- keys[node] = prefix
+ keys[node] = (prefix, None)
else:
- yield node
+ yield node, None
else:
# XXX defaultdict ?
+ prefix_to_keys = {}
length_filters = {}
for key in key_filter:
search_key = self._search_prefix_filter(key)
length_filter = length_filters.setdefault(
len(search_key), set())
length_filter.add(search_key)
+ prefix_to_keys.setdefault(search_key, []).append(key)
length_filters = length_filters.items()
for prefix, node in self._items.iteritems():
+ node_key_filter = []
for length, length_filter in length_filters:
- if prefix[:length] in length_filter:
- if type(node) == tuple:
- keys[node] = prefix
- else:
- yield node
- break
+ sub_prefix = prefix[:length]
+ if sub_prefix in length_filter:
+ node_key_filter.extend(prefix_to_keys[sub_prefix])
+ if node_key_filter: # this key matched something, yield it
+ if type(node) == tuple:
+ keys[node] = (prefix, node_key_filter)
+ else:
+ yield node, node_key_filter
if keys:
# Look in the page cache for some more bytes
found_keys = set()
@@ -970,9 +989,10 @@
else:
node = _deserialise(bytes, key,
search_key_func=self._search_key_func)
- self._items[keys[key]] = node
+ prefix, node_key_filter = keys[key]
+ self._items[prefix] = node
found_keys.add(key)
- yield node
+ yield node, node_key_filter
for key in found_keys:
del keys[key]
if keys:
@@ -986,16 +1006,17 @@
# We have to fully consume the stream so there is no pending
# I/O, so we buffer the nodes for now.
stream = store.get_record_stream(batch, 'unordered', True)
- nodes = []
+ node_and_filters = []
for record in stream:
bytes = record.get_bytes_as('fulltext')
node = _deserialise(bytes, record.key,
search_key_func=self._search_key_func)
- nodes.append(node)
- self._items[keys[record.key]] = node
+ prefix, node_key_filter = keys[record.key]
+ node_and_filters.append((node, node_key_filter))
+ self._items[prefix] = node
_page_cache.add(record.key, bytes)
- for node in nodes:
- yield node
+ for info in node_and_filters:
+ yield info
def map(self, store, key, value):
"""Map key to value."""
@@ -1018,7 +1039,8 @@
new_parent.add_node(self._search_prefix[:len(new_prefix)+1],
self)
return new_parent.map(store, key, value)
- children = list(self._iter_nodes(store, key_filter=[key]))
+ children = [node for node, _
+ in self._iter_nodes(store, key_filter=[key])]
if children:
child = children[0]
else:
@@ -1171,7 +1193,8 @@
"""Remove key from this node and it's children."""
if not len(self._items):
raise AssertionError("can't unmap in an empty InternalNode.")
- children = list(self._iter_nodes(store, key_filter=[key]))
+ children = [node for node, _
+ in self._iter_nodes(store, key_filter=[key])]
if children:
child = children[0]
else:
@@ -1235,7 +1258,7 @@
# b) With 16-way fan out, we can still do a single round trip
# c) With 255-way fan out, we don't want to read all 255 and destroy
# the page cache, just to determine that we really don't need it.
- for node in self._iter_nodes(store, batch_size=16):
+ for node, _ in self._iter_nodes(store, batch_size=16):
if isinstance(node, InternalNode):
# Without looking at any leaf nodes, we are sure
return self
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2009-03-10 01:24:49 +0000
+++ b/bzrlib/tests/test_chk_map.py 2009-03-24 16:48:28 +0000
@@ -123,8 +123,10 @@
# Internal nodes must have identical references
self.assertEqual(sorted(node_one._items.keys()),
sorted(node_two._items.keys()))
- node_one_stack.extend(node_one._iter_nodes(map_one._store))
- node_two_stack.extend(node_two._iter_nodes(map_two._store))
+ node_one_stack.extend([n for n, _ in
+ node_one._iter_nodes(map_one._store)])
+ node_two_stack.extend([n for n, _ in
+ node_two._iter_nodes(map_two._store)])
else:
# Leaf nodes must have identical contents
self.assertEqual(node_one._items, node_two._items)
@@ -1506,6 +1508,60 @@
# def test_add_node_two_oversized_third_kept_minimum_fan(self):
# def test_add_node_one_oversized_second_splits_errors(self):
+ def test__iter_nodes_no_key_filter(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "bar")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "baz")
+ node.add_node("b", child)
+
+ for child, node_key_filter in node._iter_nodes(None, key_filter=None):
+ self.assertEqual(None, node_key_filter)
+
+ def test__iter_nodes_splits_key_filter(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "bar")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "baz")
+ node.add_node("b", child)
+
+ # foo and bar both match exactly one leaf node, but 'cat' should not
+ # match any, and should not be placed in one.
+ key_filter = (('foo',), ('bar',), ('cat',))
+ for child, node_key_filter in node._iter_nodes(None,
+ key_filter=key_filter):
+ # each child could only match one key filter, so make sure it was
+ # properly filtered
+ self.assertEqual(1, len(node_key_filter))
+
+ def test__iter_nodes_with_multiple_matches(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "val")
+ child.map(None, ("fob",), "val")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "val")
+ child.map(None, ("baz",), "val")
+ node.add_node("b", child)
+
+ key_filter = (('foo',), ('fob',), ('bar',), ('baz',))
+ for child, node_key_filter in node._iter_nodes(None,
+ key_filter=key_filter):
+ # each child should match two key filters, so make sure they were
+ # both included.
+ self.assertEqual(2, len(node_key_filter))
+
def test_iteritems_empty_new(self):
node = InternalNode()
self.assertEqual([], sorted(node.iteritems(None)))
More information about the bazaar-commits
mailing list