Rev 3902: Fix InternalNode._iter_nodes to split the key filter based on matches. in http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core
John Arbash Meinel
john at arbash-meinel.com
Tue Mar 24 16:55:56 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core
------------------------------------------------------------
revno: 3902
revision-id: john at arbash-meinel.com-20090324165313-3sokh8vdzypsm7cj
parent: ian.clatworthy at internode.on.net-20090324091647-12lm127tpvhk6d5l
parent: john at arbash-meinel.com-20090324164828-l02cq5o9eyewy5g7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: brisbane-core
timestamp: Tue 2009-03-24 11:53:13 -0500
message:
Fix InternalNode._iter_nodes to split the key filter based on matches.
Update LeafNode.iteritems() to use a dict lookup when the key width is correct.
Shaves a reasonable amount of time off of stuff that wants to look at large
portions of the inventory.
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
------------------------------------------------------------
revno: 3899.1.3
revision-id: john at arbash-meinel.com-20090324164828-l02cq5o9eyewy5g7
parent: john at arbash-meinel.com-20090323230249-s07xna9l5l7cqez3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Tue 2009-03-24 11:48:28 -0500
message:
review comments from Ian.
modified:
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
------------------------------------------------------------
revno: 3899.1.2
revision-id: john at arbash-meinel.com-20090323230249-s07xna9l5l7cqez3
parent: john at arbash-meinel.com-20090323225113-kvefl5nlj1uataj3
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Mon 2009-03-23 18:02:49 -0500
message:
change the LeafNode iteritems() code so that it directly returns
non-prefix matches.
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
------------------------------------------------------------
revno: 3899.1.1
revision-id: john at arbash-meinel.com-20090323225113-kvefl5nlj1uataj3
parent: john at arbash-meinel.com-20090323215037-oyxc3d940dw0rnv7
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: iter_changes_fixes
timestamp: Mon 2009-03-23 17:51:13 -0500
message:
Update _iter_nodes so that it splits the key_filter into the ones that matched.
This should be a first step for preventing the LC^2 performance we saw for ls -l.
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-03-19 19:31:06 +0000
+++ b/bzrlib/chk_map.py 2009-03-23 23:02:49 +0000
@@ -654,19 +654,30 @@
list/set/dict or similar repeatedly iterable container.
"""
if key_filter is not None:
- # Adjust the filter - short elements go to a prefix filter. Would this
- # be cleaner explicitly? That would be no harder for InternalNode..
+ # Adjust the filter - short elements go to a prefix filter. All
+ # other items are looked up directly.
# XXX: perhaps defaultdict? Profiling<rinse and repeat>
filters = {}
for key in key_filter:
- length_filter = filters.setdefault(len(key), set())
- length_filter.add(key)
- filters = filters.items()
- for item in self._items.iteritems():
- for length, length_filter in filters:
- if item[0][:length] in length_filter:
- yield item
- break
+ if len(key) == self._key_width:
+ # This filter is meant to match exactly one key, yield it
+ # if we have it.
+ try:
+ yield key, self._items[key]
+ except KeyError:
+ # This key is not present in this map, continue
+ pass
+ else:
+ # Short items, we need to match based on a prefix
+ length_filter = filters.setdefault(len(key), set())
+ length_filter.add(key)
+ if filters:
+ filters = filters.items()
+ for item in self._items.iteritems():
+ for length, length_filter in filters:
+ if item[0][:length] in length_filter:
+ yield item
+ break
else:
for item in self._items.iteritems():
yield item
@@ -919,8 +930,8 @@
search_key_func=search_key_func)
def iteritems(self, store, key_filter=None):
- for node in self._iter_nodes(store, key_filter=key_filter):
- for item in node.iteritems(store, key_filter=key_filter):
+ for node, node_filter in self._iter_nodes(store, key_filter=key_filter):
+ for item in node.iteritems(store, key_filter=node_filter):
yield item
def _iter_nodes(self, store, key_filter=None, batch_size=None):
@@ -935,30 +946,38 @@
:return: An iterable of nodes. This function does not have to be fully
consumed. (There will be no pending I/O when items are being returned.)
"""
+ # Map from chk key ('sha1:...',) to (prefix, key_filter)
+ # prefix is the key in self._items to use, key_filter is the key_filter
+ # entries that would match this node
keys = {}
if key_filter is None:
for prefix, node in self._items.iteritems():
if type(node) == tuple:
- keys[node] = prefix
+ keys[node] = (prefix, None)
else:
- yield node
+ yield node, None
else:
# XXX defaultdict ?
+ prefix_to_keys = {}
length_filters = {}
for key in key_filter:
search_key = self._search_prefix_filter(key)
length_filter = length_filters.setdefault(
len(search_key), set())
length_filter.add(search_key)
+ prefix_to_keys.setdefault(search_key, []).append(key)
length_filters = length_filters.items()
for prefix, node in self._items.iteritems():
+ node_key_filter = []
for length, length_filter in length_filters:
- if prefix[:length] in length_filter:
- if type(node) == tuple:
- keys[node] = prefix
- else:
- yield node
- break
+ sub_prefix = prefix[:length]
+ if sub_prefix in length_filter:
+ node_key_filter.extend(prefix_to_keys[sub_prefix])
+ if node_key_filter: # this key matched something, yield it
+ if type(node) == tuple:
+ keys[node] = (prefix, node_key_filter)
+ else:
+ yield node, node_key_filter
if keys:
# Look in the page cache for some more bytes
found_keys = set()
@@ -970,9 +989,10 @@
else:
node = _deserialise(bytes, key,
search_key_func=self._search_key_func)
- self._items[keys[key]] = node
+ prefix, node_key_filter = keys[key]
+ self._items[prefix] = node
found_keys.add(key)
- yield node
+ yield node, node_key_filter
for key in found_keys:
del keys[key]
if keys:
@@ -986,16 +1006,17 @@
# We have to fully consume the stream so there is no pending
# I/O, so we buffer the nodes for now.
stream = store.get_record_stream(batch, 'unordered', True)
- nodes = []
+ node_and_filters = []
for record in stream:
bytes = record.get_bytes_as('fulltext')
node = _deserialise(bytes, record.key,
search_key_func=self._search_key_func)
- nodes.append(node)
- self._items[keys[record.key]] = node
+ prefix, node_key_filter = keys[record.key]
+ node_and_filters.append((node, node_key_filter))
+ self._items[prefix] = node
_page_cache.add(record.key, bytes)
- for node in nodes:
- yield node
+ for info in node_and_filters:
+ yield info
def map(self, store, key, value):
"""Map key to value."""
@@ -1018,7 +1039,8 @@
new_parent.add_node(self._search_prefix[:len(new_prefix)+1],
self)
return new_parent.map(store, key, value)
- children = list(self._iter_nodes(store, key_filter=[key]))
+ children = [node for node, _
+ in self._iter_nodes(store, key_filter=[key])]
if children:
child = children[0]
else:
@@ -1171,7 +1193,8 @@
"""Remove key from this node and it's children."""
if not len(self._items):
raise AssertionError("can't unmap in an empty InternalNode.")
- children = list(self._iter_nodes(store, key_filter=[key]))
+ children = [node for node, _
+ in self._iter_nodes(store, key_filter=[key])]
if children:
child = children[0]
else:
@@ -1235,7 +1258,7 @@
# b) With 16-way fan out, we can still do a single round trip
# c) With 255-way fan out, we don't want to read all 255 and destroy
# the page cache, just to determine that we really don't need it.
- for node in self._iter_nodes(store, batch_size=16):
+ for node, _ in self._iter_nodes(store, batch_size=16):
if isinstance(node, InternalNode):
# Without looking at any leaf nodes, we are sure
return self
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2009-03-10 01:24:49 +0000
+++ b/bzrlib/tests/test_chk_map.py 2009-03-24 16:48:28 +0000
@@ -123,8 +123,10 @@
# Internal nodes must have identical references
self.assertEqual(sorted(node_one._items.keys()),
sorted(node_two._items.keys()))
- node_one_stack.extend(node_one._iter_nodes(map_one._store))
- node_two_stack.extend(node_two._iter_nodes(map_two._store))
+ node_one_stack.extend([n for n, _ in
+ node_one._iter_nodes(map_one._store)])
+ node_two_stack.extend([n for n, _ in
+ node_two._iter_nodes(map_two._store)])
else:
# Leaf nodes must have identical contents
self.assertEqual(node_one._items, node_two._items)
@@ -1506,6 +1508,60 @@
# def test_add_node_two_oversized_third_kept_minimum_fan(self):
# def test_add_node_one_oversized_second_splits_errors(self):
+ def test__iter_nodes_no_key_filter(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "bar")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "baz")
+ node.add_node("b", child)
+
+ for child, node_key_filter in node._iter_nodes(None, key_filter=None):
+ self.assertEqual(None, node_key_filter)
+
+ def test__iter_nodes_splits_key_filter(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "bar")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "baz")
+ node.add_node("b", child)
+
+ # foo and bar both match exactly one leaf node, but 'cat' should not
+ # match any, and should not be placed in one.
+ key_filter = (('foo',), ('bar',), ('cat',))
+ for child, node_key_filter in node._iter_nodes(None,
+ key_filter=key_filter):
+ # each child could only match one key filter, so make sure it was
+ # properly filtered
+ self.assertEqual(1, len(node_key_filter))
+
+ def test__iter_nodes_with_multiple_matches(self):
+ node = InternalNode('')
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("foo",), "val")
+ child.map(None, ("fob",), "val")
+ node.add_node("f", child)
+ child = LeafNode()
+ child.set_maximum_size(100)
+ child.map(None, ("bar",), "val")
+ child.map(None, ("baz",), "val")
+ node.add_node("b", child)
+
+ key_filter = (('foo',), ('fob',), ('bar',), ('baz',))
+ for child, node_key_filter in node._iter_nodes(None,
+ key_filter=key_filter):
+ # each child should match two key filters, so make sure they were
+ # both included.
+ self.assertEqual(2, len(node_key_filter))
+
def test_iteritems_empty_new(self):
node = InternalNode()
self.assertEqual([], sorted(node.iteritems(None)))
More information about the bazaar-commits
mailing list