Rev 3801: Change how _check_remap works so it doesn't have to load all keys. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/chk_map
John Arbash Meinel
john at arbash-meinel.com
Tue Dec 2 22:05:56 GMT 2008
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/chk_map
------------------------------------------------------------
revno: 3801
revision-id: john at arbash-meinel.com-20081202220545-sflppo9zeho3z3j3
parent: john at arbash-meinel.com-20081202205054-ajsix3e8w8jgnnod
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk_map
timestamp: Tue 2008-12-02 16:05:45 -0600
message:
Change how _check_remap works so it doesn't have to load all keys.
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2008-12-02 20:50:54 +0000
+++ b/bzrlib/chk_map.py 2008-12-02 22:05:45 +0000
@@ -876,7 +876,13 @@
if len(self._items) == 1:
# this node is no longer needed:
return self._items.values()[0]
- # Logic for how unmap determines when it needs to rebuild
+ if isinstance(unmapped, InternalNode):
+ return self
+ return self._check_remap(store)
+
+ def _check_remap(self, store):
+ """Check if all keys contained by children fit in a single LeafNode."""
+ # Logic for how we determine when we need to rebuild
# 1) Implicitly unmap() is removing a key which means that the child
# nodes are going to be shrinking by some extent.
# 2) If all children are LeafNodes, it is possible that they could be
@@ -888,21 +894,7 @@
# have a chance of collapsing either.
# So a very cheap check is to just say if 'unmapped' is an
# InternalNode, we don't have to check further.
- if isinstance(unmapped, InternalNode):
- return self
- return self._check_remap(store)
-
- def _check_remap(self, store):
- # Check and see if all keys contained by children could be mapped into
- # a single LeafNode.
-
- # TODO: If any child node is an InternalNode, we know we can stop. Is
- # it cheaper to load all children and see if one is an
- # InternalNode, or is it cheaper to start adding keys into a
- # LeafNode and stop when it gets full...
- # TODO: Do a quick once-over the list of already-read children, and
- # check if any are an InternalNode. This will avoid any I/O some
- # of the time, and should be very cheap.
+
# TODO: Another alternative is to check the total size of all known
# LeafNodes. If there is some formula we can use to determine the
# final size without actually having to read in any more
@@ -915,21 +907,57 @@
new_leaf = LeafNode()
new_leaf.set_maximum_size(self._maximum_size)
new_leaf._key_width = self._key_width
- for node in self._iter_nodes(store):
- if isinstance(node, InternalNode):
- # We won't be able to collapse
- return self
- for key, value in node._items.iteritems():
- # TODO: it is expensive to have it actually do the split, when
- # all we care about is whether it *would* split. Refactor
- # the code a bit so we can just check without actually
- # having to do all the work
- next_prefix, new_nodes = new_leaf.map(store, key, value)
- if len(new_nodes) > 1:
- # These nodes cannot fit in a single leaf, so we are done
- return self
- else:
- assert new_leaf is new_nodes[0][1]
+ keys = {}
+ # There is some overlap with _iter_nodes here, but not a lot, and it
+ # allows us to do quick evaluation without paging everything in
+ for prefix, node in self._items.iteritems():
+ if type(node) == tuple:
+ keys[node] = prefix
+ else:
+ if isinstance(node, InternalNode):
+ # Without looking at any leaf nodes, we are sure
+ return self
+ for key, value in node._items.iteritems():
+ # TODO: it is expensive to have it actually do the split,
+ # when all we care about is whether it *would* split.
+ # Refactor the code a bit so we can just check
+ # without actually having to do all the work
+ next_prefix, new_nodes = new_leaf.map(store, key, value)
+ if len(new_nodes) > 1:
+ # These nodes cannot fit in a single leaf, so we are
+ # done
+ return self
+ else:
+ assert new_leaf is new_nodes[0][1]
+ # So far, everything fits. Page in the rest of the nodes, and see if it
+ # holds true.
+ if keys:
+ stream = store.get_record_stream(keys, 'unordered', True)
+ nodes = []
+ # Fully consume the stream, even if we could determine that we
+ # don't need to continue. We requested the bytes, we may as well
+ # use them
+ for record in stream:
+ node = _deserialise(record.get_bytes_as('fulltext'), record.key)
+ self._items[keys[record.key]] = node
+ nodes.append(node)
+ for node in nodes:
+ if isinstance(node, InternalNode):
+ # We know we won't fit
+ return self
+ for key, value in node._items.iteritems():
+ # TODO: it is expensive to have it actually do the split,
+ # when all we care about is whether it *would* split.
+ # Refactor the code a bit so we can just check
+ # without actually having to do all the work
+ next_prefix, new_nodes = new_leaf.map(store, key, value)
+ if len(new_nodes) > 1:
+ # These nodes cannot fit in a single leaf, so we are
+ # done
+ return self
+ else:
+ assert new_leaf is new_nodes[0][1]
+
# We have gone to every child, and everything fits in a single leaf
# node, we no longer need this internal node
return new_leaf
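
For readers skimming the interleaved hunks above, the new _check_remap boils
down to a two-phase check: first inspect only the children that are already in
memory, and only if they all still fit, page in the remaining children with a
single batched read. The following condensed sketch is reconstructed from this
diff for illustration only; it is not the committed source, and the _would_fit
helper and the _check_remap_condensed name are invented factorings that do not
exist in bzrlib.

def _check_remap_condensed(self, store):
    # Trial leaf: if every key held by the children maps into it without
    # forcing a split, this InternalNode can collapse into that one leaf.
    new_leaf = LeafNode()
    new_leaf.set_maximum_size(self._maximum_size)
    new_leaf._key_width = self._key_width

    def _would_fit(node):
        # Invented helper, only for this sketch: pour a child's items into
        # the trial leaf and report whether it stayed a single node.
        for key, value in node._items.iteritems():
            _, new_nodes = new_leaf.map(store, key, value)
            if len(new_nodes) > 1:
                return False
        return True

    unloaded = {}  # chk key tuple -> prefix, for children not yet read
    # Phase 1: look only at children that are already in memory.
    for prefix, node in self._items.iteritems():
        if type(node) == tuple:
            unloaded[node] = prefix      # just a reference, no I/O yet
        elif isinstance(node, InternalNode):
            return self                  # can never collapse past this
        elif not _would_fit(node):
            return self                  # already too big, and no I/O done
    # Phase 2: one batched read for whatever was not in memory.
    if unloaded:
        for record in store.get_record_stream(unloaded, 'unordered', True):
            node = _deserialise(record.get_bytes_as('fulltext'), record.key)
            self._items[unloaded[record.key]] = node   # cache for later use
            if isinstance(node, InternalNode) or not _would_fit(node):
                return self
    # Every key from every child fits in one leaf; collapse to it.
    return new_leaf

The actual patch differs in one deliberate detail: it fully consumes the record
stream and caches every fetched node before checking them, on the grounds that
the bytes were already requested and may as well be kept, whereas this sketch
bails out of the loop early.
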
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2008-12-02 20:50:54 +0000
+++ b/bzrlib/tests/test_chk_map.py 2008-12-02 22:05:45 +0000
@@ -368,7 +368,7 @@
chkmap._dump_tree())
self.assertCanonicalForm(chkmap)
- def test_stable_unmap_double_deep(self):
+ def test_unmap_double_deep(self):
store = self.get_chk_bytes()
chkmap = CHKMap(store, None)
# Should fit 3 keys per LeafNode
@@ -424,6 +424,111 @@
" ('abc',) 'v'\n",
chkmap._dump_tree())
+ def test_unmap_with_known_internal_node_doesnt_page(self):
+ store = self.get_chk_bytes()
+ chkmap = CHKMap(store, None)
+ # Should fit 3 keys per LeafNode
+ chkmap._root_node.set_maximum_size(30)
+ chkmap.map(('aaa',), 'v')
+ chkmap.map(('aab',), 'v')
+ chkmap.map(('aac',), 'v')
+ chkmap.map(('abc',), 'v')
+ chkmap.map(('acd',), 'v')
+ self.assertEqualDiff("'' InternalNode None\n"
+ " 'aa' InternalNode None\n"
+ " 'aaa' LeafNode None\n"
+ " ('aaa',) 'v'\n"
+ " 'aab' LeafNode None\n"
+ " ('aab',) 'v'\n"
+ " 'aac' LeafNode None\n"
+ " ('aac',) 'v'\n"
+ " 'ab' LeafNode None\n"
+ " ('abc',) 'v'\n"
+ " 'ac' LeafNode None\n"
+ " ('acd',) 'v'\n",
+ chkmap._dump_tree())
+ # Save everything to the map, and start over
+ chkmap = CHKMap(store, chkmap._save())
+ # Mapping an 'aa' key loads the internal node, but should not map the
+ # 'ab' and 'ac' nodes
+ chkmap.map(('aad',), 'v')
+ self.assertIsInstance(chkmap._root_node._items['aa'], InternalNode)
+ self.assertIsInstance(chkmap._root_node._items['ab'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['ac'], tuple)
+ # Unmapping 'acd' can notice that 'aa' is an InternalNode and not have
+ # to map in 'ab'
+ chkmap.unmap(('acd',))
+ self.assertIsInstance(chkmap._root_node._items['aa'], InternalNode)
+ self.assertIsInstance(chkmap._root_node._items['ab'], tuple)
+
+ def test_unmap_without_fitting_doesnt_page_in(self):
+ store = self.get_chk_bytes()
+ chkmap = CHKMap(store, None)
+ # Should fit 2 keys per LeafNode
+ chkmap._root_node.set_maximum_size(20)
+ chkmap.map(('aaa',), 'v')
+ chkmap.map(('aab',), 'v')
+ self.assertEqualDiff("'' InternalNode None\n"
+ " 'aaa' LeafNode None\n"
+ " ('aaa',) 'v'\n"
+ " 'aab' LeafNode None\n"
+ " ('aab',) 'v'\n",
+ chkmap._dump_tree())
+ # Save everything to the map, and start over
+ chkmap = CHKMap(store, chkmap._save())
+ chkmap.map(('aac',), 'v')
+ chkmap.map(('aad',), 'v')
+ chkmap.map(('aae',), 'v')
+ chkmap.map(('aaf',), 'v')
+ # At this point, the previous nodes should not be paged in, but the
+ # newly added nodes would be
+ self.assertIsInstance(chkmap._root_node._items['aaa'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aab'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aac'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aad'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aae'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aaf'], LeafNode)
+ # Now unmapping one of the new nodes will use only the already-paged-in
+ # nodes to determine that we don't need to do more.
+ chkmap.unmap(('aaf',))
+ self.assertIsInstance(chkmap._root_node._items['aaa'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aab'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aac'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aad'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aae'], LeafNode)
+
+ def test_unmap_pages_in_if_necessary(self):
+ store = self.get_chk_bytes()
+ chkmap = CHKMap(store, None)
+ # Should fit 2 keys per LeafNode
+ chkmap._root_node.set_maximum_size(20)
+ chkmap.map(('aaa',), 'v')
+ chkmap.map(('aab',), 'v')
+ chkmap.map(('aac',), 'v')
+ self.assertEqualDiff("'' InternalNode None\n"
+ " 'aaa' LeafNode None\n"
+ " ('aaa',) 'v'\n"
+ " 'aab' LeafNode None\n"
+ " ('aab',) 'v'\n"
+ " 'aac' LeafNode None\n"
+ " ('aac',) 'v'\n",
+ chkmap._dump_tree())
+ # Save everything to the map, and start over
+ chkmap = CHKMap(store, chkmap._save())
+ chkmap.map(('aad',), 'v')
+ # At this point, the previous nodes should not be paged in, but the
+ # newly added node would be
+ self.assertIsInstance(chkmap._root_node._items['aaa'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aab'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aac'], tuple)
+ self.assertIsInstance(chkmap._root_node._items['aad'], LeafNode)
+ # Unmapping the new node will check the existing nodes to see if they
+ # would fit, and find out that they do not
+ chkmap.unmap(('aad',))
+ self.assertIsInstance(chkmap._root_node._items['aaa'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aab'], LeafNode)
+ self.assertIsInstance(chkmap._root_node._items['aac'], LeafNode)
+
def test_iter_changes_empty_ab(self):
# Asking for changes between an empty dict to a dict with keys returns
# all the keys.
@@ -678,7 +783,7 @@
" ('aab',) 'value2'",
" 'b' LeafNode sha1:67f15d1dfa451d388ed08ff17b4f9578ba010d01",
" ('bbb',) 'value3'",
- ]), chkmap._dump_tree())
+ ""]), chkmap._dump_tree())
def test__dump_tree_in_progress(self):
chkmap = self._get_map({("aaa",): "value1", ("aab",): "value2"},
@@ -695,7 +800,7 @@
" ('aab',) 'value2'",
" 'b' LeafNode None",
" ('bbb',) 'value3'",
- ]), chkmap._dump_tree())
+ ""]), chkmap._dump_tree())
class TestLeafNode(TestCaseWithStore):
@@ -1187,7 +1292,7 @@
" 'aab' LeafNode sha1:10567a3bfcc764fb8d8d9edaa28c0934ada366c5\n"
" ('aab',) 'new'\n"
" 'c' LeafNode sha1:263208de2fce0a8f9db614c1ca39e8f6de8b3802\n"
- " ('c',) 'common'",
+ " ('c',) 'common'\n",
CHKMap(self.get_chk_bytes(), target)._dump_tree())
# The key for the internal aa node
aa_key = ('sha1:2ce01860338a614b93883a5bbeb89920137ac7ef',)
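
A note on how the new tests observe paging: InternalNode._items maps each child
prefix either to a bare chk key tuple (the child has not been read from the
store yet) or to a deserialised LeafNode/InternalNode (it has been paged in),
which is why the assertions above use assertIsInstance against tuple versus
LeafNode. A small illustrative helper, not part of bzrlib, makes the convention
explicit:

def loaded_children(chkmap):
    # Illustrative only: list the root's children that are already
    # deserialised.  A tuple value is still just a chk key reference;
    # any node instance means the child was paged in.
    return sorted(prefix
                  for prefix, child in chkmap._root_node._items.iteritems()
                  if not isinstance(child, tuple))

In test_unmap_with_known_internal_node_doesnt_page, for example, this would
report 'aa' but not 'ab' after the unmap(('acd',)) call, matching the
assertions above.
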