Rev 3803: Change InternalNode to always cache its serialized_prefix. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/prefix
John Arbash Meinel
john at arbash-meinel.com
Thu Dec 11 21:57:53 GMT 2008
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/prefix
------------------------------------------------------------
revno: 3803
revision-id: john at arbash-meinel.com-20081211215745-zri692db6y34p7re
parent: john at arbash-meinel.com-20081210053719-9dlacwww8y3cq8x1
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: prefix
timestamp: Thu 2008-12-11 15:57:45 -0600
message:
Change InternalNode to always cache its serialized_prefix.
-------------- next part --------------
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2008-12-04 21:54:34 +0000
+++ b/bzrlib/chk_map.py 2008-12-11 21:57:45 +0000
@@ -397,6 +397,8 @@
self._size = 0
# The pointers/values this node has - meaning defined by child classes.
self._items = {}
+ # The common serialised prefix
+ self._serialised_prefix = None
def __repr__(self):
items_str = sorted(self._items)
@@ -425,6 +427,18 @@
"""
self._maximum_size = new_size
+ @staticmethod
+ def common_prefix(key1, key2):
+ """Given 2 keys, return the longest prefix common to both."""
+ # Is there a better way to do this?
+ for pos, (left, right) in enumerate(zip(key1, key2)):
+ if left != right:
+ pos -= 1
+ break
+ common = key1[:pos+1]
+ assert key2.startswith(common)
+ return common
+
class LeafNode(Node):
"""A node containing actual key:value pairs.
@@ -516,6 +530,40 @@
return True
return False
+ def _split(self, store):
+ """We have overflowed.
+
+ Split this node into multiple LeafNodes, return it up the stack so that
+ the next layer creates a new InternalNode and references the new nodes.
+
+ :return: (common_serialized_prefix, [(node_serialized_prefix, node)])
+ """
+ common_prefix = self.unique_serialised_prefix()
+ split_at = len(common_prefix) + 1
+ result = {}
+ for key, value in self._items.iteritems():
+ serialised_key = self._serialised_key(key)
+ prefix = serialised_key[:split_at]
+ # TODO: Generally only 1 key can be exactly the right length,
+ # which means we can only have 1 key in the node pointed
+ # at by the 'prefix\0' key. We might want to consider
+ # folding it into the containing InternalNode rather than
+ # having a fixed length-1 node.
+ # Note this is probably not true for hash keys, as they
+ # may get a '\00' node anywhere, but won't have keys of
+ # different lengths.
+ if len(prefix) < split_at:
+ prefix += '\x00'*(split_at - len(prefix))
+ if prefix not in result:
+ node = LeafNode()
+ node.set_maximum_size(self._maximum_size)
+ node._key_width = self._key_width
+ result[prefix] = node
+ else:
+ node = result[prefix]
+ node.map(store, key, value)
+ return common_prefix, result.items()
+
def map(self, store, key, value):
"""Map key to value."""
if key in self._items:
@@ -523,31 +571,7 @@
self._len -= 1
self._key = None
if self._map_no_split(key, value):
- common_prefix = self.unique_serialised_prefix()
- split_at = len(common_prefix) + 1
- result = {}
- for key, value in self._items.iteritems():
- serialised_key = self._serialised_key(key)
- prefix = serialised_key[:split_at]
- # TODO: Generally only 1 key can be exactly the right length,
- # which means we can only have 1 key in the node pointed
- # at by the 'prefix\0' key. We might want to consider
- # folding it into the containing InternalNode rather than
- # having a fixed length-1 node.
- # Note this is probably not true for hash keys, as they
- # may get a '\00' node anywhere, but won't have keys of
- # different lengths.
- if len(prefix) < split_at:
- prefix += '\x00'*(split_at - len(prefix))
- if prefix not in result:
- node = LeafNode()
- node.set_maximum_size(self._maximum_size)
- node._key_width = self._key_width
- result[prefix] = node
- else:
- node = result[prefix]
- node.map(store, key, value)
- return common_prefix, result.items()
+ return self._split(store)
else:
return self.unique_serialised_prefix(), [("", self)]
@@ -590,11 +614,10 @@
current_prefix = self._serialised_key(keys.pop(-1))
while current_prefix and keys:
next_key = self._serialised_key(keys.pop(-1))
- for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
- if left != right:
- pos -= 1
- break
- current_prefix = current_prefix[:pos + 1]
+ if next_key.startswith(current_prefix):
+ # Don't need to do anything, the prefix already fits
+ continue
+ current_prefix = self.common_prefix(current_prefix, next_key)
return current_prefix
def unmap(self, store, key):
@@ -619,7 +642,7 @@
# self._size = 12
# How many octets key prefixes within this node are.
self._node_width = 0
- self._prefix = prefix
+ self._serialised_prefix = prefix
def __repr__(self):
items_str = sorted(self._items)
@@ -627,7 +650,7 @@
items_str = items_str[16] + '...]'
return '%s(key:%s len:%s size:%s max:%s prefix:%s items:%s)' % (
self.__class__.__name__, self._key, self._len, self._size,
- self._maximum_size, self._prefix, items_str)
+ self._maximum_size, self._serialised_prefix, items_str)
def add_node(self, prefix, node):
"""Add a child node with prefix prefix, and node node.
@@ -635,13 +658,13 @@
:param prefix: The serialised key prefix for node.
:param node: The node being added.
"""
- assert self._prefix is not None
- assert prefix.startswith(self._prefix)
- assert len(prefix) == len(self._prefix) + 1
+ assert self._serialised_prefix is not None
+ assert prefix.startswith(self._serialised_prefix)
+ assert len(prefix) == len(self._serialised_prefix) + 1
self._len += len(node)
if not len(self._items):
self._node_width = len(prefix)
- assert self._node_width == len(self._prefix) + 1
+ assert self._node_width == len(self._serialised_prefix) + 1
self._items[prefix] = node
self._key = None
@@ -676,7 +699,7 @@
result._key_width = width
result._size = len(bytes)
result._node_width = len(prefix)
- result._prefix = result.unique_serialised_prefix()
+ result._serialised_prefix = result.unique_serialised_prefix()
return result
def iteritems(self, store, key_filter=None):
@@ -748,26 +771,22 @@
if not len(self._items):
raise AssertionError("cant map in an empty InternalNode.")
serialised_key = self._serialised_key(key)
- assert self._node_width == len(self._prefix) + 1
- if not serialised_key.startswith(self._prefix):
+ assert self._node_width == len(self._serialised_prefix) + 1
+ if not serialised_key.startswith(self._serialised_prefix):
# This key doesn't fit in this index, so we need to split at the
- # point where it would fit.
- # XXX: Do we need the serialised_key in its maximum length?
- new_prefix = self.unique_serialised_prefix(serialised_key)
+ # point where it would fit, insert self into that internal node,
+ # and then map this key into that node.
+ new_prefix = self.common_prefix(self._serialised_prefix,
+ serialised_key)
new_parent = InternalNode(new_prefix)
new_parent.set_maximum_size(self._maximum_size)
new_parent._key_width = self._key_width
- new_parent.add_node(self._prefix[:len(new_prefix)+1], self)
- assert new_parent._node_width == len(new_parent._prefix) + 1
+ new_parent.add_node(self._serialised_prefix[:len(new_prefix)+1],
+ self)
return new_parent.map(store, key, value)
children = self._iter_nodes(store, key_filter=[key])
if children:
child = children[0]
- # if isinstance(child, InternalNode):
- # child_prefix = child._prefix
- # child_ser_key = child._serialised_key(key)
- # if not child_ser_key.startswith(child_prefix):
- # import pdb; pdb.set_trace()
else:
# new child needed:
child = self._new_child(serialised_key, LeafNode)
@@ -791,19 +810,21 @@
# at this level. Or the LeafNode has shrunk in size, so we need
# to check that as well.
new_node = self._check_remap(store)
- return new_node.unique_serialised_prefix(), [("", new_node)]
+ if new_node._serialised_prefix is None:
+ return new_node.unique_serialised_prefix(), [("", new_node)]
+ else:
+ return new_node._serialised_prefix, [('', new_node)]
# child has overflown - create a new intermediate node.
# XXX: This is where we might want to try and expand our depth
# to refer to more bytes of every child (which would give us
# multiple pointers to child nodes, but less intermediate nodes)
- # TODO: Is this mapped as serialised_key or as prefix?
child = self._new_child(serialised_key, InternalNode)
- child._prefix = prefix
+ child._serialised_prefix = prefix
for split, node in node_details:
child.add_node(split, node)
self._len = self._len - old_len + len(child)
self._key = None
- return self.unique_serialised_prefix(), [("", self)]
+ return self._serialised_prefix, [("", self)]
def _new_child(self, serialised_key, klass):
"""Create a new child node of type klass."""
@@ -896,11 +917,10 @@
current_prefix = keys.pop(-1)
while current_prefix and keys:
next_key = keys.pop(-1)
- for pos, (left, right) in enumerate(zip(current_prefix, next_key)):
- if left != right:
- pos -= 1
- break
- current_prefix = current_prefix[:pos + 1]
+ if next_key.startswith(current_prefix):
+ # Don't need to do anything, the prefix already fits
+ continue
+ current_prefix = self.common_prefix(current_prefix, next_key)
return current_prefix
def unmap(self, store, key):
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2008-12-02 23:44:34 +0000
+++ b/bzrlib/tests/test_chk_map.py 2008-12-11 21:57:45 +0000
@@ -86,7 +86,8 @@
# Internal nodes must have identical references
self.assertEqual(sorted(node_one._items.keys()),
sorted(node_two._items.keys()))
- self.assertEqual(node_one._prefix, node_two._prefix)
+ self.assertEqual(node_one._serialised_prefix,
+ node_two._serialised_prefix)
node_one_stack.extend(node_one._iter_nodes(map_one._store))
node_two_stack.extend(node_two._iter_nodes(map_two._store))
else:
More information about the bazaar-commits
mailing list