Rev 3754: Partial multi-layer chk dictionary trees. in http://people.ubuntu.com/~robertc/baz2.0/repository

Robert Collins robertc at robertcollins.net
Wed Nov 5 06:31:11 GMT 2008


At http://people.ubuntu.com/~robertc/baz2.0/repository

------------------------------------------------------------
revno: 3754
revision-id: robertc at robertcollins.net-20081105063100-t4ghwsz33mycoisz
parent: robertc at robertcollins.net-20081022064059-wb4jp5sdx2t2whn5
committer: Robert Collins <robertc at robertcollins.net>
branch nick: repository
timestamp: Wed 2008-11-05 17:31:00 +1100
message:
  Partial multi-layer chk dictionary trees.
modified:
  bzrlib/chk_map.py              chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
  bzrlib/tests/per_repository_chk/test_supported.py test_supported.py-20080925063728-k65ry0n2rhta6t34-1
  bzrlib/tests/test_chk_map.py   test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
  bzrlib/tests/test_inv.py       testinv.py-20050722220913-1dc326138d1a5892
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py	2008-10-22 06:40:59 +0000
+++ b/bzrlib/chk_map.py	2008-11-05 06:31:00 +0000
@@ -67,18 +67,23 @@
         return stream.next().get_bytes_as('fulltext')
 
     @classmethod
-    def from_dict(klass, store, initial_value):
+    def from_dict(klass, store, initial_value, maximum_size=0):
         """Create a CHKMap in store with initial_value as the content.
         
         :param store: The store to record initial_value in, a VersionedFiles
             object with 1-tuple keys supporting CHK key generation.
         :param initial_value: A dict to store in store. Its keys and values
             must be bytestrings.
+        :param maximum_size: The maximum_size rule to apply to nodes. This
+            determines the size at which no new data is added to a single node.
+        :return: The root chk of the resulting CHKMap.
         """
         result = CHKMap(store, None)
+        result._root_node.set_maximum_size(maximum_size)
+        delta = []
         for key, value in initial_value.items():
-            result._map(key, value)
+            delta.append((None, key, value))
+        result.apply_delta(delta)
         return result._save()
 
     def iteritems(self, key_filter=None):
@@ -141,13 +146,295 @@
         return result
 
 
-class RootNode(object):
+class Node(object):
+    """Base class defining the protocol for CHK Map nodes."""
+
+    def __init__(self, key_width=1):
+        """Create a node.
+
+        :param key_width: The width of keys for this node.
+        """
+        self._key = None
+        # Current number of elements
+        self._len = 0
+        self._maximum_size = 0
+        self._key_width = 1
+        # current size in bytes
+        self._size = 0
+        # The pointers/values this node has - meaning defined by child classes.
+        self._items = {}
+
+    def __len__(self):
+        return self._len
+
+    @property
+    def maximum_size(self):
+        """What is the upper limit for adding references to a node."""
+        return self._maximum_size
+
+    def set_maximum_size(self, new_size):
+        """Set the size threshold for nodes.
+
+        :param new_size: The size at which no data is added to a node. 0 for
+            unlimited.
+        """
+        self._maximum_size = new_size
+
+
+class LeafNode(Node):
+    """A node containing actual key:value pairs."""
+
+    def __init__(self):
+        Node.__init__(self)
+        # The size of a leaf node with default values and no children.
+        self._size = 12
+
+    def _current_size(self):
+        """Answer the current serialised size of this node."""
+        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
+            len(str(self._maximum_size)))
+
+    @classmethod
+    def deserialise(klass, bytes, key):
+        """Deserialise bytes, with key key, into a LeafNode.
+
+        :param bytes: The bytes of the node.
+        :param key: The key that the serialised node has.
+        """
+        result = LeafNode()
+        lines = bytes.splitlines()
+        items = {}
+        if lines[0] != 'chkleaf:':
+            raise ValueError("not a serialised leaf node: %r" % bytes)
+        maximum_size = int(lines[1])
+        width = int(lines[2])
+        length = int(lines[3])
+        for line in lines[4:]:
+            elements = line.split('\x00')
+            items[tuple(elements[:-1])] = elements[-1]
+        if len(items) != length:
+            raise AssertionError("item count mismatch")
+        result._items = items
+        result._len = length
+        result._maximum_size = maximum_size
+        result._key = key
+        result._key_width = width
+        result._size = len(bytes)
+        return result
+
+    def iteritems(self, key_filter=None):
+        if key_filter is not None:
+            for item in self._items.iteritems():
+                if item[0] in key_filter:
+                    yield item
+        else:
+            for item in self._items.iteritems():
+                yield item
+
+    def key(self):
+        return self._key
+
+    def map(self, key, value):
+        """Map key to value."""
+        if key in self._items:
+            self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
+            self._len -= 1
+        self._items[key] = value
+        self._size += 2 + len('\x00'.join(key)) + len(value)
+        self._len += 1
+        self._key = None
+        if (self._maximum_size and self._current_size() > self._maximum_size and
+            self._len > 1):
+            result = InternalNode()
+            result.set_maximum_size(self._maximum_size)
+            result._key_width = self._key_width
+            result._add_node(self)
+            return result
+        else:
+            return self
+
+    def serialise(self, store):
+        """Serialise the tree to store.
+
+        :param store: A VersionedFiles honouring the CHK extensions.
+        :return: An iterable of the keys inserted by this operation.
+        """
+        lines = ["chkleaf:\n"]
+        lines.append("%d\n" % self._maximum_size)
+        lines.append("%d\n" % self._key_width)
+        lines.append("%d\n" % self._len)
+        for key, value in sorted(self._items.items()):
+            lines.append("%s\x00%s\n" % ('\x00'.join(key), value))
+        sha1, _, _ = store.add_lines((None,), (), lines)
+        self._key = ("sha1:" + sha1,)
+        return [self._key]
+
+    def unmap(self, key):
+        """Unmap key from the node."""
+        self._size -= 2 + len('\x00'.join(key)) + len(self._items[key])
+        self._len -= 1
+        del self._items[key]
+        self._key = None
+        return self
+
+
+class InternalNode(Node):
+    """A node that contains references to other nodes."""
+
+    def __init__(self):
+        Node.__init__(self)
+        # The size of an internalnode with default values and no children.
+        # self._size = 12
+        # How many octets key prefixes within this node are.
+        self._node_width = 0
+
+    def _add_node(self, node):
+        """Add a node to the InternalNode.
+
+        :param node: An existing node to add. The node will be examined to see
+            if it is over or undersized and rebalanced if needed across this
+            nodes children.
+        """
+        if self._len == 0:
+            # new tree level, we're being populated by upspill from an overfull
+            # tree.
+            # Cheap-to-code-but-slow?
+            elements = {}
+            max_width = 0
+            # suck in all the values
+            for key, value in node.iteritems():
+                # We work on the serialised keys
+                serialised_key = '\x00'.join(key)
+                elements[serialised_key] = (key, value)
+                max_width = max(len(serialised_key), max_width)
+            # Determine the maximum common key width we will internally handle.
+            # Start with the full key width; if that exceeds our node size
+            # shrink it until we are within the node limit.
+            self._node_width = max_width
+            width = self._node_width
+            # Populate all the resulting keys:
+            items = self._items
+            for serialised_key, key_value in elements.iteritems():
+                actual_key = self._serialised_key(key_value[0])
+                child = items.get(actual_key, None)
+                if not child:
+                    child = LeafNode()
+                    child.set_maximum_size(self._maximum_size)
+                    child._key_width = self._key_width
+                    items[actual_key] = child
+                child.map(key_value[0], key_value[1])
+                self._len += 1
+        else:
+            raise NotImplementedError(self._add_node)
+
+    def _current_size(self):
+        """Answer the current serialised size of this node."""
+        return (self._size + len(str(self._len)) + len(str(self._key_width)) +
+            len(str(self._maximum_size)))
+
+    def iteritems(self, key_filter=None):
+        if key_filter is None:
+            for child in self._items.itervalues():
+                for item in child.iteritems():
+                    yield item
+        else:
+            serialised_filter = set([self._serialised_key(key) for key in
+                key_filter])
+            for key, child in self._items.iteritems():
+                if key in serialised_filter:
+                    for item in child.iteritems(key_filter):
+                        yield item
+
+    def map(self, key, value):
+        """Map key to value."""
+        serialised_key = self._serialised_key(key)
+        try:
+            child = self._items[serialised_key]
+        except KeyError:
+            child = LeafNode()
+            child.set_maximum_size(self._maximum_size)
+            child._key_width = self._key_width
+            self._items[serialised_key] = child
+        old_len = len(child)
+        new_child = child.map(key, value)
+        # TODO: rebalance/enforce balance
+        if new_child is not child:
+            # The child has exceeded its size; if we take more bytes off the
+            # key prefix for the child, that may fit into our node;
+            # How many more bytes can we fit?
+            remaining_size = max(0, self.maximum_size - self._current_size())
+            size_per_row = (self._node_width + 45 + 2)
+            # without increasing the key width:
+            extra_rows = remaining_size / size_per_row
+            if extra_rows:
+                # What is the minimum node width increase to split new_child:
+                offset_bytes = [1]
+                offset = self._node_width - 1
+                while len(offset_bytes) == 1 and offset < new_child._node_width:
+                    offset += 1
+                    offset_bytes = set(child_key[offset] for child_key in
+                        new_child._items.keys())
+                if len(offset_bytes) > 1:
+                    # We've found the fan out point
+                    increase = self._node_width - offset
+                    # calculate how many more pointers we need to carry
+                    new_keys = len(offset_bytes)
+                    for subnode in self._items.values():
+                        new_keys += subnode._key_count(self._node_width, offset)
+                    if (new_keys * (offset + 45 + 2) +
+                        self._prelude_size() > self._maximum_size):
+                        # can't fit it all, accept the new child
+                        self._items[serialised_key] = new_child
+                    else:
+                        # increasing the 
+                        pass
+                else:
+                    # it didn't fan out! wtf!
+                    raise AssertionError("no fan out")
+            else:
+                # leave it split
+                self._items[serialised_key] = new_child
+        self._len += 1
+        return self
+
+    def _serialised_key(self, key):
+        """Return the serialised key for key in this node."""
+        return ('\x00'.join(key) + '\x00'*self._node_width)[:self._node_width]
+
+    def _key_count(self, parent_width, cut_width):
+        """Return the number of keys in/under this node between two widths.
+
+        :param parent_width: The start offset in keys to consider.
+        :param cut_width: The width to stop considering at.
+        """
+        if cut_width > self._node_width:
+            raise NotImplementedError(self._key_count)
+        # Generate a list of unique substrings
+
+
+    def unmap(self, key):
+        """Remove key from this node and its children."""
+        serialised_key = self._serialised_key(key)
+        child = self._items[serialised_key]
+        new_child = child.unmap(key)
+        # TODO shrink/rebalance
+        if not len(new_child):
+            del self._items[serialised_key]
+            if len(self._items) == 1:
+                return self._items.values()[0]
+        elif new_child is not child:
+            self._items[serialised_key] = new_child
+        return self
+
+
+class RootNode(Node):
     """A root node in a CHKMap."""
 
     def __init__(self):
+        Node.__init__(self)
         self._nodes = {}
-        self._key = None
-        self._len = 0
+        self._size = 12
+        self.prefix_width = 0
 
     def add_child(self, name, child):
         """Add a child to the node.
@@ -157,9 +444,20 @@
         :param name: The name of the child. A bytestring.
         :param child: The child, a key tuple for the childs value.
         """
+        if self._maximum_size and self._current_size() >= self._maximum_size:
+            return False
+        if name in self._nodes:
+            self.remove_child(name)
         self._nodes[name] = child
         self._len += 1
         self._key = None
+        self._size += len(name) + len(child[0]) + 2
+        return True
+
+    def _current_size(self):
+        """Answer the current serialised size of this node."""
+        return (self._size + len(str(self._maximum_size)) + len(str(self._len))
+            + len(str(self.prefix_width)))
 
     def deserialise(self, bytes, key):
         """Set the nodes value to that contained in bytes.
@@ -171,16 +469,17 @@
         nodes = {}
         if lines[0] != 'chkroot:':
             raise ValueError("not a serialised root node: %r" % bytes)
-        length = int(lines[1])
-        for line in lines[2:]:
+        maximum_size = int(lines[1])
+        prefix_width = int(lines[2])
+        length = int(lines[3])
+        for line in lines[4:]:
             name, value = line.split('\x00')
             nodes[name] = (value,)
         self._nodes = nodes
         self._len = length
+        self._maximum_size = maximum_size
         self._key = key
-
-    def __len__(self):
-        return self._len
+        self.prefix_width = prefix_width
 
     def refs(self):
         """Get the CHK key references this node holds."""
@@ -193,7 +492,8 @@
 
         :param name: The name to remove from the node.
         """
-        del self._nodes[name]
+        node = self._nodes.pop(name)
+        self._size -= 2 + len(name) + len(node[0])
         self._len -= 1
         self._key = None
 
@@ -203,6 +503,8 @@
         :return: A bytestring.
         """
         lines = ["chkroot:\n"]
+        lines.append("%d\n" % self._maximum_size)
+        lines.append("%d\n" % self.prefix_width)
         lines.append("%d\n" % self._len)
         for name, child in sorted(self._nodes.items()):
             lines.append("%s\x00%s\n" % (name, child[0]))

=== modified file 'bzrlib/tests/per_repository_chk/test_supported.py'
--- a/bzrlib/tests/per_repository_chk/test_supported.py	2008-10-22 06:40:59 +0000
+++ b/bzrlib/tests/per_repository_chk/test_supported.py	2008-11-05 06:31:00 +0000
@@ -65,13 +65,15 @@
             repo.unlock()
 
     def test_pack_preserves_chk_bytes_store(self):
+        expected_set = set([('sha1:062ef095245d8617d7671e50a71b529d13d93022',),
+            ('sha1:d0826cf765ff45bd602f602ecb0efbe375ea3b50',)])
         repo = self.make_repository('.')
         repo.lock_write()
         try:
             repo.start_write_group()
             try:
-                repo.chk_bytes.add_lines((None,), None, ["chkroot:\n", "0\n"],
-                    random_id=True)
+                repo.chk_bytes.add_lines((None,), None,
+                    ["chkroot:\n", "0\n", "0\n"], random_id=True)
             except:
                 repo.abort_write_group()
                 raise
@@ -87,19 +89,13 @@
             else:
                 repo.commit_write_group()
             repo.pack()
-            self.assertEqual(
-                set([('sha1:062ef095245d8617d7671e50a71b529d13d93022',),
-                     ('sha1:8e13ef930ff710425830ffd5077146b259b49534',)]),
-                repo.chk_bytes.keys())
+            self.assertEqual(expected_set, repo.chk_bytes.keys())
         finally:
             repo.unlock()
         # and reopening
         repo = repo.bzrdir.open_repository()
         repo.lock_read()
         try:
-            self.assertEqual(
-                set([('sha1:062ef095245d8617d7671e50a71b529d13d93022',),
-                     ('sha1:8e13ef930ff710425830ffd5077146b259b49534',)]),
-                repo.chk_bytes.keys())
+            self.assertEqual(expected_set, repo.chk_bytes.keys())
         finally:
             repo.unlock()

=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py	2008-10-22 06:40:59 +0000
+++ b/bzrlib/tests/test_chk_map.py	2008-11-05 06:31:00 +0000
@@ -16,11 +16,11 @@
 
 """Tests for maps built on a CHK versionedfiles facility."""
 
-from bzrlib.chk_map import CHKMap, RootNode, ValueNode
+from bzrlib.chk_map import CHKMap, RootNode, InternalNode, LeafNode, ValueNode
 from bzrlib.tests import TestCaseWithTransport
 
 
-class TestDumbMap(TestCaseWithTransport):
+class TestCaseWithStore(TestCaseWithTransport):
 
     def get_chk_bytes(self):
         # The eassiest way to get a CHK store is a development3 repository and
@@ -36,10 +36,16 @@
         stream = chk_bytes.get_record_stream([key], 'unordered', True)
         return stream.next().get_bytes_as("fulltext")
 
+    def to_dict(self, node):
+        return dict(node.iteritems())
+
+
+class TestMap(TestCaseWithStore):
+
     def assertHasABMap(self, chk_bytes):
-        root_key = ('sha1:674c986f86eacc7d9e4fbee45eda625e5689f9a4',)
+        root_key = ('sha1:29f1da33ce2323d754485fd308abc5ff17f3856e',)
         self.assertEqual(
-            "chkroot:\n1\na\x00sha1:cb29f32e561a1b7f862c38ccfd6bc7c7d892f04b\n",
+            "chkroot:\n0\n1\na\x00sha1:cb29f32e561a1b7f862c38ccfd6bc7c7d892f04b\n",
             self.read_bytes(chk_bytes, root_key))
         self.assertEqual(
             "chkvalue:\nb",
@@ -47,26 +53,26 @@
                 ("sha1:cb29f32e561a1b7f862c38ccfd6bc7c7d892f04b",)))
 
     def assertHasEmptyMap(self, chk_bytes):
-        root_key = ('sha1:8e13ef930ff710425830ffd5077146b259b49534',)
-        self.assertEqual("chkroot:\n0\n", self.read_bytes(chk_bytes, root_key))
+        root_key = ('sha1:d0826cf765ff45bd602f602ecb0efbe375ea3b50',)
+        self.assertEqual("chkroot:\n0\n0\n", self.read_bytes(chk_bytes, root_key))
 
-    def _get_map(self, a_dict):
+    def _get_map(self, a_dict, maximum_size=0):
         chk_bytes = self.get_chk_bytes()
-        root_key = CHKMap.from_dict(chk_bytes, a_dict)
+        root_key = CHKMap.from_dict(chk_bytes, a_dict, maximum_size=maximum_size)
         chkmap = CHKMap(chk_bytes, root_key)
         return chkmap
 
     def test_from_dict_empty(self):
         chk_bytes = self.get_chk_bytes()
         root_key = CHKMap.from_dict(chk_bytes, {})
-        self.assertEqual(('sha1:8e13ef930ff710425830ffd5077146b259b49534',),
+        self.assertEqual(('sha1:d0826cf765ff45bd602f602ecb0efbe375ea3b50',),
             root_key)
         self.assertHasEmptyMap(chk_bytes)
 
     def test_from_dict_ab(self):
         chk_bytes = self.get_chk_bytes()
         root_key = CHKMap.from_dict(chk_bytes, {"a":"b"})
-        self.assertEqual(('sha1:674c986f86eacc7d9e4fbee45eda625e5689f9a4',),
+        self.assertEqual(('sha1:29f1da33ce2323d754485fd308abc5ff17f3856e',),
             root_key)
         self.assertHasABMap(chk_bytes)
 
@@ -77,7 +83,7 @@
         root_key = CHKMap.from_dict(chk_bytes, {})
         chkmap = CHKMap(chk_bytes, root_key)
         new_root = chkmap.apply_delta([(None, "a", "b")])
-        self.assertEqual(('sha1:674c986f86eacc7d9e4fbee45eda625e5689f9a4',),
+        self.assertEqual(('sha1:29f1da33ce2323d754485fd308abc5ff17f3856e',),
             new_root)
         self.assertHasABMap(chk_bytes)
         # The update should have left us with an in memory root node, with an
@@ -91,7 +97,7 @@
         root_key = CHKMap.from_dict(chk_bytes, {"a":"b"})
         chkmap = CHKMap(chk_bytes, root_key)
         new_root = chkmap.apply_delta([("a", None, None)])
-        self.assertEqual(('sha1:8e13ef930ff710425830ffd5077146b259b49534',),
+        self.assertEqual(('sha1:d0826cf765ff45bd602f602ecb0efbe375ea3b50',),
             new_root)
         self.assertHasEmptyMap(chk_bytes)
         # The update should have left us with an in memory root node, with an
@@ -122,16 +128,76 @@
         self.assertEqual(0, len(chkmap))
 
     def test___len__2(self):
-        chkmap = self._get_map( {"foo":"bar", "gam":"quux"})
-        self.assertEqual(2, len(chkmap))
+        chkmap = self._get_map({"foo":"bar", "gam":"quux"})
+        self.assertEqual(2, len(chkmap))
+
+    def test_max_size_100_bytes(self):
+        # When there is a 100 byte upper node limit, a tree is formed.
+        chkmap = self._get_map({"k1"*50:"v1", "k2"*50:"v2"}, maximum_size=100)
+        # We expect three nodes:
+        # A root, with two children, and with two key prefixes - k1 to one, and
+        # k2 to the other as our node splitting is only just being developed.
+        # The maximum size should be embedded
+        chkmap._ensure_root()
+        self.assertEqual(100, chkmap._root_node.maximum_size)
+        # There should be two child nodes, and prefix of 2(bytes):
+        self.assertEqual(2, len(chkmap._root_node._nodes))
+        self.assertEqual(2, chkmap._root_node.prefix_length)
+        # The actual nodes pointed at will change as serialisers change; so
+        # here we test that the key prefix is correct; then load the nodes and
+        # check they have the right pointed at key; whether they have the
+        # pointed at value inline or not is also unrelated to this test so we
+        # don't check that.
+        nodes = sorted(chkmap._root_node._nodes.items())
+        ptr1 = nodes[0]
+        ptr2 = nodes[1]
+        self.assertEqual('k1', ptr1[0])
+        self.assertEqual('k2', ptr2[0])
+        node1 = chk_map._deserialise(chkmap._read_bytes(ptr1[1]), ptr1[1])
+        self.assertEqual(1, len(node1._nodes))
+        self.assertEqual(['k1'*50], node1._nodes.keys())
+        node2 = chk_map._deserialise(chkmap._read_bytes(ptr2[1]), ptr2[1])
+        self.assertEqual(1, len(node2._nodes))
+        self.assertEqual(['k2'*50], node2._nodes.keys())
+        # Having checked we have a good structure, check that the content is
+        # still accessible.
+        self.assertEqual(2, len(chkmap))
+        self.assertEqual([("k1"*50, "v1"), ("k2"*50, "v2")],
+            sorted(list(chkmap.iteritems())))
 
 
 class TestRootNode(TestCaseWithTransport):
 
+    def test__current_size(self):
+        node = RootNode()
+        self.assertEqual(15, node._current_size())
+        node.add_child("cd", ("sha1:12345",))
+        self.assertEqual(29, node._current_size())
+        self.assertEqual(29, len(node.serialise()))
+        node.add_child("cd", ("sha1:123456",))
+        self.assertEqual(30, node._current_size())
+        self.assertEqual(30, len(node.serialise()))
+        node.remove_child("cd")
+        self.assertEqual(15, node._current_size())
+        self.assertEqual(15, len(node.serialise()))
+        node.set_maximum_size(100)
+        self.assertEqual(17, node._current_size())
+
     def test_serialise_empty(self):
         node = RootNode()
         bytes = node.serialise()
-        self.assertEqual("chkroot:\n0\n", bytes)
+        self.assertEqual("chkroot:\n0\n0\n0\n", bytes)
+
+    def test_add_child_over_limit(self):
+        node = RootNode()
+        node.set_maximum_size(20)
+        node.add_child("abcdef", ("sha1:12345",))
+        size = node._current_size()
+        self.assertTrue(20 < size)
+        self.assertEqual(False, node.add_child("12345", ("sha1:34",)))
+        # Nothing should have changed
+        self.assertEqual(size, node._current_size())
+        self.assertEqual(1, len(node))
 
     def test_add_child_resets_key(self):
         node = RootNode()
@@ -139,6 +205,11 @@
         node.add_child("c", ("sha1:1234",))
         self.assertEqual(None, node._key)
 
+    def test_add_child_returns_True(self):
+        node = RootNode()
+        node._key = ("something",)
+        self.assertEqual(True, node.add_child("c", ("sha1:1234",)))
+
     def test_add_child_increases_len(self):
         node = RootNode()
         node._key = ("something",)
@@ -171,16 +242,28 @@
         # deserialising from a bytestring & key sets the nodes and the known
         # key.
         node = RootNode()
-        node.deserialise("chkroot:\n1\nc\x00sha1:1234\n", ("foo",))
+        node.deserialise("chkroot:\n0\n0\n1\nc\x00sha1:1234\n", ("foo",))
         self.assertEqual({"c": ("sha1:1234",)}, node._nodes)
         self.assertEqual(("foo",), node._key)
         self.assertEqual(1, len(node))
+        self.assertEqual(0, node.maximum_size)
 
     def test_serialise_with_child(self):
         node = RootNode()
         node.add_child("c", ("sha1:1234",))
         bytes = node.serialise()
-        self.assertEqual("chkroot:\n1\nc\x00sha1:1234\n", bytes)
+        # type 0-max-length 1-value key\x00CHK
+        self.assertEqual("chkroot:\n0\n0\n1\nc\x00sha1:1234\n", bytes)
+
+    def test_deserialise_max_size(self):
+        node = RootNode()
+        node.deserialise("chkroot:\n100\n0\n1\nc\x00sha1:1234\n", ("foo",))
+        self.assertEqual(100, node.maximum_size)
+
+    def test_deserialise_key_prefix(self):
+        node = RootNode()
+        node.deserialise("chkroot:\n100\n10\n1\nc\x00sha1:1234\n", ("foo",))
+        self.assertEqual(10, node.prefix_width)
 
 
 class TestValueNode(TestCaseWithTransport):
@@ -193,3 +276,297 @@
         node = ValueNode("b")
         bytes = node.serialise()
         self.assertEqual("chkvalue:\nb", bytes)
+
+
+class TestLeafNode(TestCaseWithStore):
+
+    def test_current_size_empty(self):
+        node = LeafNode()
+        self.assertEqual(15, node._current_size())
+
+    def test_current_size_size_changed(self):
+        node = LeafNode()
+        node.set_maximum_size(10)
+        self.assertEqual(16, node._current_size())
+
+    def test_current_size_width_changed(self):
+        node = LeafNode()
+        node._key_width = 10
+        self.assertEqual(16, node._current_size())
+
+    def test_current_size_items(self):
+        node = LeafNode()
+        base_size = node._current_size()
+        node = node.map(("foo bar",), "baz")
+        self.assertEqual(base_size + 12, node._current_size())
+
+    def test_deserialise_empty(self):
+        node = LeafNode.deserialise("chkleaf:\n10\n1\n0\n", ("sha1:1234",))
+        self.assertEqual(0, len(node))
+        self.assertEqual(10, node.maximum_size)
+        self.assertEqual(("sha1:1234",), node.key())
+
+    def test_deserialise_items(self):
+        node = LeafNode.deserialise(
+            "chkleaf:\n0\n1\n2\nfoo bar\x00baz\nquux\x00blarh\n", ("sha1:1234",))
+        self.assertEqual(2, len(node))
+        self.assertEqual([(("foo bar",), "baz"), (("quux",), "blarh")],
+            sorted(node.iteritems()))
+
+    def test_iteritems_selected_one_of_two_items(self):
+        node = LeafNode.deserialise(
+            "chkleaf:\n0\n1\n2\nfoo bar\x00baz\nquux\x00blarh\n", ("sha1:1234",))
+        self.assertEqual(2, len(node))
+        self.assertEqual([(("quux",), "blarh")],
+            sorted(node.iteritems([("quux",), ("qaz",)])))
+
+    def test_key_new(self):
+        node = LeafNode()
+        self.assertEqual(None, node.key())
+
+    def test_key_after_map(self):
+        node = LeafNode.deserialise("chkleaf:\n10\n1\n0\n", ("sha1:1234",))
+        node = node.map(("foo bar",), "baz quux")
+        self.assertEqual(None, node.key())
+
+    def test_key_after_unmap(self):
+        node = LeafNode.deserialise(
+            "chkleaf:\n0\n1\n2\nfoo bar\x00baz\nquux\x00blarh\n", ("sha1:1234",))
+        node = node.unmap(("foo bar",))
+        self.assertEqual(None, node.key())
+
+    def test_map_exceeding_max_size_only_entry(self):
+        node = LeafNode()
+        node.set_maximum_size(10)
+        result = node.map(("foo bar",), "baz quux")
+        self.assertEqual(result, node)
+        self.assertTrue(10 < result._current_size())
+
+    def test_map_exceeding_max_size_second_entry_early_difference(self):
+        node = LeafNode()
+        node.set_maximum_size(10)
+        node = node.map(("foo bar",), "baz quux")
+        result = node.map(("blue",), "red")
+        self.assertIsInstance(result, InternalNode)
+        # should have copied the data in:
+        self.assertEqual(2, len(result))
+        self.assertEqual({('blue',): 'red', ('foo bar',): 'baz quux'},
+            self.to_dict(result))
+        self.assertEqual(10, result.maximum_size)
+        self.assertEqual(1, result._key_width)
+
+    def test_map_exceeding_max_size_second_entry_last_octect_changed(self):
+        node = LeafNode()
+        node.set_maximum_size(10)
+        node = node.map(("foo bar",), "baz quux")
+        result = node.map(("foo baz",), "red")
+        self.assertIsInstance(result, InternalNode)
+        # should have copied the data in:
+        self.assertEqual(2, len(result))
+        self.assertEqual({('foo baz',): 'red', ('foo bar',): 'baz quux'},
+            self.to_dict(result))
+        self.assertEqual(10, result.maximum_size)
+        self.assertEqual(1, result._key_width)
+
+    def test_map_first(self):
+        node = LeafNode()
+        result = node.map(("foo bar",), "baz quux")
+        self.assertEqual(result, node)
+        self.assertEqual({("foo bar",):"baz quux"}, self.to_dict(node))
+        self.assertEqual(1, len(node))
+
+    def test_map_second(self):
+        node = LeafNode()
+        node = node.map(("foo bar",), "baz quux")
+        result = node.map(("bingo",), "bango")
+        self.assertEqual(result, node)
+        self.assertEqual({("foo bar",):"baz quux", ("bingo",):"bango"},
+            self.to_dict(node))
+        self.assertEqual(2, len(node))
+
+    def test_map_replacement(self):
+        node = LeafNode()
+        node = node.map(("foo bar",), "baz quux")
+        result = node.map(("foo bar",), "bango")
+        self.assertEqual(result, node)
+        self.assertEqual({("foo bar",): "bango"},
+            self.to_dict(node))
+        self.assertEqual(1, len(node))
+
+    def test_serialise_empty(self):
+        store = self.get_chk_bytes()
+        node = LeafNode()
+        node.set_maximum_size(10)
+        expected_key = ("sha1:62cc3565b48b0e830216e652cf99c6bd6b05b4b9",)
+        self.assertEqual([expected_key],
+            list(node.serialise(store)))
+        self.assertEqual("chkleaf:\n10\n1\n0\n", self.read_bytes(store, expected_key))
+        self.assertEqual(expected_key, node.key())
+
+    def test_serialise_items(self):
+        store = self.get_chk_bytes()
+        node = LeafNode()
+        node.set_maximum_size(10)
+        node = node.map(("foo bar",), "baz quux")
+        expected_key = ("sha1:d44cb6f0299b7e047da7f9e98f810e98f1dce1a7",)
+        self.assertEqual([expected_key],
+            list(node.serialise(store)))
+        self.assertEqual("chkleaf:\n10\n1\n1\nfoo bar\x00baz quux\n",
+            self.read_bytes(store, expected_key))
+        self.assertEqual(expected_key, node.key())
+
+    def test_unmap_missing(self):
+        node = LeafNode()
+        self.assertRaises(KeyError, node.unmap, ("foo bar",))
+
+    def test_unmap_present(self):
+        node = LeafNode()
+        node = node.map(("foo bar",), "baz quux")
+        result = node.unmap(("foo bar",))
+        self.assertEqual(result, node)
+        self.assertEqual({}, self.to_dict(node))
+        self.assertEqual(0, len(node))
+
+
+class TestInternalNode(TestCaseWithStore):
+
+    def test_add_node_empty_oversized_no_common_sets_prefix(self):
+        # adding a node with two children that is oversized will generate two
+        # new leaf nodes, and a prefix width that cuts one byte off the longest
+        # key (because that is sufficient to guarantee a split).
+        overpacked = LeafNode()
+        overpacked.set_maximum_size(10)
+        overpacked.map(("foo bar",), "baz")
+        overpacked.map(("strange thing",), "it is")
+        # at this point, map returned a new internal node that is already
+        # packed, but that should have preserved the old node due to the
+        # functional idioms. Check to be sure:
+        self.assertTrue(overpacked.maximum_size < overpacked._current_size())
+        node = InternalNode()
+        # We're not testing that the internal node rebalances yet
+        node.set_maximum_size(0)
+        node._add_node(overpacked)
+        # 13 is the length of strange_thing serialised; as there is no node size
+        # set, we pack the internal node as densely as possible.
+        self.assertEqual(13, node._node_width)
+        self.assertEqual(set(["strange thing", "foo bar\x00\x00\x00\x00\x00\x00"]),
+            set(node._items.keys()))
+        self.assertEqual(2, len(node))
+        self.assertEqual({('strange thing',): 'it is'},
+            self.to_dict(node._items["strange thing"]))
+        self.assertEqual({('foo bar',): 'baz'},
+            self.to_dict(node._items["foo bar\x00\x00\x00\x00\x00\x00"]))
+
+    def test_iteritems_empty(self):
+        node = InternalNode()
+        self.assertEqual([], sorted(node.iteritems()))
+
+    def test_iteritems_two_children(self):
+        node = InternalNode()
+        leaf1 = LeafNode()
+        leaf1.map(('foo bar',), 'quux')
+        leaf2 = LeafNode()
+        leaf2 = LeafNode()
+        leaf2.map(('strange',), 'beast')
+        node._items['foo ba'] = leaf1
+        node._items['strang'] = leaf2
+        self.assertEqual([(('foo bar',), 'quux'), (('strange',), 'beast')],
+            sorted(node.iteritems()))
+
+    def test_iteritems_two_children_partial(self):
+        node = InternalNode()
+        leaf2 = LeafNode()
+        leaf2 = LeafNode()
+        leaf2.map(('strange',), 'beast')
+        # This sets up a path that should not be followed - it will error if
+        # the code tries to.
+        node._items['foo ba'] = None
+        node._items['strang'] = leaf2
+        node._node_width = 6
+        self.assertEqual([(('strange',), 'beast')],
+            sorted(node.iteritems([('strange',), ('weird',)])))
+
+    def test_iteritems_partial_empty(self):
+        node = InternalNode()
+        self.assertEqual([], sorted(node.iteritems([('missing',)])))
+
+    def test_map_to_existing_child(self):
+        # mapping a new key which is in a child of an internal node maps
+        # recursively.
+        overpacked = LeafNode()
+        overpacked.set_maximum_size(10)
+        overpacked.map(("foo bar",), "baz")
+        node = overpacked.map(("foo baz",), "it is")
+        self.assertIsInstance(node, InternalNode)
+        # Now, increase the maximum size limit on the subnode for foo bar
+        child = node._items[node._serialised_key(("foo bar",))]
+        child.set_maximum_size(200)
+        # And map a new key into node, which will land in the same child node
+        result = node.map(("foo bar baz",), "new value")
+        self.assertTrue(result is node)
+        self.assertEqual(3, len(result))
+        self.assertEqual(2, len(child))
+        self.assertEqual({('foo bar',): 'baz',
+            ('foo bar baz',): 'new value', ('foo baz',): 'it is'},
+            self.to_dict(node))
+
+    def test_map_to_existing_child_exceed_child_size_not_internal_size(self):
+        # mapping a new key which is in a child of an internal node maps
+        # recursively, and when the child splits that is accommodated within the
+        # internal node if there is room for another child pointer.
+        overpacked = LeafNode()
+        overpacked.set_maximum_size(40)
+        overpacked.map(("foo bar",), "baz baz baz baz baz baz baz")
+        node = overpacked.map(("foo baz",), "it is it is it is it is it is")
+        self.assertIsInstance(node, InternalNode)
+        # And map a new key into node, which will land in the same child path
+        # within node, but trigger a spill event on the child, and should end
+        # up with 3 pointers in node (as the pointers can fit in the node
+        # space).
+        result = node.map(("foo bar baz",),
+            "new value new value new value new value new value")
+        self.assertTrue(result is node)
+        self.assertEqual(3, len(result))
+        # We should have one child for foo bar
+        child = node._items[node._serialised_key(("foo bar\x00",))]
+        self.assertIsInstance(child, LeafNode)
+        self.assertEqual(1, len(child))
+        # And one for 'foo bar '
+        child = node._items[node._serialised_key(("foo bar ",))]
+        self.assertIsInstance(child, LeafNode)
+        self.assertEqual(1, len(child))
+        self.assertEqual({('foo bar',): 'baz baz baz baz baz baz baz',
+            ('foo bar baz',): 'new value new value new value',
+            ('foo baz',): 'it is it is it is it is it is'},
+            self.to_dict(node))
+
+    def test_map_to_new_child(self):
+        # mapping a new key which is in a child of an internal node maps
+        # recursively.
+        overpacked = LeafNode()
+        overpacked.set_maximum_size(10)
+        overpacked.map(("foo bar",), "baz")
+        node = overpacked.map(("foo baz",), "it is")
+        self.assertIsInstance(node, InternalNode)
+        # Map a new key into node, which will land in a new child node
+        result = node.map(("quux",), "new value")
+        # Retrieve the child node that quux landed in
+        child = node._items[node._serialised_key(("quux",))]
+        self.assertTrue(result is node)
+        self.assertEqual(3, len(result))
+        self.assertEqual(1, len(child))
+        self.assertEqual({('foo bar',): 'baz',
+            ('quux',): 'new value', ('foo baz',): 'it is'},
+            self.to_dict(node))
+
+    def test_unmap_second_last_shrinks_to_other_branch(self):
+        # unmapping the second last child of an internal node downgrades it to
+        # a leaf node.
+        overpacked = LeafNode()
+        overpacked.set_maximum_size(10)
+        overpacked.map(("foo bar",), "baz")
+        node = overpacked.map(("strange thing",), "it is")
+        self.assertIsInstance(node, InternalNode)
+        result = node.unmap(("foo bar",))
+        self.assertIsInstance(result, LeafNode)
+        self.assertEqual({("strange thing",): "it is"}, self.to_dict(result))

=== modified file 'bzrlib/tests/test_inv.py'
--- a/bzrlib/tests/test_inv.py	2008-10-22 06:40:59 +0000
+++ b/bzrlib/tests/test_inv.py	2008-11-05 06:31:00 +0000
@@ -219,7 +219,7 @@
             'chkinventory:\n',
             'revision_id: foo\n',
             'root_id: TREE_ROOT\n',
-            'id_to_entry: sha1:6fc2838d88d40dc7331087fb69d3ea72c023b3ee\n'
+            'id_to_entry: sha1:3c84f56e89a6089ee0c6cc25becdcaa368e83632\n'
             ],
             chk_inv.to_lines())
 




More information about the bazaar-commits mailing list