Rev 3883: Add CHKInventory._extract_all_inventory in http://bzr.arbash-meinel.com/branches/bzr/brisbane/extract_all_inventory

Mon Mar 16 20:31:03 GMT 2009

At http://bzr.arbash-meinel.com/branches/bzr/brisbane/extract_all_inventory

------------------------------------------------------------
revno: 3883
revision-id: john at arbash-meinel.com-20090316202404-llokmiqbpiex0a38
parent: john at arbash-meinel.com-20090313073104-84v7k56qrkondv9o
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: extract_all_inventory
timestamp: Mon 2009-03-16 15:24:04 -0500
message:
  Add CHKInventory._extract_all_inventory
  
  CHKInventory is nice in that it can be lazy at extracting every object.
  However, when we *want* the whole inventory, it seems to be about 3x faster
  to do a single CHKMap.iteritems() call, and process all of the items in
  one go.
  
  The code isn't fully complete or tested, but it does make
  'bzr ls' time drop from 5s => 2s.
-------------- next part --------------
=== modified file 'bzrlib/inventory.py'

--- a/bzrlib/inventory.py	2009-03-12 23:34:38 +0000
+++ b/bzrlib/inventory.py	2009-03-16 20:24:04 +0000
@@ -1638,6 +1638,47 @@
                 (result.revision_id, expected_revision_id))
         return result
 
+    def _extract_all_inventory(self):
+        """Create the full inventory structure in a single pass.
+
+        This is used for things like iter_entries_by_dir, where we know we want
+        the entire inventory.
+        """
+        # TODO: We should have a way to know that the full inventory has been extracted
+        # These are nodes that we have read in, but we haven't found their parent yet
+        # This maps from the parent_id to a list of entries
+        parentless_nodes = {}
+        directories = {}
+        for file_id_key, bytes in self.id_to_entry.iteritems():
+            entry = self._bytes_to_entry(bytes)
+            # TODO: Do we want to populate _fileid_to_entry_cache?
+            #       Further, do we want to use the nodes that we have
+            #       already deserialised from there based on 'file_id'?
+            if entry.file_id != file_id_key[0]: # Bork?
+                import pdb; pdb.set_trace()
+            assert entry.file_id == file_id_key[0]
+            if entry.kind == 'directory':
+                assert entry.file_id not in directories
+                directories[entry.file_id] = entry
+                entry._children = {}
+                if entry.file_id in parentless_nodes:
+                    children_so_far = parentless_nodes.pop(entry.file_id)
+                    for child in children_so_far:
+                        entry._children[child.name] = child
+                        child.parent_id = entry.file_id # share strings
+            if entry.parent_id is None:
+                pass # Root entry
+            elif entry.parent_id in directories:
+                parent = directories[entry.parent_id]
+                entry.parent_id = parent.file_id # share strings
+                assert entry.name not in parent._children
+                parent._children[entry.name] = entry
+            else: # parent not found yet
+                parentless_nodes.setdefault(entry.parent_id, []).append(entry)
+            self._fileid_to_entry_cache[entry.file_id] = entry
+        # All parentless_nodes should be consumed
+        assert not parentless_nodes
+
     @classmethod
     def from_inventory(klass, chk_store, inventory, maximum_size=0, search_key_name='plain'):
         """Create a CHKInventory from an existing inventory.
@@ -1689,8 +1730,10 @@
         if result is not None:
             return result
         try:
-            return self._bytes_to_entry(
+            result = self._bytes_to_entry(
                 self.id_to_entry.iteritems([(file_id,)]).next()[1])
+            self._fileid_to_entry_cache[file_id] = result
+            return result
         except StopIteration:
             # really we're passing an inventory, not a tree...
             raise errors.NoSuchId(self, file_id)
@@ -1792,6 +1835,55 @@
             yield (file_id, (path_in_source, path_in_target), changed_content,
                 versioned, parent, name, kind, executable)
 
+    def iter_entries(self, from_dir=None):
+        """Return (path, entry) pairs, in order by name."""
+        if from_dir is None:
+            if self.root is None:
+                return
+            from_dir = self.root
+            yield '', self.root
+        elif isinstance(from_dir, basestring):
+            from_dir = self[from_dir]
+
+        if from_dir._children is None:
+            self._extract_all_inventory()
+            from_dir = self[from_dir.file_id]
+            assert from_dir._children is not None
+
+        # unrolling the recursive called changed the time from
+        # 440ms/663ms (inline/total) to 116ms/116ms
+        children = from_dir.children.items()
+        children.sort()
+        children = collections.deque(children)
+        stack = [(u'', children)]
+        while stack:
+            from_dir_relpath, children = stack[-1]
+
+            while children:
+                name, ie = children.popleft()
+
+                # we know that from_dir_relpath never ends in a slash
+                # and 'f' doesn't begin with one, we can do a string op, rather
+                # than the checks of pathjoin(), though this means that all paths
+                # start with a slash
+                path = from_dir_relpath + '/' + name
+
+                yield path[1:], ie
+
+                if ie.kind != 'directory':
+                    continue
+
+                # But do this child first
+                new_children = ie.children.items()
+                new_children.sort()
+                new_children = collections.deque(new_children)
+                stack.append((path, new_children))
+                # Break out of inner loop, so that we start outer loop with child
+                break
+            else:
+                # if we finished all children, pop it off the stack
+                stack.pop()
+
     def __len__(self):
         """Return the number of entries in the inventory."""
         return len(self.id_to_entry)