Rev 3883: Add CHKInventory._extract_all_inventory in http://bzr.arbash-meinel.com/branches/bzr/brisbane/extract_all_inventory
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 16 20:31:03 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/extract_all_inventory
------------------------------------------------------------
revno: 3883
revision-id: john at arbash-meinel.com-20090316202404-llokmiqbpiex0a38
parent: john at arbash-meinel.com-20090313073104-84v7k56qrkondv9o
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: extract_all_inventory
timestamp: Mon 2009-03-16 15:24:04 -0500
message:
Add CHKInventory._extract_all_inventory
CHKInventory is nice in that it can be lazy at extracting every object.
However, when we *want* the whole inventory, it seems to be about 3x faster
to do a single CHKMap.iteritems() call, and process all of the items in
one go.
The code isn't fully complete or tested, but it does make
'bzr ls' time drop from 5s => 2s.
-------------- next part --------------
=== modified file 'bzrlib/inventory.py'
--- a/bzrlib/inventory.py 2009-03-12 23:34:38 +0000
+++ b/bzrlib/inventory.py 2009-03-16 20:24:04 +0000
@@ -1638,6 +1638,47 @@
(result.revision_id, expected_revision_id))
return result
+ def _extract_all_inventory(self):
+ """Create the full inventory structure in a single pass.
+
+ This is used for things like iter_entries_by_dir, where we know we want
+ the entire inventory.
+ """
+ # TODO: We should have a way to know that the full inventory has been extracted
+ # These are nodes that we have read in, but we haven't found their parent yet
+ # This maps from the parent_id to a list of entries
+ parentless_nodes = {}
+ directories = {}
+ for file_id_key, bytes in self.id_to_entry.iteritems():
+ entry = self._bytes_to_entry(bytes)
+ # TODO: Do we want to populate _fileid_to_entry_cache?
+ # Further, do we want to use the nodes that we have
+ # already deserialised from there based on 'file_id'?
+ if entry.file_id != file_id_key[0]: # Bork?
+ import pdb; pdb.set_trace()
+ assert entry.file_id == file_id_key[0]
+ if entry.kind == 'directory':
+ assert entry.file_id not in directories
+ directories[entry.file_id] = entry
+ entry._children = {}
+ if entry.file_id in parentless_nodes:
+ children_so_far = parentless_nodes.pop(entry.file_id)
+ for child in children_so_far:
+ entry._children[child.name] = child
+ child.parent_id = entry.file_id # share strings
+ if entry.parent_id is None:
+ pass # Root entry
+ elif entry.parent_id in directories:
+ parent = directories[entry.parent_id]
+ entry.parent_id = parent.file_id # share strings
+ assert entry.name not in parent._children
+ parent._children[entry.name] = entry
+ else: # parent not found yet
+ parentless_nodes.setdefault(entry.parent_id, []).append(entry)
+ self._fileid_to_entry_cache[entry.file_id] = entry
+ # All parentless_nodes should be consumed
+ assert not parentless_nodes
+
@classmethod
def from_inventory(klass, chk_store, inventory, maximum_size=0, search_key_name='plain'):
"""Create a CHKInventory from an existing inventory.
@@ -1689,8 +1730,10 @@
if result is not None:
return result
try:
- return self._bytes_to_entry(
+ result = self._bytes_to_entry(
self.id_to_entry.iteritems([(file_id,)]).next()[1])
+ self._fileid_to_entry_cache[file_id] = result
+ return result
except StopIteration:
# really we're passing an inventory, not a tree...
raise errors.NoSuchId(self, file_id)
@@ -1792,6 +1835,55 @@
yield (file_id, (path_in_source, path_in_target), changed_content,
versioned, parent, name, kind, executable)
+ def iter_entries(self, from_dir=None):
+ """Return (path, entry) pairs, in order by name."""
+ if from_dir is None:
+ if self.root is None:
+ return
+ from_dir = self.root
+ yield '', self.root
+ elif isinstance(from_dir, basestring):
+ from_dir = self[from_dir]
+
+ if from_dir._children is None:
+ self._extract_all_inventory()
+ from_dir = self[from_dir.file_id]
+ assert from_dir._children is not None
+
+ # unrolling the recursive called changed the time from
+ # 440ms/663ms (inline/total) to 116ms/116ms
+ children = from_dir.children.items()
+ children.sort()
+ children = collections.deque(children)
+ stack = [(u'', children)]
+ while stack:
+ from_dir_relpath, children = stack[-1]
+
+ while children:
+ name, ie = children.popleft()
+
+ # we know that from_dir_relpath never ends in a slash
+ # and 'f' doesn't begin with one, we can do a string op, rather
+ # than the checks of pathjoin(), though this means that all paths
+ # start with a slash
+ path = from_dir_relpath + '/' + name
+
+ yield path[1:], ie
+
+ if ie.kind != 'directory':
+ continue
+
+ # But do this child first
+ new_children = ie.children.items()
+ new_children.sort()
+ new_children = collections.deque(new_children)
+ stack.append((path, new_children))
+ # Break out of inner loop, so that we start outer loop with child
+ break
+ else:
+ # if we finished all children, pop it off the stack
+ stack.pop()
+
def __len__(self):
"""Return the number of entries in the inventory."""
return len(self.id_to_entry)
More information about the bazaar-commits
mailing list