Rev 4560: Adding a parent inventory cache, and then using add_inventory_by_delta. in http://bazaar.launchpad.net/~jameinel/bzr/1.18-2a-bundle-performance
John Arbash Meinel
john at arbash-meinel.com
Mon Jul 27 20:12:02 BST 2009
At http://bazaar.launchpad.net/~jameinel/bzr/1.18-2a-bundle-performance
------------------------------------------------------------
revno: 4560
revision-id: john at arbash-meinel.com-20090727191139-bqni7wj0x4eqacjj
parent: john at arbash-meinel.com-20090727185611-sf3mfgry3e5e9hbw
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.18-2a-bundle-performance
timestamp: Mon 2009-07-27 14:11:39 -0500
message:
Adding a parent inventory cache, and then using add_inventory_by_delta.
We need both the text cache and the inventory cache, because we need the *text* cache
to be able to apply the mpdiff, and we need the *inventory* cache to be able to
compute the deltas.
With both, the time to insert 577 revisions drops to 1m23s.
This is the hit rate:
{'cached parent inv': 459,
'cached parent text': 564,
'missing parent inv': 98,
'missing parent text': 149,
'parents': 713}
so we have the parent text 564/713 times (pretty good), and the parent inventory
459/557 times, also quite good.
-------------- next part --------------
=== modified file 'bzrlib/bundle/serializer/v4.py'
--- a/bzrlib/bundle/serializer/v4.py 2009-07-27 18:56:11 +0000
+++ b/bzrlib/bundle/serializer/v4.py 2009-07-27 19:11:39 +0000
@@ -506,6 +506,11 @@
target = property(_get_target)
+_counters = {}
+def update_counter(name, adjust):
+ """Add `adjust` to the tally kept under `name` in `_counters` (missing names start at 0)."""
+ _counters[name] = _counters.get(name, 0) + adjust
+
+
class RevisionInstaller(object):
"""Installs revisions into a repository"""
@@ -602,16 +607,22 @@
vf_records.append((key, parents, meta['sha1'], d_func(text)))
versionedfile.add_mpdiffs(vf_records)
- def _get_parent_inventory_texts(self, inventory_text_cache, parent_ids):
+ def _get_parent_inventory_texts(self, inventory_text_cache,
+ inventory_cache, parent_ids):
cached_parent_texts = {}
remaining_parent_ids = []
for parent_id in parent_ids:
+ update_counter('parents', 1)
p_text = inventory_text_cache.get(parent_id, None)
if p_text is None:
+ update_counter('missing parent text', 1)
remaining_parent_ids.append(parent_id)
else:
+ update_counter('cached parent text', 1)
cached_parent_texts[parent_id] = p_text
ghosts = ()
+ # TODO: Use inventory_cache to grab inventories we already have in
+ # memory
if remaining_parent_ids:
# first determine what keys are actually present in the local
# inventories object (don't use revisions as they haven't been
@@ -630,6 +641,7 @@
for parent_inv in self._repository.iter_inventories(
present_parent_ids):
p_text = to_string(parent_inv)
+ inventory_cache[parent_inv.revision_id] = parent_inv
cached_parent_texts[parent_inv.revision_id] = p_text
inventory_text_cache[parent_inv.revision_id] = p_text
@@ -648,6 +660,7 @@
# be >5MB). Another possibility is to cache 10-20 inventory texts
# instead
inventory_text_cache = lru_cache.LRUSizeCache(10*1024*1024)
+ inventory_cache = lru_cache.LRUCache(10)
pb = ui.ui_factory.nested_progress_bar()
try:
num_records = len(records)
@@ -659,6 +672,7 @@
# ghosts in the source, as the Bundle serialization
# format doesn't record ghosts.
p_texts = self._get_parent_inventory_texts(inventory_text_cache,
+ inventory_cache,
parent_ids)
# Why does to_lines() take strings as the source, it seems that
# it would have to cast to a list of lines, which we get back
@@ -679,13 +693,26 @@
# add_inventory_by_delta instead of always using
# add_inventory
self._handle_root(target_inv, parent_ids)
+ parent_inv = None
+ if parent_ids:
+ parent_inv = inventory_cache.get(parent_ids[0], None)
try:
- self._repository.add_inventory(revision_id, target_inv,
- parent_ids)
+ if parent_inv is None:
+ update_counter('missing parent inv', 1)
+ self._repository.add_inventory(revision_id, target_inv,
+ parent_ids)
+ else:
+ update_counter('cached parent inv', 1)
+ delta = target_inv._make_delta(parent_inv)
+ self._repository.add_inventory_by_delta(parent_ids[0],
+ delta, revision_id, parent_ids)
except errors.UnsupportedInventoryKind:
raise errors.IncompatibleRevision(repr(self._repository))
+ inventory_cache[revision_id] = target_inv
finally:
pb.finished()
+ import pprint
+ pprint.pprint(_counters)
def _handle_root(self, target_inv, parent_ids):
revision_id = target_inv.revision_id
More information about the bazaar-commits
mailing list