Rev 4560: Adding a parent inventory cache, and then using add_inventory_by_delta. in http://bazaar.launchpad.net/~jameinel/bzr/1.18-2a-bundle-performance

Mon Jul 27 20:12:02 BST 2009

At http://bazaar.launchpad.net/~jameinel/bzr/1.18-2a-bundle-performance

------------------------------------------------------------
revno: 4560
revision-id: john at arbash-meinel.com-20090727191139-bqni7wj0x4eqacjj
parent: john at arbash-meinel.com-20090727185611-sf3mfgry3e5e9hbw
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: 1.18-2a-bundle-performance
timestamp: Mon 2009-07-27 14:11:39 -0500
message:
  Adding a parent inventory cache, and then using add_inventory_by_delta.
  
  We need both the text cache and the inventory cache, because we need the *text* cache
  to be able to apply the mpdiff, and we need the *inventory* cache to be able to
  compute the deltas.
  
  With both, the time to insert 577 revisions drops to 1m23s.
  
  This is the hit rate:
  {'cached parent inv': 459,
   'cached parent text': 564,
   'missing parent inv': 98,
   'missing parent text': 149,
   'parents': 713}
  
  So we have the parent text 564/713 times (79%, pretty good), and the parent
  inventory 459/557 times (82%; 557 = 459 cached + 98 missing), also quite good.
-------------- next part --------------
=== modified file 'bzrlib/bundle/serializer/v4.py'

--- a/bzrlib/bundle/serializer/v4.py	2009-07-27 18:56:11 +0000
+++ b/bzrlib/bundle/serializer/v4.py	2009-07-27 19:11:39 +0000
@@ -506,6 +506,11 @@
     target = property(_get_target)
 
 
+_counters = {}
+def update_counter(name, adjust):
+    _counters[name] = _counters.get(name, 0) + 1
+
+
 class RevisionInstaller(object):
     """Installs revisions into a repository"""
 
@@ -602,16 +607,22 @@
             vf_records.append((key, parents, meta['sha1'], d_func(text)))
         versionedfile.add_mpdiffs(vf_records)
 
-    def _get_parent_inventory_texts(self, inventory_text_cache, parent_ids):
+    def _get_parent_inventory_texts(self, inventory_text_cache,
+                                    inventory_cache, parent_ids):
         cached_parent_texts = {}
         remaining_parent_ids = []
         for parent_id in parent_ids:
+            update_counter('parents', 1)
             p_text = inventory_text_cache.get(parent_id, None)
             if p_text is None:
+                update_counter('missing parent text', 1)
                 remaining_parent_ids.append(parent_id)
             else:
+                update_counter('cached parent text', 1)
                 cached_parent_texts[parent_id] = p_text
         ghosts = ()
+        # TODO: Use inventory_cache to grab inventories we already have in
+        #       memory
         if remaining_parent_ids:
             # first determine what keys are actually present in the local
             # inventories object (don't use revisions as they haven't been
@@ -630,6 +641,7 @@
             for parent_inv in self._repository.iter_inventories(
                                     present_parent_ids):
                 p_text = to_string(parent_inv)
+                inventory_cache[parent_inv.revision_id] = parent_inv
                 cached_parent_texts[parent_inv.revision_id] = p_text
                 inventory_text_cache[parent_inv.revision_id] = p_text
 
@@ -648,6 +660,7 @@
         # be >5MB). Another possibility is to cache 10-20 inventory texts
         # instead
         inventory_text_cache = lru_cache.LRUSizeCache(10*1024*1024)
+        inventory_cache = lru_cache.LRUCache(10)
         pb = ui.ui_factory.nested_progress_bar()
         try:
             num_records = len(records)
@@ -659,6 +672,7 @@
                 #       ghosts in the source, as the Bundle serialization
                 #       format doesn't record ghosts.
                 p_texts = self._get_parent_inventory_texts(inventory_text_cache,
+                                                           inventory_cache,
                                                            parent_ids)
                 # Why does to_lines() take strings as the source, it seems that
                 # it would have to cast to a list of lines, which we get back
@@ -679,13 +693,26 @@
                 #       add_inventory_by_delta instead of always using
                 #       add_inventory
                 self._handle_root(target_inv, parent_ids)
+                parent_inv = None
+                if parent_ids:
+                    parent_inv = inventory_cache.get(parent_ids[0], None)
                 try:
-                    self._repository.add_inventory(revision_id, target_inv,
-                                                   parent_ids)
+                    if parent_inv is None:
+                        update_counter('missing parent inv', 1)
+                        self._repository.add_inventory(revision_id, target_inv,
+                                                       parent_ids)
+                    else:
+                        update_counter('cached parent inv', 1)
+                        delta = target_inv._make_delta(parent_inv)
+                        self._repository.add_inventory_by_delta(parent_ids[0],
+                            delta, revision_id, parent_ids)
                 except errors.UnsupportedInventoryKind:
                     raise errors.IncompatibleRevision(repr(self._repository))
+                inventory_cache[revision_id] = target_inv
         finally:
             pb.finished()
+            import pprint
+            pprint.pprint(_counters)
 
     def _handle_root(self, target_inv, parent_ids):
         revision_id = target_inv.revision_id