Rev 3903: Implement the ability to pack recent chk pages and recently referenced texts in http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack
John Arbash Meinel
john at arbash-meinel.com
Tue Mar 24 20:19:27 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack
------------------------------------------------------------
revno: 3903
revision-id: john at arbash-meinel.com-20090324201908-cs6j7m91avmr5iyu
parent: john at arbash-meinel.com-20090324165313-3sokh8vdzypsm7cj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: split_pack
timestamp: Tue 2009-03-24 15:19:08 -0500
message:
Implement the ability to pack recent chk pages and recently referenced texts
in a separate set of groups.
Packing the CHK pages this way shows a big improvement for commands like
'ls -r-1'; the text changes seem to cause about 10% bloat in the repo
(probably from the extra fulltexts in the new data).
Needs some timing tests to see whether it benefits 'bzr co' times as much as
the other work benefits 'ls' times.
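For illustration, here is a minimal standalone sketch of the splitting idea
(plain Python, not bzrlib API; get_stream and root_keys are hypothetical
stand-ins for the packer's _get_referenced_stream and _chk_id_roots):

# A sketch of the horizon split, assuming root_keys is ordered
# newest-first, as the packer's _chk_id_roots list is.
_RECENT_HORIZON = 1000

def stream_recent_then_old(root_keys, get_stream):
    recent = root_keys[:_RECENT_HORIZON]
    old = root_keys[_RECENT_HORIZON:]
    # Yielding the recent roots first writes their pages into their own
    # compression groups, separate from the 'really old' data.
    for stream in get_stream(recent):
        yield stream
    for stream in get_stream(old):
        yield stream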
-------------- next part --------------
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py 2009-03-23 20:04:42 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py 2009-03-24 20:19:08 +0000
@@ -168,6 +168,10 @@
 class GCCHKPacker(Packer):
     """This class understands what it takes to collect a GCCHK repo."""
 
+    # This is a semi-arbitrary horizon used to pack new stuff together,
+    # separate from 'really old' stuff.
+    _RECENT_HORIZON = 1000
+
     def __init__(self, pack_collection, packs, suffix, revision_ids=None,
                  reload_func=None):
         super(GCCHKPacker, self).__init__(pack_collection, packs, suffix,
@@ -179,6 +183,7 @@
         self._gather_text_refs = False
         self._chk_id_roots = []
         self._chk_p_id_roots = []
+        self._recent_text_refs = None
         self._text_refs = None
         # set by .pack() if self.revision_ids is not None
         self.revision_keys = None
@@ -241,11 +246,10 @@
         total_keys = len(keys)
         remaining_keys = set(keys)
         counter = [0]
-        if self._gather_text_refs:
-            # Just to get _bytes_to_entry, so we don't care about the
-            # search_key_name
-            inv = inventory.CHKInventory(None)
-            self._text_refs = set()
+        # Just to get _bytes_to_entry, so we don't care about the
+        # search_key_name
+        inv = inventory.CHKInventory(None)
+        self._text_refs = set()
         def _get_referenced_stream(root_keys, parse_leaf_nodes=False):
             cur_keys = root_keys
             while cur_keys:
@@ -303,10 +307,23 @@
                 cur_keys = []
                 for prefix in sorted(keys_by_search_prefix):
                     cur_keys.extend(keys_by_search_prefix.pop(prefix))
-        for stream in _get_referenced_stream(self._chk_id_roots,
+        recent_roots = self._chk_id_roots[:self._RECENT_HORIZON]
+        old_roots = self._chk_id_roots[self._RECENT_HORIZON:]
+        del self._chk_id_roots
+        # Grab the text keys that are referenced by recent commits, so we can
+        # prioritize those as well
+        for stream in _get_referenced_stream(recent_roots, True):
+            yield stream
+        self._recent_text_refs = self._text_refs
+        if self._gather_text_refs:
+            self._text_refs = set(self._text_refs)
+        else:
+            self._text_refs = None
+        del recent_roots
+        for stream in _get_referenced_stream(old_roots,
                                              self._gather_text_refs):
             yield stream
-        del self._chk_id_roots
+        del old_roots
         # while it isn't really possible for chk_id_roots to not be in the
         # local group of packs, it is possible that the tree shape has not
         # changed recently, so we need to filter _chk_p_id_roots by the
@@ -314,8 +331,14 @@
         chk_p_id_roots = [key for key in self._chk_p_id_roots
                           if key in remaining_keys]
         del self._chk_p_id_roots
-        for stream in _get_referenced_stream(chk_p_id_roots, False):
-            yield stream
+        recent_pid_roots = chk_p_id_roots[:self._RECENT_HORIZON]
+        old_pid_roots = chk_p_id_roots[self._RECENT_HORIZON:]
+        for stream in _get_referenced_stream(recent_pid_roots, False):
+            yield stream
+        del recent_pid_roots
+        for stream in _get_referenced_stream(old_pid_roots, False):
+            yield stream
+        del old_pid_roots
         if remaining_keys:
             trace.mutter('There were %d keys in the chk index, %d of which'
                          ' were not referenced', total_keys,
@@ -420,6 +443,13 @@
         # rev just before the ones you are copying, otherwise the filter
         # is grabbing too many keys...
         text_keys = source_vf.keys()
+        if self._recent_text_refs is not None:
+            trace.mutter('Packing %d recent text refs',
+                         len(self._recent_text_refs))
+            self._copy_stream(source_vf, target_vf, self._recent_text_refs,
+                              'text', self._get_progress_stream, 4)
+            text_keys.difference_update(self._recent_text_refs)
+            self._recent_text_refs = None
         self._copy_stream(source_vf, target_vf, text_keys,
                           'text', self._get_progress_stream, 4)
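
The final hunk applies the same ordering to file texts: the recently
referenced texts are copied in their own pass before the bulk of the keys.
A rough sketch of that two-pass copy (hypothetical names; copy_stream stands
in for _copy_stream with source_vf, target_vf and the progress arguments
already bound):

# Sketch of the two-pass text copy; copy_texts and copy_stream are
# hypothetical stand-ins, not the bzrlib method signatures.
def copy_texts(text_keys, recent_text_refs, copy_stream):
    text_keys = set(text_keys)
    if recent_text_refs is not None:
        # Recently referenced texts go first, into their own groups;
        # remove them so they are not copied twice.
        copy_stream(recent_text_refs)
        text_keys.difference_update(recent_text_refs)
    copy_stream(text_keys)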