Rev 3903: Implement the ability to pack recent chk pages and recently referenced texts in http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack
John Arbash Meinel
john at arbash-meinel.com
Tue Mar 24 20:19:27 GMT 2009
At http://bzr.arbash-meinel.com/branches/bzr/brisbane/split_pack
------------------------------------------------------------
revno: 3903
revision-id: john at arbash-meinel.com-20090324201908-cs6j7m91avmr5iyu
parent: john at arbash-meinel.com-20090324165313-3sokh8vdzypsm7cj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: split_pack
timestamp: Tue 2009-03-24 15:19:08 -0500
message:
Implement the ability to pack recent chk pages and recently referenced texts
in a separate set of groups.
Packing the CHK pages this way shows a big improvement for commands like
'ls -r-1'; the text changes seem to cause about 10% bloat in the repo
(probably from the extra fulltexts in the new data).
Needs some timing tests to see whether it benefits 'bzr co' times as much as
the other work benefits 'ls' times.
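For illustration, here is a minimal standalone sketch of the splitting idea
(plain Python, not bzrlib API; get_stream and root_keys are hypothetical
stand-ins for the packer's _get_referenced_stream and _chk_id_roots):

# A sketch of the horizon split, assuming root_keys is ordered
# newest-first, as the packer's _chk_id_roots list is.
_RECENT_HORIZON = 1000

def stream_recent_then_old(root_keys, get_stream):
    recent = root_keys[:_RECENT_HORIZON]
    old = root_keys[_RECENT_HORIZON:]
    # Yielding the recent roots first writes their pages into their own
    # compression groups, separate from the 'really old' data.
    for stream in get_stream(recent):
        yield stream
    for stream in get_stream(old):
        yield stream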
-------------- next part --------------
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py 2009-03-23 20:04:42 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py 2009-03-24 20:19:08 +0000
@@ -168,6 +168,10 @@
 class GCCHKPacker(Packer):
     """This class understands what it takes to collect a GCCHK repo."""
 
+    # This is a semi-arbitrary horizon used to pack new stuff together,
+    # separate from 'really old' stuff.
+    _RECENT_HORIZON = 1000
+
     def __init__(self, pack_collection, packs, suffix, revision_ids=None,
                  reload_func=None):
         super(GCCHKPacker, self).__init__(pack_collection, packs, suffix,
@@ -179,6 +183,7 @@
         self._gather_text_refs = False
         self._chk_id_roots = []
         self._chk_p_id_roots = []
+        self._recent_text_refs = None
         self._text_refs = None
         # set by .pack() if self.revision_ids is not None
         self.revision_keys = None
@@ -241,11 +246,10 @@
         total_keys = len(keys)
         remaining_keys = set(keys)
         counter = [0]
-        if self._gather_text_refs:
-            # Just to get _bytes_to_entry, so we don't care about the
-            # search_key_name
-            inv = inventory.CHKInventory(None)
-            self._text_refs = set()
+        # Just to get _bytes_to_entry, so we don't care about the
+        # search_key_name
+        inv = inventory.CHKInventory(None)
+        self._text_refs = set()
         def _get_referenced_stream(root_keys, parse_leaf_nodes=False):
             cur_keys = root_keys
             while cur_keys:
@@ -303,10 +307,23 @@
                 cur_keys = []
                 for prefix in sorted(keys_by_search_prefix):
                     cur_keys.extend(keys_by_search_prefix.pop(prefix))
-        for stream in _get_referenced_stream(self._chk_id_roots,
+        recent_roots = self._chk_id_roots[:self._RECENT_HORIZON]
+        old_roots = self._chk_id_roots[self._RECENT_HORIZON:]
+        del self._chk_id_roots
+        # Grab the text keys that are referenced by recent commits, so we can
+        # prioritize those as well
+        for stream in _get_referenced_stream(recent_roots, True):
+            yield stream
+        self._recent_text_refs = self._text_refs
+        if self._gather_text_refs:
+            self._text_refs = set(self._text_refs)
+        else:
+            self._text_refs = None
+        del recent_roots
+        for stream in _get_referenced_stream(old_roots,
                                              self._gather_text_refs):
             yield stream
-        del self._chk_id_roots
+        del old_roots
         # while it isn't really possible for chk_id_roots to not be in the
         # local group of packs, it is possible that the tree shape has not
         # changed recently, so we need to filter _chk_p_id_roots by the
@@ -314,8 +331,14 @@
         chk_p_id_roots = [key for key in self._chk_p_id_roots
                           if key in remaining_keys]
         del self._chk_p_id_roots
-        for stream in _get_referenced_stream(chk_p_id_roots, False):
-            yield stream
+        recent_pid_roots = chk_p_id_roots[:self._RECENT_HORIZON]
+        old_pid_roots = chk_p_id_roots[self._RECENT_HORIZON:]
+        for stream in _get_referenced_stream(recent_pid_roots, False):
+            yield stream
+        del recent_pid_roots
+        for stream in _get_referenced_stream(old_pid_roots, False):
+            yield stream
+        del old_pid_roots
         if remaining_keys:
             trace.mutter('There were %d keys in the chk index, %d of which'
                          ' were not referenced', total_keys,
@@ -420,6 +443,13 @@
         # rev just before the ones you are copying, otherwise the filter
         # is grabbing too many keys...
         text_keys = source_vf.keys()
+        if self._recent_text_refs is not None:
+            trace.mutter('Packing %d recent text refs',
+                         len(self._recent_text_refs))
+            self._copy_stream(source_vf, target_vf, self._recent_text_refs,
+                              'text', self._get_progress_stream, 4)
+            text_keys.difference_update(self._recent_text_refs)
+            self._recent_text_refs = None
         self._copy_stream(source_vf, target_vf, text_keys,
                           'text', self._get_progress_stream, 4)
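
The final hunk applies the same ordering to file texts: the recently
referenced texts are copied in their own pass before the bulk of the keys.
A rough sketch of that two-pass copy (hypothetical names; copy_stream stands
in for _copy_stream with source_vf, target_vf and the progress arguments
already bound):

# Sketch of the two-pass text copy; copy_texts and copy_stream are
# hypothetical stand-ins, not the bzrlib method signatures.
def copy_texts(text_keys, recent_text_refs, copy_stream):
    text_keys = set(text_keys)
    if recent_text_refs is not None:
        # Recently referenced texts go first, into their own groups;
        # remove them so they are not copied twice.
        copy_stream(recent_text_refs)
        text_keys.difference_update(recent_text_refs)
    copy_stream(text_keys)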