Rev 3878: Some hacky code to check for: in http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

John Arbash Meinel john at arbash-meinel.com
Thu Mar 19 22:35:56 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

------------------------------------------------------------
revno: 3878
revision-id: john at arbash-meinel.com-20090319223546-odxel2nktm700d7e
parent: john at arbash-meinel.com-20090319194720-4esxj7gnrmfaykww
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: hack3
timestamp: Thu 2009-03-19 17:35:46 -0500
message:
  Some hacky code to check for:
  a) empty records, such as directories and symlinks. This might decrease index
  pressure a tiny bit, as they can be recorded as (0, 0, 0, 0).
  b) duplicate entries in the same gc block, rather than inserting another copy
  of the same delta (or a delta against an identical text).
  
  We seem to get quite a few hits for both (a) and (b), so it is probably worth
  looking more closely at them.
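
As a rough illustration of the two checks (a minimal standalone sketch, not the
bzrlib GroupCompressor API; DedupingBlockBuilder and add() are made-up names for
the example):

# Sketch of per-block empty-record and duplicate-text handling.
from hashlib import sha1


class DedupingBlockBuilder(object):
    """Accumulate texts for one block, skipping empty and duplicate texts."""

    def __init__(self, check_for_dupes=False):
        self._check_for_dupes = check_for_dupes
        self._entries_by_sha1 = {}  # hex digest -> (start, length)
        self._chunks = []           # raw text chunks in block order
        self._endpoint = 0          # total bytes accumulated so far
        self.deduped_entries = 0

    def add(self, key, text):
        """Record text for key, returning its (start, length) in the block.

        The real code also records key in the block's entry list; that is
        omitted here to keep the sketch small.
        """
        # (a) empty records, e.g. directories and symlinks, take no space.
        if not text:
            return (0, 0)
        digest = sha1(text).hexdigest()
        # (b) an identical text already in this block is simply reused, so
        # the new key points at the bytes stored the first time around.
        if self._check_for_dupes and digest in self._entries_by_sha1:
            self.deduped_entries += 1
            return self._entries_by_sha1[digest]
        start = self._endpoint
        self._chunks.append(text)
        self._endpoint += len(text)
        entry = (start, len(text))
        if self._check_for_dupes:
            self._entries_by_sha1[digest] = entry
        return entry

Fed a batch of fulltexts, deduped_entries gives a quick count of how many
duplicates check (b) would catch within a single block.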
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-19 18:38:49 +0000
+++ b/bzrlib/groupcompress.py	2009-03-19 22:35:46 +0000
@@ -339,6 +339,8 @@
         :param sha1: TODO (should we validate only when sha1 is supplied?)
         :return: The bytes for the content
         """
+        if start == end == 0:
+            return ''
         # Make sure we have enough bytes for this record
         # TODO: if we didn't want to track the end of this entry, we could
         #       _ensure_content(start+enough_bytes_for_type_and_length), and
@@ -747,12 +749,13 @@
        left side.
     """
 
-    def __init__(self):
+    def __init__(self, check_for_dupes=False):
         """Create a GroupCompressor."""
         # Consider seeding the lines with some sort of GC Start flag, or
         # putting it as part of the output stream, rather than in the
         # compressed bytes.
         self.lines = []
+        self._check_for_dupes = check_for_dupes
         self.endpoint = 0
         self.input_bytes = 0
         self.num_keys = 0
@@ -760,6 +763,8 @@
         self._last = None
         self._delta_index = _groupcompress_pyx.DeltaIndex()
         self._block = GroupCompressBlock()
+        self._entries_by_sha1 = {}
+        self._deduped_entries = 0
 
     def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
         """Compress lines with label key.
@@ -780,10 +785,14 @@
             the group output so far.
         :seealso VersionedFiles.add_lines:
         """
+        if not bytes: # empty, like a dir entry, etc
+            return None, 0, 'fulltext', 0
         sha1 = None
         # we assume someone knew what they were doing when they passed it in
         if expected_sha is not None:
             sha1 = expected_sha
+        if self._check_for_dupes and sha1 is None:
+            sha1 = osutils.sha_string(bytes)
         if nostore_sha is not None:
             if sha1 is None:
                 sha1 = osutils.sha_string(bytes)
@@ -807,6 +816,13 @@
             raise AssertionError('_source_offset != endpoint'
                 ' somehow the DeltaIndex got out of sync with'
                 ' the output lines')
+        if self._check_for_dupes and sha1 in self._entries_by_sha1:
+            self._deduped_entries += 1
+            (type, start, length) = self._entries_by_sha1[sha1]
+            self._block.add_entry(key, type=type,
+                                  sha1=sha1, start=start,
+                                  length=length)
+            return sha1, self.endpoint, 'sha1_dupe', 0
         max_delta_size = len(bytes) / 2
         delta = self._delta_index.make_delta(bytes, max_delta_size)
         if (delta is None):
@@ -828,6 +844,8 @@
                 self._delta_index.add_delta_source(delta, len_mini_header)
         self._block.add_entry(key, type=type, sha1=sha1,
                               start=self.endpoint, length=length)
+        if self._check_for_dupes:
+            self._entries_by_sha1[sha1] = (type, self.endpoint, length)
         delta_start = (self.endpoint, len(self.lines))
         self.num_keys += 1
         self.output_chunks(new_chunks)
@@ -957,7 +975,8 @@
 class GroupCompressVersionedFiles(VersionedFiles):
     """A group-compress based VersionedFiles implementation."""
 
-    def __init__(self, index, access, delta=True):
+    def __init__(self, index, access, delta=True,
+                 check_for_dupes=False):
         """Create a GroupCompressVersionedFiles object.
 
         :param index: The index object storing access and graph data.
@@ -968,6 +987,7 @@
         self._access = access
         self._delta = delta
         self._unadded_refs = {}
+        self._check_for_dupes = check_for_dupes
         self._group_cache = LRUSizeCache(max_size=50*1024*1024)
         self._fallback_vfs = []
 
@@ -1401,11 +1421,17 @@
                 return adapter
         # This will go up to fulltexts for gc to gc fetching, which isn't
         # ideal.
-        self._compressor = GroupCompressor()
+        self._compressor = GroupCompressor(self._check_for_dupes)
         self._unadded_refs = {}
         keys_to_add = []
         basis_end = 0
+        total_deduped = [0]
         def flush():
+            if self._compressor._deduped_entries > 0:
+                total_deduped[0] += self._compressor._deduped_entries
+                trace.note('Dedupped %d out of %d entries',
+                           self._compressor._deduped_entries,
+                           len(self._compressor._block._entries))
             bytes = self._compressor.flush().to_bytes()
             index, start, length = self._access.add_raw_records(
                 [(None, len(bytes))], bytes)[0]
@@ -1415,7 +1441,7 @@
             self._index.add_records(nodes, random_id=random_id)
             self._unadded_refs = {}
             del keys_to_add[:]
-            self._compressor = GroupCompressor()
+            self._compressor = GroupCompressor(self._check_for_dupes)
 
         last_prefix = None
         last_fulltext_len = None
@@ -1525,12 +1551,17 @@
                 key = record.key
             self._unadded_refs[key] = record.parents
             yield found_sha1
-            keys_to_add.append((key, '%d %d' % (basis_end, end_point),
-                (record.parents,)))
+            if length == 0:
+                keys_to_add.append((key, '0 0', (record.parents,)))
+            else:
+                keys_to_add.append((key, '%d %d' % (basis_end, end_point),
+                    (record.parents,)))
             basis_end = end_point
         if len(keys_to_add):
             flush()
         self._compressor = None
+        if total_deduped[0] > 0:
+            trace.note('Total deduped = %d\n', total_deduped[0])
 
     def iter_lines_added_or_present_in_keys(self, keys, pb=None):
         """Iterate over the lines in the versioned files from keys.

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-03-19 19:47:20 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-03-19 22:35:46 +0000
@@ -358,13 +358,17 @@
                 indices.append(sub_index)
             index = _mod_index.CombinedGraphIndex(indices)
             add_callback = None
+        check_for_dupes = False
+        if index_name == 'text_index':
+            check_for_dupes = True
         vf = GroupCompressVersionedFiles(
             _GCGraphIndex(index,
                           add_callback=add_callback,
                           parents=parents,
                           is_locked=self._pack_collection.repo.is_locked),
             access=access,
-            delta=delta)
+            delta=delta,
+            check_for_dupes=check_for_dupes)
         return vf
 
     def _build_vfs(self, index_name, parents, delta):

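For reference, a reader honouring the (0, 0) convention from the first hunk
could short-circuit empty records along these lines (extract() is an
illustrative helper, not the bzrlib API):

def extract(block_bytes, start, end):
    """Return the stored bytes for one record of an uncompressed block."""
    if start == end == 0:
        # Empty records (directories, symlinks) were never written out,
        # so there is nothing to slice.
        return ''
    return block_bytes[start:end]
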

