Rev 43: Slightly different handling of large texts. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/internal_index

John Arbash Meinel john at arbash-meinel.com
Thu Mar 5 03:47:25 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/internal_index

------------------------------------------------------------
revno: 43
revision-id: john at arbash-meinel.com-20090305034657-t3qbsogy187yul4z
parent: john at arbash-meinel.com-20090305032949-ffww56phklv1vhbj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: internal_index
timestamp: Wed 2009-03-04 21:46:57 -0600
message:
  Slightly different handling of large texts.
  
  We should only use 2 * max_fulltext_len as a minimum block size if we are
  still working on the same file. That allows us to avoid packing all
  subsequent texts into one block after a very large text (such as an ISO
  image).
-------------- next part --------------
=== modified file 'groupcompress.py'
--- a/groupcompress.py	2009-03-05 03:29:49 +0000
+++ b/groupcompress.py	2009-03-05 03:46:57 +0000
@@ -766,6 +766,7 @@
         last_prefix = None
         last_fulltext_len = None
         max_fulltext_len = 0
+        max_fulltext_prefix = None
         for record in stream:
             # Raise an error when a record is missing.
             if record.storage_kind == 'absent':
@@ -778,20 +779,30 @@
                 bytes = adapter.get_bytes(record)
             if len(record.key) > 1:
                 prefix = record.key[0]
+                soft = (prefix == last_prefix)
             else:
                 prefix = None
-            max_fulltext_len = max(max_fulltext_len, len(bytes))
+                soft = False
+            if max_fulltext_len < len(bytes):
+                max_fulltext_len = len(bytes)
+                max_fulltext_prefix = prefix
             (found_sha1, end_point, type,
              length) = self._compressor.compress(record.key,
-                bytes, record.sha1)
+                bytes, record.sha1, soft=soft)
+            # delta_ratio = float(len(bytes)) / length
             # Check if we want to continue to include that text
-            start_new_block = False
-            if end_point > 2 * max_fulltext_len:
-                if end_point > 4*1024*1024:
-                    start_new_block = True
-                elif (prefix is not None and prefix != last_prefix
-                      and end_point > 2*1024*1024):
-                    start_new_block = True
+            if (prefix == max_fulltext_prefix
+                and end_point < 2 * max_fulltext_len):
+                # As long as we are on the same file_id, we will fill at least
+                # 2 * max_fulltext_len
+                start_new_block = False
+            elif end_point > 4*1024*1024:
+                start_new_block = True
+            elif (prefix is not None and prefix != last_prefix
+                  and end_point > 2*1024*1024):
+                start_new_block = True
+            else:
+                start_new_block = False
             # if type == 'fulltext':
             #     # If this is the first text, we don't do anything
             #     if self._compressor.num_keys > 1:

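For readers following along, the revised hunk above can be read as a standalone decision function. The sketch below is my own restatement of the new `start_new_block` logic, not bzrlib API; the function name `decide_new_block` and its parameter list are invented for illustration:

```python
# Hypothetical sketch of the revised start_new_block heuristic from this
# commit. `decide_new_block` is an invented name; in the real code the
# logic runs inline inside the record-insertion loop.

def decide_new_block(prefix, last_prefix, max_fulltext_prefix,
                     max_fulltext_len, end_point):
    """Return True when the compressor should start a new group block.

    prefix              -- file-id prefix of the current record (or None)
    last_prefix         -- prefix of the previous record
    max_fulltext_prefix -- prefix of the largest fulltext seen so far
    max_fulltext_len    -- length of that largest fulltext, in bytes
    end_point           -- current size of the block being built, in bytes
    """
    if (prefix == max_fulltext_prefix
            and end_point < 2 * max_fulltext_len):
        # Still on the file that produced the largest fulltext: let the
        # block grow to at least twice that fulltext's size, even past
        # the usual caps below.
        return False
    elif end_point > 4 * 1024 * 1024:
        # Hard cap: never grow a block past 4 MiB once we have left the
        # largest file (or never were on it).
        return True
    elif (prefix is not None and prefix != last_prefix
          and end_point > 2 * 1024 * 1024):
        # Crossing into a different file once the block exceeds 2 MiB.
        return True
    return False
```

The key change versus revno 42 is the first branch: the 2 * max_fulltext_len allowance now applies only while `prefix` still matches `max_fulltext_prefix`, so one huge text no longer forces every later file's records into the same block.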

