Rev 3910: Have _LazyGroupContentManager pre-extract everything it holds. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/lazy_gc_stream

John Arbash Meinel john at arbash-meinel.com
Thu Mar 19 03:06:08 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/lazy_gc_stream

------------------------------------------------------------
revno: 3910
revision-id: john at arbash-meinel.com-20090319030602-stjxub1g3yhq0u32
parent: john at arbash-meinel.com-20090317203354-77ub807e883l8qx1
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: lazy_gc_stream
timestamp: Wed 2009-03-18 22:06:02 -0500
message:
  Have _LazyGroupContentManager pre-extract everything it holds.
  
  This doesn't seem to speed up decompression time, but it does prevent us from
  re-allocating the _content buffer for every bit we extract.
  Saves ~1s/30s on mysql-525 repack.
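
For a sense of the cost being avoided, here is a minimal standalone sketch
(not bzrlib code; the 2MB/32kB figures simply echo the comment added to
_prepare_for_extract below). Growing a large buffer a little at a time
forces repeated reallocation and copying, whereas extracting everything up
front needs only a single allocation:

def grow_incrementally(total=2 * 1024 * 1024, step=32 * 1024):
    """Extend a content buffer a little per extract (the pattern avoided)."""
    content = b''
    while len(content) < total:
        # Each += may reallocate and copy everything accumulated so far.
        content += b'\0' * step
    return content

def grow_once(total=2 * 1024 * 1024):
    """Materialise the full content up front (roughly what pre-extract buys)."""
    return b'\0' * total
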
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-17 20:33:54 +0000
+++ b/bzrlib/groupcompress.py	2009-03-19 03:06:02 +0000
@@ -227,17 +227,22 @@
             else:
                 # Start a zlib decompressor
                 assert self._compressor_name == 'zlib'
-                self._z_content_decompressor = zlib.decompressobj()
-                # Seed the decompressor with the uncompressed bytes, so that
-                # the rest of the code is simplified
-                self._content = self._z_content_decompressor.decompress(
-                    self._z_content, _ZLIB_DECOMP_WINDOW)
+                if num_bytes is None:
+                    self._content = zlib.decompress(self._z_content)
+                else:
+                    self._z_content_decompressor = zlib.decompressobj()
+                    # Seed the decompressor with the uncompressed bytes, so
+                    # that the rest of the code is simplified
+                    self._content = self._z_content_decompressor.decompress(
+                        self._z_content, num_bytes + _ZLIB_DECOMP_WINDOW)
                 # Any bytes remaining to be decompressed will be in the
                 # decompressors 'unconsumed_tail'
-            self._z_content = None
         # Do we have enough bytes already?
         if num_bytes is not None and len(self._content) >= num_bytes:
             return
+        if num_bytes is None and self._z_content_decompressor is None:
+            # We must have already decompressed everything
+            return
         # If we got this far, and don't have a decompressor, something is wrong
         assert self._z_content_decompressor is not None
         remaining_decomp = self._z_content_decompressor.unconsumed_tail
@@ -508,6 +513,7 @@
             else:
                 return ''
         if storage_kind in ('fulltext', 'chunked'):
+            self._manager._prepare_for_extract()
             block = self._manager._block
             _, bytes = block.extract(self.key, self._start, self._end)
             if storage_kind == 'fulltext':
@@ -525,6 +531,7 @@
         self._block = block
         # We need to preserve the ordering
         self._factories = []
+        self._last_byte = 0
 
     def add_factory(self, key, parents, start, end):
         if not self._factories:
@@ -534,6 +541,7 @@
         # Note that this creates a reference cycle....
         factory = _LazyGroupCompressFactory(key, parents, self,
             start, end, first=first)
+        self._last_byte = max(end, self._last_byte)
         self._factories.append(factory)
 
     def get_record_stream(self):
@@ -570,6 +578,7 @@
             factory._start = cur_endpoint
             factory._end = end_point
             cur_endpoint = end_point
+        self._last_byte = cur_endpoint
         new_block = compressor.flush()
         # TODO: Should we check that new_block really *is* smaller than the old
         #       block? It seems hard to come up with a method that it would
@@ -581,6 +590,14 @@
                      ' %d bytes => %d bytes', delta, old_length,
                      self._block._content_length)
 
+    def _prepare_for_extract(self):
+        """A _LazyGroupCompressFactory is about to extract to fulltext."""
+        # We expect that if one child is going to fulltext, all will be. This
+        # helps prevent all of them from extracting a small amount at a time.
+        # Which in itself isn't terribly expensive, but resizing 2MB 32kB at a
+        # time (self._block._content) is a little expensive.
+        self._block._ensure_content(self._last_byte)
+
     def _check_rebuild_block(self):
         """Check to see if our block should be repacked."""
         total_bytes_used = 0

=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py	2009-03-17 19:38:14 +0000
+++ b/bzrlib/tests/test_groupcompress.py	2009-03-19 03:06:02 +0000
@@ -214,7 +214,7 @@
         self.assertEqual('', block._z_content)
         block._ensure_content()
         self.assertEqual('', block._content)
-        self.assertIs(None, block._z_content)
+        self.assertEqual('', block._z_content)
         block._ensure_content() # Ensure content is safe to call 2x
 
     def test_from_bytes_with_labels(self):
@@ -264,7 +264,7 @@
         self.assertEqual(z_content, block._z_content)
         self.assertIs(None, block._content)
         block._ensure_content()
-        self.assertIs(None, block._z_content)
+        self.assertEqual(z_content, block._z_content)
         self.assertEqual(content, block._content)
 
     def test_from_old_bytes(self):
@@ -282,7 +282,7 @@
             z_bytes)
         self.assertIsInstance(block, groupcompress.GroupCompressBlock)
         block._ensure_content()
-        self.assertIs(None, block._z_content)
+        self.assertEqual(z_content, block._z_content)
         self.assertEqual(content, block._content)
 
     def test_add_entry(self):
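
As background for the _ensure_content change in the patch: when num_bytes is
None the block is now decompressed with a single zlib.decompress() call,
while the partial path still uses a zlib.decompressobj(), whose max_length
argument caps the output and leaves any not-yet-needed compressed input in
unconsumed_tail. A rough standalone sketch of the two paths (not bzrlib
code; the data and sizes are made up):

import zlib

compressed = zlib.compress(b'x' * 200000)

# Whole-block path (num_bytes is None): one call for the full content.
everything = zlib.decompress(compressed)

# Partial path: cap the output; compressed input that was not needed yet
# stays available in 'unconsumed_tail' for a later call.
decomp = zlib.decompressobj()
first_chunk = decomp.decompress(compressed, 64 * 1024)
leftover = decomp.unconsumed_tail

assert everything.startswith(first_chunk)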


