Rev 32: Play around with doing deltas and then packing them after the fact. in http://bzr.arbash-meinel.com/plugins/xdelta_test

John Arbash Meinel john@arbash-meinel.com
Fri Jun 22 16:49:05 BST 2007


At http://bzr.arbash-meinel.com/plugins/xdelta_test

------------------------------------------------------------
revno: 32
revision-id: john@arbash-meinel.com-20070622154902-5rvjc20mrrt373zg
parent: john@arbash-meinel.com-20070621142809-r7oxr1k8og1vln4j
committer: John Arbash Meinel <john@arbash-meinel.com>
branch nick: xdelta_test
timestamp: Fri 2007-06-22 10:49:02 -0500
message:
  Play around with doing deltas and then packing them after the fact.
modified:
  bench_important_algorithms/bench_all_texts.py bench_all_texts.py-20070528175343-sl9y3m4xrylwr5n9-3
  compression_algorithms.py      compression_algorith-20070528163119-4mg41krgj6fz5xen-2
=== modified file 'bench_important_algorithms/bench_all_texts.py'
--- a/bench_important_algorithms/bench_all_texts.py	2007-05-29 14:31:26 +0000
+++ b/bench_important_algorithms/bench_all_texts.py	2007-06-22 15:49:02 +0000
@@ -14,6 +14,7 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
+import bz2
 import sys
 
 plugin_name = __name__.rsplit('.', 2)[0]
@@ -102,6 +103,27 @@
                             self._size_to_bytes(total_comp_len-fulltext_len),
                             ))
 
+    def double_compress_and_report(self):
+        texts = self.get_all_texts()
+        ancestry_graph = self.get_ancestry_graph()
+        version_to_offsets = self.get_version_to_offsets()
+        base_versions = self.base_algorithm(ancestry_graph, version_to_offsets,
+                                            texts)
+
+        total_len = sum(len(text) for text in texts)
+        compressed = self.time(self._compress_all_texts_in_mem,
+                               texts, version_to_offsets, base_versions)
+        final = bz2.compress(''.join(compressed))
+        total_comp_len = len(final)
+        fulltexts = [v for v, b in base_versions.iteritems() if b is None]
+        sys.stdout.write('\n    %9s => %9s, %5.0f:1 (%d full, %d delta)\t'
+                         % (self._size_to_bytes(total_len),
+                            self._size_to_bytes(total_comp_len),
+                            round(float(total_len) / total_comp_len),
+                            len(fulltexts),
+                            len(compressed) - len(fulltexts),
+                            ))
+
     def decompress_and_report(self, texts, base_offsets):
         texts = self.get_all_texts()
         ancestry_graph = self.get_ancestry_graph()
@@ -139,3 +161,7 @@
         texts, base_offsets = self.get_lh_parent_base()
         self.decompress_and_report(texts, base_offsets)
         texts = self.get_all_texts()
+
+    # def test_compress_twice(self):
+    #     """Compress the texts linearly, and then run bzip2 over the lot."""
+    #     self.double_compress_and_report()

=== modified file 'compression_algorithms.py'
--- a/compression_algorithms.py	2007-05-29 17:51:11 +0000
+++ b/compression_algorithms.py	2007-06-22 15:49:02 +0000
@@ -360,6 +360,9 @@
 
 
 important_algorithms = [CompressionAlgorithm.find_algorithm(x) for x in
-                         ['xd3-default', 'xd3-djw', 'xd3-NOCOMPRESS+zlib',
+                         ['xd3-default', 'xd3-djw',
+                          #'xd3-NOCOMPRESS',
+                          'xd3-NOCOMPRESS+zlib',
+                          #'bdiff-one',
                           'bdiff-one+zlib', 'bdiff-multi+zlib',
                          ]]

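For readers following along, here is a minimal, self-contained sketch of the
"delta first, pack afterwards" idea that the new double_compress_and_report()
method times. The naive_delta() encoder and the sample texts below are
hypothetical stand-ins (the plugin's real encoders are the xdelta3/bdiff
algorithms listed in compression_algorithms.py), and the sketch is written for
modern Python 3 rather than the Python 2 idiom of the diff above:

    import bz2
    import difflib

    def naive_delta(base, target):
        """Encode target as copy/insert ops against base (stand-in for xdelta)."""
        ops = []
        matcher = difflib.SequenceMatcher(None, base, target)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == 'equal':
                # Copy a run of bytes out of the base text.
                ops.append(b'C %d %d\n' % (i1, i2 - i1))
            elif j2 > j1:
                # 'replace' or 'insert': emit literal bytes from the target.
                chunk = target[j1:j2]
                ops.append(b'I %d\n' % len(chunk) + chunk)
            # 'delete' contributes nothing to reconstructing the target.
        return b''.join(ops)

    def double_compress(texts, base_versions):
        """Delta each text against its base, then bz2 the concatenation.

        base_versions maps a text's index to its base's index, or None for
        a fulltext, mirroring the base_versions dict in the diff above.
        """
        compressed = []
        for idx, text in enumerate(texts):
            base = base_versions.get(idx)
            if base is None:
                compressed.append(text)  # store a fulltext
            else:
                compressed.append(naive_delta(texts[base], text))
        # The second pass: one bz2 stream over all the deltas/fulltexts.
        return bz2.compress(b''.join(compressed))

    texts = [b'line one\nline two\n', b'line one\nline two\nline three\n']
    packed = double_compress(texts, {0: None, 1: 0})
    print(len(packed), 'bytes after delta + bz2')

The point of the second pass is that bz2 can exploit redundancy *between* the
individual deltas and fulltexts, which per-text compression (the existing
xd3+zlib and bdiff+zlib variants) never gets to see.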

