Rev 11: Add extraction of just-compressed texts to support converting from knits. in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
Robert Collins
robertc at robertcollins.net
Tue Jul 15 15:22:22 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk
------------------------------------------------------------
revno: 11
revision-id: robertc at robertcollins.net-20080715142216-dghu4jkthem5vqb2
parent: robertc at robertcollins.net-20080715125820-dbzpbzpv7vy92tg0
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-07-16 00:22:16 +1000
message:
  Add extraction of just-compressed texts to support converting from knits.
modified:
  groupcompress.py               groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  tests/test_groupcompress.py    test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
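
This revision gives GroupCompressor an extract() method that reads back a text still buffered in the current, unflushed group, and makes the versioned-files layer track such texts in _unadded_refs so that get_parent_map and get_record_stream can serve them. Knit-to-groupcompress conversion needs this because knit fetching reconstructs texts locally right after inserting them. The intended round trip, as a minimal sketch using only the API visible in this diff (the True argument is the delta flag; keys and texts are illustrative):

    import groupcompress

    # Compress two texts into the in-memory group, then read the first
    # one straight back out without flushing anything to a pack.
    compressor = groupcompress.GroupCompressor(True)
    sha1, _ = compressor.compress(('key1',), ['hello\n', 'world\n'], None)
    sha2, _ = compressor.compress(('key2',), ['hello\n', 'there\n'], None)
    lines, extracted_sha1 = compressor.extract(('key1',))
    assert (lines, extracted_sha1) == (['hello\n', 'world\n'], sha1)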
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2008-07-15 12:45:49 +0000
+++ b/groupcompress.py 2008-07-15 14:22:16 +0000
@@ -119,6 +119,7 @@
         self.input_bytes = 0
         # line: set(locations it appears at), set(N+1 for N in locations)
         self.line_locations = {}
+        self.labels_deltas = {}
 
     def compress(self, key, lines, expected_sha):
         """Compress lines with label key.
@@ -194,11 +195,25 @@
             new_lines.append("i,%d\n" % new_len)
             new_lines.extend(lines[new_start:new_start+new_len])
+        delta_start = (self.endpoint, len(self.lines))
         self.output_lines(new_lines)
         trim_encoding_newline(lines)
         self.input_bytes += sum(map(len, lines))
+        delta_end = (self.endpoint, len(self.lines))
+        self.labels_deltas[key] = (delta_start, delta_end)
         return sha1, self.endpoint
 
+    def extract(self, key):
+        """Extract a key previously added to the compressor."""
+        delta_details = self.labels_deltas[key]
+        delta_lines = self.lines[delta_details[0][1]:delta_details[1][1]]
+        label, sha1, delta = parse(delta_lines)
+        if label != key:
+            raise AssertionError("wrong key: %r, wanted %r" % (label, key))
+        lines = apply_delta(self.lines, delta)
+        sha1 = sha_strings(lines)
+        return lines, sha1
+
     def output_lines(self, new_lines):
         self.endpoint += sum(map(len, new_lines))
         offset = len(self.lines)
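
Two details worth noting in the hunk above: compress() snapshots (self.endpoint, len(self.lines)) before and after emitting the delta, so labels_deltas maps each key to a pair of (byte offset, line index) positions; extract() uses only the line indices to slice the labelled delta back out of self.lines, re-parses it, applies it against the output stream accumulated so far (copy offsets stay valid because the stream is append-only), and recomputes the sha rather than trusting the parsed header. A self-contained toy of that bookkeeping, storing plain fulltexts instead of the plugin's labelled deltas (all names here are illustrative):

    import hashlib

    class SketchCompressor(object):
        """Toy model of the labels_deltas bookkeeping, not the real format."""

        def __init__(self):
            self.lines = []          # the accumulated output stream
            self.labels_deltas = {}  # key -> (start_line, end_line)

        def compress(self, key, lines):
            start = len(self.lines)           # like delta_start's line index
            self.lines.append('label: %s\n' % '\x00'.join(key))
            self.lines.extend(lines)          # the real code appends a delta
            self.labels_deltas[key] = (start, len(self.lines))
            return hashlib.sha1(''.join(lines)).hexdigest()

        def extract(self, key):
            start, end = self.labels_deltas[key]
            lines = self.lines[start + 1:end]  # skip the label line
            # Like the diff, recompute the sha from the extracted lines.
            return lines, hashlib.sha1(''.join(lines)).hexdigest()

    compressor = SketchCompressor()
    sha1 = compressor.compress(('key1',), ['a\n', 'b\n'])
    assert compressor.extract(('key1',)) == (['a\n', 'b\n'], sha1)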
@@ -213,6 +228,7 @@
         """Return the overall compression ratio."""
         return float(self.input_bytes) / float(self.endpoint)
 
+
 def make_pack_factory(graph, delta, keylength):
     """Create a factory for creating a pack based groupcompress.
@@ -262,6 +278,7 @@
         self._index = index
         self._access = access
         self._delta = delta
+        self._unadded_refs = {}
 
     def add_lines(self, key, parents, lines, parent_texts=None,
         left_matching_blocks=None, nostore_sha=None, random_id=False,
@@ -380,6 +397,10 @@
             source_results.append(new_result)
             result.update(new_result)
             missing.difference_update(set(new_result))
+        if self._unadded_refs:
+            for key in missing:
+                if key in self._unadded_refs:
+                    result[key] = self._unadded_refs[key]
         return result
 
     def get_record_stream(self, keys, ordering, include_delta_closure):
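
With texts buffered in an unflushed group, the indices alone no longer know every present key, so get_parent_map now falls back to _unadded_refs for anything the indices report missing. As a hypothetical standalone function (index_parents stands in for the results gathered from the index sources):

    def get_parent_map_with_buffered(index_parents, unadded_refs, keys):
        # Resolve parents from the index first, then fall back to the
        # refs buffered for texts still in the unflushed compressor.
        result = {}
        missing = set(keys)
        for key in missing.intersection(index_parents):
            result[key] = index_parents[key]
        missing.difference_update(result)
        if unadded_refs:
            for key in missing:
                if key in unadded_refs:
                    result[key] = unadded_refs[key]
        return result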
@@ -409,29 +430,41 @@
             # everything in the same group, etc.
             parent_map = dict((key, details[2]) for key, details in
                 locations.iteritems())
+            local = frozenset(keys).intersection(set(self._unadded_refs))
+            for key in local:
+                parent_map[key] = self._unadded_refs[key]
+                locations[key] = None
             present_keys = topo_sort(parent_map)
             # Now group by source:
         else:
             present_keys = locations.keys()
+            local = frozenset(keys).intersection(set(self._unadded_refs))
+            for key in local:
+                present_keys.append(key)
+                locations[key] = None
         absent_keys = keys.difference(set(locations))
         for key in absent_keys:
             yield AbsentContentFactory(key)
         for key in present_keys:
-            index_memo, _, parents, (method, _) = locations[key]
-            # read
-            read_memo = index_memo[0:3]
-            zdata = self._access.get_raw_records([read_memo]).next()
-            # decompress
-            plain = zlib.decompress(zdata)
-            # parse
-            delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
-            label, sha1, delta = parse(delta_lines)
-            if label != key:
-                raise AssertionError("wrong key: %r, wanted %r" % (label, key))
-            basis = plain[:index_memo[3]]
-            basis = StringIO(basis).readlines()
-            #basis = split_lines(plain[:last_end])
-            lines = apply_delta(basis, delta)
+            if key in self._unadded_refs:
+                lines, sha1 = self._compressor.extract(key)
+                parents = self._unadded_refs[key]
+            else:
+                index_memo, _, parents, (method, _) = locations[key]
+                # read
+                read_memo = index_memo[0:3]
+                zdata = self._access.get_raw_records([read_memo]).next()
+                # decompress
+                plain = zlib.decompress(zdata)
+                # parse
+                delta_lines = split_lines(plain[index_memo[3]:index_memo[4]])
+                label, sha1, delta = parse(delta_lines)
+                if label != key:
+                    raise AssertionError("wrong key: %r, wanted %r" % (label, key))
+                basis = plain[:index_memo[3]]
+                basis = StringIO(basis).readlines()
+                #basis = split_lines(plain[:last_end])
+                lines = apply_delta(basis, delta)
             bytes = ''.join(lines)
             yield FulltextContentFactory(key, parents, sha1, bytes)
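
get_record_stream now has two sources of fulltexts: keys still buffered in the live compressor are answered via self._compressor.extract() with parents taken from _unadded_refs, while everything else goes through the existing read / zlib-decompress / parse / apply_delta pipeline, now demoted to the else branch. Buffered keys are also spliced into present_keys with a None placeholder in locations so both orderings cover them. The shape of the dispatch, as a hypothetical distillation (fetch_stored stands in for the stored-record pipeline):

    def iter_fulltexts(present_keys, unadded_refs, compressor, fetch_stored):
        # Buffered texts come straight from the live compressor;
        # everything else is fetched and delta-expanded from storage.
        for key in present_keys:
            if key in unadded_refs:
                lines, sha1 = compressor.extract(key)
                parents = unadded_refs[key]
            else:
                lines, sha1, parents = fetch_stored(key)
            yield key, parents, sha1, ''.join(lines)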
@@ -479,12 +512,13 @@
         adapters = {}
         # This will go up to fulltexts for gc to gc fetching, which isn't
         # ideal.
-        compressor = GroupCompressor(self._delta)
+        self._compressor = GroupCompressor(self._delta)
+        self._unadded_refs = {}
         keys_to_add = []
         basis_end = 0
         groups = 1
         def flush():
-            compressed = zlib.compress(''.join(compressor.lines))
+            compressed = zlib.compress(''.join(self._compressor.lines))
             index, start, length = self._access.add_raw_records(
                 [(None, len(compressed))], compressed)[0]
             nodes = []
@@ -502,20 +536,24 @@
                 adapter = get_adapter(adapter_key)
                 bytes = adapter.get_bytes(record,
                     record.get_bytes_as(record.storage_kind))
-            found_sha1, end_point = compressor.compress(record.key,
+            found_sha1, end_point = self._compressor.compress(record.key,
                 split_lines(bytes), record.sha1)
+            self._unadded_refs[record.key] = record.parents
             yield found_sha1
             keys_to_add.append((record.key, '%d %d' % (basis_end, end_point),
                 (record.parents,)))
             basis_end = end_point
             if basis_end > 1024 * 1024 * 20:
                 flush()
-                compressor = GroupCompressor(self._delta)
+                self._compressor = GroupCompressor(self._delta)
+                self._unadded_refs = {}
                 keys_to_add = []
                 basis_end = 0
                 groups += 1
         if len(keys_to_add):
             flush()
+        self._compressor = None
+        self._unadded_refs = {}
 
     def iter_lines_added_or_present_in_keys(self, keys, pb=None):
         """Iterate over the lines in the versioned files from keys.
=== modified file 'tests/test_groupcompress.py'
--- a/tests/test_groupcompress.py 2008-07-07 02:27:03 +0000
+++ b/tests/test_groupcompress.py 2008-07-15 14:22:16 +0000
@@ -136,3 +136,18 @@
         compressor.compress(('label3',),
             ['new\n', 'common\n', 'different\n', 'moredifferent\n'], None)
         self.assertAlmostEqual(0.3, compressor.ratio(), 1)
+
+    def test_extract_from_compressor(self):
+        # Knit fetching will try to reconstruct texts locally which results in
+        # reading something that is in the compressor stream already.
+        compressor = groupcompress.GroupCompressor(True)
+        sha_1, _ = compressor.compress(('label',),
+            ['strange\n', 'common\n'], None)
+        sha_2, _ = compressor.compress(('newlabel',),
+            ['common\n', 'different\n', 'moredifferent\n'], None)
+        # get the first out
+        self.assertEqual((['strange\n', 'common\n'], sha_1),
+            compressor.extract(('label',)))
+        # and the second
+        self.assertEqual((['common\n', 'different\n', 'moredifferent\n'],
+            sha_2), compressor.extract(('newlabel',)))