Rev 3898: Change GroupCompressor.compress() to return the start_point. in http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 23 20:37:11 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr/brisbane-core
------------------------------------------------------------
revno: 3898
revision-id: john at arbash-meinel.com-20090323203538-kj155dzrd5epzaf5
parent: john at arbash-meinel.com-20090323201046-ek580vnq69i270lp
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: brisbane-core
timestamp: Mon 2009-03-23 15:35:38 -0500
message:
Change GroupCompressor.compress() to return the start_point.
Also, mark empty content with start=end=0.
This also gives us a good starting point for handling duplicate entries (if we
find that makes a difference).
From experimentation, using 0,0 for empty entries makes a big difference in the
text index, mostly because about half of all entries have no content (all of
the directory records, for example), so it allows the compression to shrink
the index a bit.
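As a rough sketch of the new calling convention (illustrative only: the keys
and sample bytes are made up, but the return shape, the empty-content
behaviour, and _null_sha1 follow the patch below):

    from bzrlib import groupcompress

    compressor = groupcompress.GroupCompressor()

    # Non-empty content: start_point and end_point bracket where the bytes
    # landed in the group, so callers no longer track the previous end_point.
    sha1, start, end, kind, length = compressor.compress(
        ('file-key',), 'some\ntext\n', None)

    # Empty content (e.g. a directory record) is reported as start=end=0
    # with the sha1 of the empty string, which lets the text index compress
    # these very common entries.
    sha1, start, end, kind, length = compressor.compress(('dir-key',), '', None)
    assert (start, end) == (0, 0)
    assert sha1 == groupcompress._null_sha1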
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-23 20:04:42 +0000
+++ b/bzrlib/groupcompress.py 2009-03-23 20:35:38 +0000
@@ -59,6 +59,10 @@
_NO_LABELS = True
_FAST = False
+# osutils.sha_string('')
+_null_sha1 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
+
+
def encode_base128_int(val):
"""Convert an integer into a 7-bit lsb encoding."""
bytes = []
@@ -558,17 +562,16 @@
compressor = GroupCompressor()
tstart = time.time()
old_length = self._block._content_length
- cur_endpoint = 0
+ end_point = 0
for factory in self._factories:
bytes = factory.get_bytes_as('fulltext')
- (found_sha1, end_point, type,
+ (found_sha1, start_point, end_point, type,
length) = compressor.compress(factory.key, bytes, factory.sha1)
# Now update this factory with the new offsets, etc
factory.sha1 = found_sha1
- factory._start = cur_endpoint
+ factory._start = start_point
factory._end = end_point
- cur_endpoint = end_point
- self._last_byte = cur_endpoint
+ self._last_byte = end_point
new_block = compressor.flush()
# TODO: Should we check that new_block really *is* smaller than the old
# block? It seems hard to come up with a method that it would
@@ -770,12 +773,21 @@
the group output so far.
:seealso VersionedFiles.add_lines:
"""
- if not _FAST or expected_sha is None:
- sha1 = sha_string(bytes)
- else:
+ if not bytes: # empty, like a dir entry, etc
+ if nostore_sha == _null_sha1:
+ raise errors.ExistingContent()
+ self._block.add_entry(key, type='empty',
+ sha1=None, start=0,
+ length=0)
+ return _null_sha1, 0, 0, 'fulltext', 0
+ # we assume someone knew what they were doing when they passed it in
+ if expected_sha is not None:
sha1 = expected_sha
- if sha1 == nostore_sha:
- raise errors.ExistingContent()
+ else:
+ sha1 = osutils.sha_string(bytes)
+ if nostore_sha is not None:
+ if sha1 == nostore_sha:
+ raise errors.ExistingContent()
if key[-1] is None:
key = key[:-1] + ('sha1:' + sha1,)
input_len = len(bytes)
@@ -813,6 +825,7 @@
self._delta_index.add_delta_source(delta, len_mini_header)
self._block.add_entry(key, type=type, sha1=sha1,
start=self.endpoint, length=length)
+ start = self.endpoint
delta_start = (self.endpoint, len(self.lines))
self.num_keys += 1
self.output_chunks(new_chunks)
@@ -823,7 +836,7 @@
raise AssertionError('the delta index is out of sync'
'with the output lines %s != %s'
% (self._delta_index._source_offset, self.endpoint))
- return sha1, self.endpoint, type, length
+ return sha1, start, self.endpoint, type, length
def extract(self, key):
"""Extract a key previously added to the compressor.
@@ -1451,7 +1464,7 @@
if max_fulltext_len < len(bytes):
max_fulltext_len = len(bytes)
max_fulltext_prefix = prefix
- (found_sha1, end_point, type,
+ (found_sha1, start_point, end_point, type,
length) = self._compressor.compress(record.key,
bytes, record.sha1, soft=soft,
nostore_sha=nostore_sha)
@@ -1499,9 +1512,8 @@
if start_new_block:
self._compressor.pop_last()
flush()
- basis_end = 0
max_fulltext_len = len(bytes)
- (found_sha1, end_point, type,
+ (found_sha1, start_point, end_point, type,
length) = self._compressor.compress(record.key,
bytes, record.sha1)
last_fulltext_len = length
=== modified file 'bzrlib/tests/test_groupcompress.py'
--- a/bzrlib/tests/test_groupcompress.py 2009-03-20 15:43:10 +0000
+++ b/bzrlib/tests/test_groupcompress.py 2009-03-23 20:35:38 +0000
@@ -42,15 +42,37 @@
def test_one_nosha_delta(self):
# diff against NUKK
compressor = groupcompress.GroupCompressor()
- sha1, end_point, _, _ = compressor.compress(('label',),
+ sha1, start_point, end_point, _, _ = compressor.compress(('label',),
'strange\ncommon\n', None)
self.assertEqual(sha_string('strange\ncommon\n'), sha1)
expected_lines = [
'f', '\x0f', 'strange\ncommon\n',
]
self.assertEqual(expected_lines, compressor.lines)
+ self.assertEqual(0, start_point)
self.assertEqual(sum(map(len, expected_lines)), end_point)
+ def test_empty_content(self):
+ compressor = groupcompress.GroupCompressor()
+ # Adding empty bytes should return the 'null' record
+ sha1, start_point, end_point, kind, _ = compressor.compress(('empty',),
+ '', None)
+ self.assertEqual(0, start_point)
+ self.assertEqual(0, end_point)
+ self.assertEqual('fulltext', kind)
+ self.assertEqual(groupcompress._null_sha1, sha1)
+ self.assertEqual(0, compressor.endpoint)
+ self.assertEqual([], compressor.lines)
+ # Even after adding some content
+ compressor.compress(('content',), 'some\nbytes\n', None)
+ self.assertTrue(compressor.endpoint > 0)
+ sha1, start_point, end_point, kind, _ = compressor.compress(('empty2',),
+ '', None)
+ self.assertEqual(0, start_point)
+ self.assertEqual(0, end_point)
+ self.assertEqual('fulltext', kind)
+ self.assertEqual(groupcompress._null_sha1, sha1)
+
def _chunks_to_repr_lines(self, chunks):
return '\n'.join(map(repr, ''.join(chunks).split('\n')))
@@ -68,10 +90,10 @@
def test_two_nosha_delta(self):
compressor = groupcompress.GroupCompressor()
- sha1_1, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _, _ = compressor.compress(('label',),
'strange\ncommon long line\nthat needs a 16 byte match\n', None)
expected_lines = list(compressor.lines)
- sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
+ sha1_2, start_point, end_point, _, _ = compressor.compress(('newlabel',),
'common long line\nthat needs a 16 byte match\ndifferent\n', None)
self.assertEqual(sha_string('common long line\n'
'that needs a 16 byte match\n'
@@ -93,12 +115,12 @@
# The first interesting test: make a change that should use lines from
# both parents.
compressor = groupcompress.GroupCompressor()
- sha1_1, end_point, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _, _ = compressor.compress(('label',),
'strange\ncommon very very long line\nwith some extra text\n', None)
- sha1_2, _, _, _ = compressor.compress(('newlabel',),
+ sha1_2, _, _, _, _ = compressor.compress(('newlabel',),
'different\nmoredifferent\nand then some more\n', None)
expected_lines = list(compressor.lines)
- sha1_3, end_point, _, _ = compressor.compress(('label3',),
+ sha1_3, start_point, end_point, _, _ = compressor.compress(('label3',),
'new\ncommon very very long line\nwith some extra text\n'
'different\nmoredifferent\nand then some more\n',
None)
@@ -137,10 +159,10 @@
# Knit fetching will try to reconstruct texts locally which results in
# reading something that is in the compressor stream already.
compressor = groupcompress.GroupCompressor()
- sha1_1, _, _, _ = compressor.compress(('label',),
+ sha1_1, _, _, _, _ = compressor.compress(('label',),
'strange\ncommon long line\nthat needs a 16 byte match\n', None)
expected_lines = list(compressor.lines)
- sha1_2, end_point, _, _ = compressor.compress(('newlabel',),
+ sha1_2, _, end_point, _, _ = compressor.compress(('newlabel',),
'common long line\nthat needs a 16 byte match\ndifferent\n', None)
# get the first out
self.assertEqual(('strange\ncommon long line\n'