Rev 4785: Switch from using fixed 4-byte group inner start + length to variable width. in http://bazaar.launchpad.net/~jameinel/bzr/chk-index
John Arbash Meinel
john at arbash-meinel.com
Wed Oct 28 03:39:33 GMT 2009
At http://bazaar.launchpad.net/~jameinel/bzr/chk-index
------------------------------------------------------------
revno: 4785
revision-id: john at arbash-meinel.com-20091028033927-mc4axtqvjw2xlml1
parent: john at arbash-meinel.com-20091028022709-zzd4zvfyoip1oisi
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: chk-index
timestamp: Tue 2009-10-27 22:39:27 -0500
message:
Switch from using fixed 4-byte group inner start + length to variable width.
The main reason is the observation that chk pages cap at 64kB,
so the inner length field never needs more than 2 bytes.
I would still like to consider a per-entry variable-width encoding here:
the length can peak at 64k, but on average it will
likely be a lot smaller.
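
For reference, the diff below relies on find_bytes(), which is not part of
this patch. A minimal sketch of what it presumably does, based on the widths
the tests assert (the restriction to 1/2/4/8-byte widths is an assumption,
since those are the only unsigned sizes struct supports directly):

    def find_bytes(max_value):
        # Smallest struct-compatible width (1, 2, 4, or 8 bytes) that can
        # hold max_value as an unsigned integer.
        for num_bytes in (1, 2, 4, 8):
            if max_value < (1 << (8 * num_bytes)):
                return num_bytes
        raise ValueError('%d is too large to encode' % (max_value,))

Under this reading, find_bytes(1000) == 2, which is what turns the entry
coder from '<20sBII' into '<20sBHH' in the tests below.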
-------------- next part --------------
=== modified file 'bzrlib/chk_index.py'
--- a/bzrlib/chk_index.py 2009-10-28 02:27:09 +0000
+++ b/bzrlib/chk_index.py 2009-10-28 03:39:27 +0000
@@ -250,7 +250,7 @@
@classmethod
def build_header_for(cls, num_groups, max_group_offset, max_group_length,
- num_entries):
+ num_entries, max_inner_start, max_inner_length):
header = cls()
header.num_groups = num_groups
header.group_index_offset = header._RESERVED_HEADER_BYTES
@@ -267,8 +267,8 @@
# TODO: Size these appropriately. It would be *really* nice if we could
# shrink them to 3 bytes, since this is 1 byte per entry, and
# num_entries is our largest multiplier
- header.entry_group_start_bytes = 4
- header.entry_group_length_bytes = 4
+ header.entry_group_start_bytes = find_bytes(max_inner_start)
+ header.entry_group_length_bytes = find_bytes(max_inner_length)
header._build_entry_coder()
entry_bytes = header.num_entries * header.entry_coder.size
if entry_bytes < 4096:
@@ -377,7 +377,7 @@
raise AssertionError('the empty group must be at offset 0')
return sorted_groups
- def add_node(self, key, group_start, group_end, inner_start, inner_end):
+ def add_node(self, key, group_start, group_len, inner_start, inner_len):
"""Add a node to the index.
If adding the node causes the builder to reach its spill_at threshold,
@@ -395,12 +395,12 @@
# will be shared with other parts of code. So we may not be
# saving anything here
bit_key = self._check_key(key)
- group_key = (group_start, group_end) # TODO: StaticTuple().intern()
+ group_key = (group_start, group_len) # TODO: StaticTuple().intern()
self._ensure_group(group_key)
if bit_key in self._nodes:
raise errors.BadIndexDuplicateKey(key, self)
# TODO: StaticTuple
- self._nodes[bit_key] = (group_key, inner_start, inner_end)
+ self._nodes[bit_key] = (group_key, inner_start, inner_len)
def key_count(self):
"""How many keys are present in this index?"""
@@ -409,10 +409,16 @@
def _build_header(self):
max_group_offset = max([x[0] for x in self._groups])
max_group_length = max([x[1] for x in self._groups])
+ if self._nodes:
+ max_inner_start = max([v[1] for v in self._nodes.itervalues()])
+ max_inner_length = max([v[2] for v in self._nodes.itervalues()])
+ else:
+ max_inner_start = max_inner_length = 1
# TODO: We could walk the nodes and find the max-start and max-length
# for entries. Not sure if it is really worthwhile
header = CHKIndexHeader.build_header_for(len(self._groups),
- max_group_offset, max_group_length, len(self._nodes))
+ max_group_offset, max_group_length, len(self._nodes),
+ max_inner_start, max_inner_length)
return header
def _build_group_index(self, header):
@@ -421,17 +427,20 @@
sorted_groups = self._sort_groups()
pack = header.group_coder.pack
bytes = [pack(*group_key) for group_key in sorted_groups]
+ # Note: In testing with Launchpad data, the fixed-width encoding for
+ # group starts and lengths was not a big win; there was a lot of
+ # wasted space, especially in the second (length) field
return bytes
def _entry_to_bytes(self, bit_key, entry_coder):
# TODO: this should probably be inlined, as it is the bulk of the work
# being done
- group_key, inner_start, inner_end = self._nodes[bit_key]
+ group_key, inner_start, inner_len = self._nodes[bit_key]
group_offset = self._groups[group_key]
if group_offset is None:
raise AssertionError('you must call _sort_groups before'
' you can transform _entry_to_bytes')
- return entry_coder.pack(bit_key, group_offset, inner_start, inner_end)
+ return entry_coder.pack(bit_key, group_offset, inner_start, inner_len)
@staticmethod
def _get_bit_key_to_mini_index(num_mini_index_entries):
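
(The tests below assert the resulting struct format strings.
_build_entry_coder() is also not shown in this diff; presumably it maps the
chosen byte widths to struct codes, roughly along these lines. The mapping
table and names here are assumptions; note that struct has no 3-byte code,
which is why the TODO above can only wish for 3 bytes.)

    import struct

    _BYTES_TO_CODE = {1: 'B', 2: 'H', 4: 'I', 8: 'Q'}

    def _build_entry_coder(header):
        # Little-endian: 20-byte hash, group number, inner start, inner length.
        header.entry_coder = struct.Struct(
            '<%ds' % header.entry_hash_bytes
            + _BYTES_TO_CODE[header.entry_group_offset_bytes]
            + _BYTES_TO_CODE[header.entry_group_start_bytes]
            + _BYTES_TO_CODE[header.entry_group_length_bytes])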
=== modified file 'bzrlib/tests/test_chk_index.py'
--- a/bzrlib/tests/test_chk_index.py 2009-10-28 02:27:09 +0000
+++ b/bzrlib/tests/test_chk_index.py 2009-10-28 03:39:27 +0000
@@ -155,13 +155,13 @@
self.assertEqual(20, h.entry_hash_bytes)
self.assertEqual(1, h.entry_group_offset_bytes)
# For now, these widths are hard-coded
- self.assertEqual(4, h.entry_group_start_bytes)
- self.assertEqual(4, h.entry_group_length_bytes)
- self.assertEqual('<20sBII', h.entry_coder.format)
+ self.assertEqual(1, h.entry_group_start_bytes)
+ self.assertEqual(1, h.entry_group_length_bytes)
+ self.assertEqual('<20sBBB', h.entry_coder.format)
# (sha_hash, group number, group offset, group length)
- self.assertEqual(b2a_hex(self.bit_k1) + '01' '00000000' '01000000',
+ self.assertEqual(b2a_hex(self.bit_k1) + '01' '00' '01',
b2a_hex(builder._entry_to_bytes(self.bit_k1, h.entry_coder)))
- self.assertEqual(b2a_hex(self.bit_k2) + '02' '00000000' '32000000',
+ self.assertEqual(b2a_hex(self.bit_k2) + '02' '00' '32',
b2a_hex(builder._entry_to_bytes(self.bit_k2, h.entry_coder)))
def test__build_mini_index_and_entries_tiny(self):
@@ -179,8 +179,8 @@
# bit_k2 < bit_k1
self.assertEqualDiff('\n'.join([
#hash # Group # start # end/length?
- b2a_hex(self.bit_k2) + '02' '00000000' '32000000',
- b2a_hex(self.bit_k1) + '01' '00000000' '01000000',
+ b2a_hex(self.bit_k2) + '02' '00' '32',
+ b2a_hex(self.bit_k1) + '01' '00' '01',
]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
def test__get_bit_key_to_mini_index_0(self):
@@ -221,8 +221,8 @@
def test__build_mini_index_and_entries_16(self):
builder = chk_index.CHKIndexBuilder()
builder.add_node(self.k1, 0, 1000, 0, 1)
- builder.add_node(self.k2, 1000, 20000, 0, 50)
- builder.add_node(self.k3, 1000, 20000, 50, 1000)
+ builder.add_node(self.k2, 1000, 20000, 0, 1000)
+ builder.add_node(self.k3, 1000, 20000, 1000, 500)
h = builder._build_header()
builder._sort_groups()
self.assertEqual(0, h.num_mini_index_entries)
@@ -250,33 +250,34 @@
'0000',
'0000',
'0000',
- 'a100',
- 'be00',
+ '9d00',
+ 'b600',
'0000',
]), '\n'.join([b2a_hex(b) for b in mini_index_bytes]))
# bit_k2 < bit_k1
+ self.assertEqual('<20sBHH', h.entry_coder.format)
self.assertEqualDiff('\n'.join([
#hash # Group # start # end/length?
- b2a_hex(self.bit_k2) + '02' '00000000' '32000000',
- b2a_hex(self.bit_k1) + '01' '00000000' '01000000',
- b2a_hex(self.bit_k3) + '02' '32000000' 'e8030000',
+ b2a_hex(self.bit_k2) + '02' '0000' 'e803',
+ b2a_hex(self.bit_k1) + '01' '0000' '0100',
+ b2a_hex(self.bit_k3) + '02' 'e803' 'f401',
]), '\n'.join([b2a_hex(b) for b in entry_bytes]))
self.assertEqual(0x0084, h.entry_offset)
- self.assertEqual(0x00a1, h.entry_offset + len(entry_bytes[0]))
- self.assertEqual(0x00be, h.entry_offset + len(entry_bytes[0])
+ self.assertEqual(0x009d, h.entry_offset + len(entry_bytes[0]))
+ self.assertEqual(0x00b6, h.entry_offset + len(entry_bytes[0])
+ len(entry_bytes[1]))
def test_build_tiny_index(self):
builder = chk_index.CHKIndexBuilder()
builder.add_node(self.k1, 0, 1000, 0, 1)
- builder.add_node(self.k2, 1000, 20000, 0, 50)
- builder.add_node(self.k3, 1000, 20000, 50, 1000)
+ builder.add_node(self.k2, 1000, 20000, 0, 1000)
+ builder.add_node(self.k3, 1000, 20000, 1000, 500)
bytes = ''.join(builder.finish())
self.assertEqualDiff("""Groupcompress CHK Index 1
hash=sha1 20
groups=120 2 2 3
mini_index=132 0 1
-entry=3 20 1 4 4
+entry=3 20 1 2 2
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
""", bytes[:120])
b2a_hex = binascii.b2a_hex
@@ -287,9 +288,9 @@
'e803' '204e' # group 2 @ offset 1000, length 20000
'' # No mini index
# 3 entries follow
- + b2a_hex(self.bit_k2) + '02' '00000000' '32000000'
- + b2a_hex(self.bit_k1) + '01' '00000000' '01000000'
- + b2a_hex(self.bit_k3) + '02' '32000000' 'e8030000'
+ + b2a_hex(self.bit_k2) + '02' '0000' 'e803'
+ + b2a_hex(self.bit_k1) + '01' '0000' '0100'
+ + b2a_hex(self.bit_k3) + '02' 'e803' 'f401'
, b2a_hex(bytes[120:]))
@@ -326,7 +327,9 @@
num_groups=100*1000*1000,
max_group_offset=256**7,
max_group_length=(256**4)-1,
- num_entries=100*1000*1000)
+ num_entries=100*1000*1000,
+ max_inner_start=(256**4)-1,
+ max_inner_length=(256**4)-1)
bytes = header.serialize()
self.assertEqual(header._RESERVED_HEADER_BYTES, len(bytes))
# Note: In this 'worst-case' scenario, we use 119 bytes, and
@@ -516,7 +519,9 @@
header = chk_index.CHKIndexHeader.build_header_for(
num_groups=10, max_group_offset=4*1024*1024*10,
max_group_length=4*1024*1024,
- num_entries=10*1000)
+ num_entries=10*1000,
+ max_inner_start=4*1024*1024,
+ max_inner_length=65535)
self.assertEqual(10, header.num_groups)
self.assertEqual(header._RESERVED_HEADER_BYTES,
header.group_index_offset)
@@ -536,7 +541,7 @@
header = chk_index.CHKIndexHeader.build_header_for(
num_groups=3, max_group_offset=40000,
max_group_length=20000,
- num_entries=5)
+ num_entries=5, max_inner_start=40000, max_inner_length=40000)
self.assertEqual(3, header.num_groups)
self.assertEqual(header._RESERVED_HEADER_BYTES,
header.group_index_offset)
@@ -555,7 +560,9 @@
header = chk_index.CHKIndexHeader.build_header_for(
num_groups=100000, max_group_offset=2*1024*1024*100000,
max_group_length=600*1024*1024,
- num_entries=60*1024*1024)
+ num_entries=60*1024*1024,
+ max_inner_start=600*1024*1024,
+ max_inner_length=600*1024*1024)
self.assertEqual(100000, header.num_groups)
self.assertEqual(header._RESERVED_HEADER_BYTES,
header.group_index_offset)