Rev 4219: Merge bbc at 3908 in file:///home/vila/src/bzr/experimental/gc-py-bbc/
Vincent Ladeuil
v.ladeuil+lp@free.fr
Tue Mar 31 08:43:43 BST 2009
At file:///home/vila/src/bzr/experimental/gc-py-bbc/
------------------------------------------------------------
revno: 4219 [merge]
revision-id: v.ladeuil+lp@free.fr-20090331074343-wghocs28bnzbjlh2
parent: v.ladeuil+lp@free.fr-20090331074216-yxfi6sra7nh61v09
parent: v.ladeuil+lp@free.fr-20090331073419-89g1md60o2slg5t4
committer: Vincent Ladeuil <v.ladeuil+lp@free.fr>
branch nick: bbc
timestamp: Tue 2009-03-31 09:43:43 +0200
message:
Merge bbc at 3908
modified:
bzrlib/chk_map.py chk_map.py-20081001014447-ue6kkuhofvdecvxa-1
bzrlib/groupcompress.py groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
bzrlib/repofmt/pack_repo.py pack_repo.py-20070813041115-gjv5ma7ktfqwsjgn-1
bzrlib/repository.py rev_storage.py-20051111201905-119e9401e46257e3
bzrlib/tests/per_repository/test_commit_builder.py test_commit_builder.py-20060606110838-76e3ra5slucqus81-1
bzrlib/tests/test_chk_map.py test_chk_map.py-20081001014447-ue6kkuhofvdecvxa-2
=== modified file 'bzrlib/chk_map.py'
--- a/bzrlib/chk_map.py 2009-03-27 16:36:50 +0000
+++ b/bzrlib/chk_map.py 2009-03-30 21:13:24 +0000
@@ -1289,8 +1289,7 @@
next_interesting = set()
uninteresting_items = set()
interesting_items = set()
- interesting_records = []
- # records_read = set()
+ interesting_to_yield = []
for record in store.get_record_stream(chks_to_read, 'unordered', True):
# records_read.add(record.key())
if pb is not None:
@@ -1307,20 +1306,17 @@
# store
uninteresting_items.update(node.iteritems(None))
else:
- interesting_records.append(record)
+ interesting_to_yield.append(record.key)
if type(node) is InternalNode:
next_interesting.update(node.refs())
else:
interesting_items.update(node.iteritems(None))
- # TODO: Filter out records that have already been read, as node splitting
- # can cause us to reference the same nodes via shorter and longer
- # paths
return (next_uninteresting, uninteresting_items,
- next_interesting, interesting_records, interesting_items)
+ next_interesting, interesting_to_yield, interesting_items)
def _find_all_uninteresting(store, interesting_root_keys,
- uninteresting_root_keys, adapter, pb):
+ uninteresting_root_keys, pb):
"""Determine the full set of uninteresting keys."""
# What about duplicates between interesting_root_keys and
# uninteresting_root_keys?
@@ -1336,7 +1332,7 @@
# First step, find the direct children of both the interesting and
# uninteresting set
(uninteresting_keys, uninteresting_items,
- interesting_keys, interesting_records,
+ interesting_keys, interesting_to_yield,
interesting_items) = _find_children_info(store, interesting_root_keys,
uninteresting_root_keys,
pb=pb)
@@ -1357,10 +1353,7 @@
# TODO: Handle 'absent'
if pb is not None:
pb.tick()
- try:
- bytes = record.get_bytes_as('fulltext')
- except errors.UnavailableRepresentation:
- bytes = adapter.get_bytes(record)
+ bytes = record.get_bytes_as('fulltext')
# We don't care about search_key_func for this code, because we
# only care about external references.
node = _deserialise(bytes, record.key, search_key_func=None)
@@ -1375,7 +1368,7 @@
all_uninteresting_items.update(node._items.iteritems())
chks_to_read = next_chks
return (all_uninteresting_chks, all_uninteresting_items,
- interesting_keys, interesting_records, interesting_items)
+ interesting_keys, interesting_to_yield, interesting_items)
def iter_interesting_nodes(store, interesting_root_keys,
@@ -1390,7 +1383,7 @@
:param uninteresting_root_keys: keys which should be filtered out of the
result set.
:return: Yield
- (interesting records, interesting chk's, interesting key:values)
+ (interesting record, {interesting key:values})
"""
# TODO: consider that it may be more memory efficient to use the 20-byte
# sha1 string, rather than tuples of hexidecimal sha1 strings.
@@ -1399,33 +1392,24 @@
# able to use nodes from the _page_cache as well as actually
# requesting bytes from the store.
- # A way to adapt from the compressed texts back into fulltexts
- # In a way, this seems like a layering inversion to have CHKMap know the
- # details of versionedfile
- adapter_class = versionedfile.adapter_registry.get(
- ('knit-ft-gz', 'fulltext'))
- adapter = adapter_class(store)
-
(all_uninteresting_chks, all_uninteresting_items, interesting_keys,
- interesting_records, interesting_items) = _find_all_uninteresting(store,
- interesting_root_keys, uninteresting_root_keys, adapter, pb)
+ interesting_to_yield, interesting_items) = _find_all_uninteresting(store,
+ interesting_root_keys, uninteresting_root_keys, pb)
# Now that we know everything uninteresting, we can yield information from
# our first request
interesting_items.difference_update(all_uninteresting_items)
- records = dict((record.key, record) for record in interesting_records
- if record.key not in all_uninteresting_chks)
- if records or interesting_items:
- yield records, interesting_items
+ interesting_to_yield = set(interesting_to_yield) - all_uninteresting_chks
+ if interesting_items:
+ yield None, interesting_items
+ if interesting_to_yield:
+ # We request these records again, rather than buffering the root
+ # records, most likely they are still in the _group_cache anyway.
+ for record in store.get_record_stream(interesting_to_yield,
+ 'unordered', False):
+ yield record, []
+ all_uninteresting_chks.update(interesting_to_yield)
interesting_keys.difference_update(all_uninteresting_chks)
- # TODO: We need a test for this
- # This handles the case where after a split, one of the child trees
- # is identical to one of the interesting root keys. Like if you had a
- # leaf node, with "aa" "ab", that then overflowed at "bb". You would
- # get a new internal node, but it would have one leaf node with
- # ("aa", "ab") and another leaf node with "bb". And you don't want to
- # re-transmit that ("aa", "ab") node again
- all_uninteresting_chks.update(interesting_root_keys)
chks_to_read = interesting_keys
counter = 0
@@ -1436,10 +1420,7 @@
if pb is not None:
pb.update('find chk pages', counter)
# TODO: Handle 'absent'?
- try:
- bytes = record.get_bytes_as('fulltext')
- except errors.UnavailableRepresentation:
- bytes = adapter.get_bytes(record)
+ bytes = record.get_bytes_as('fulltext')
# We don't care about search_key_func for this code, because we
# only care about external references.
node = _deserialise(bytes, record.key, search_key_func=None)
@@ -1468,7 +1449,7 @@
# whole thing, but it does mean that callers need to
# understand they may get duplicate values.
# all_uninteresting_items.update(interesting_items)
- yield {record.key: record}, interesting_items
+ yield record, interesting_items
chks_to_read = next_chks
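
For readers following the interface change above: iter_interesting_nodes now yields (record, items) pairs, where record is a single record (or None for the initial batch of key/value items), instead of the previous dict of records. Below is a minimal sketch, not part of the patch, of how a caller might consume the new contract; the helper name collect_interesting and its arguments are illustrative only.

# Illustrative sketch only -- not part of this patch. Shows consuming the
# new (record, items) contract of chk_map.iter_interesting_nodes, where
# record may be None when only key/value items are yielded.
from bzrlib import chk_map

def collect_interesting(store, interesting_root_keys, uninteresting_root_keys):
    records = []
    items = set()
    for record, interesting_items in chk_map.iter_interesting_nodes(
            store, interesting_root_keys, uninteresting_root_keys):
        if record is not None:
            # A single CHK page record; callers that need the bytes can call
            # record.get_bytes_as('fulltext'), as the code above does.
            records.append(record)
        items.update(interesting_items)
    return records, items
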
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py 2009-03-27 16:36:50 +0000
+++ b/bzrlib/groupcompress.py 2009-03-30 21:30:33 +0000
@@ -506,12 +506,9 @@
self._manager._prepare_for_extract()
block = self._manager._block
self._bytes = block.extract(self.key, self._start, self._end)
- # XXX: It seems the smart fetch extracts inventories and chk
- # pages as fulltexts to find the next chk pages, but then
- # passes them down to be inserted as a
- # groupcompress-block, so this is not safe to do. Perhaps
- # we could just change the storage kind to "fulltext" at
- # that point?
+ # There are code paths that first extract as fulltext, and then
+ # extract as storage_kind (smart fetch). So we don't break the
+ # refcycle here, but instead in manager.get_record_stream()
# self._manager = None
if storage_kind == 'fulltext':
return self._bytes
@@ -551,13 +548,7 @@
yield factory
# Break the ref-cycle
factory._bytes = None
- # XXX: this is not safe, the smart fetch code requests the content
- # as both a 'fulltext', and then later on as a
- # groupcompress-block. The iter_interesting_nodes code also is
- # still buffering multiple records and returning them later.
- # So that code would need to be updated to either re-fetch the
- # original object, or buffer it somehow.
- # factory._manager = None
+ factory._manager = None
# TODO: Consider setting self._factories = None after the above loop,
# as it will break the reference cycle
@@ -667,6 +658,8 @@
record_header = '%s\n%s\n%d\n%d\n' % (
key_bytes, parent_bytes, factory._start, factory._end)
header_lines.append(record_header)
+ # TODO: Can we break the refcycle at this point and set
+ # factory._manager = None?
header_bytes = ''.join(header_lines)
del header_lines
header_bytes_len = len(header_bytes)
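
The groupcompress change above breaks the factory -> manager reference cycle as soon as a record has been yielded, now that the smart-fetch path no longer re-reads buffered records. A simplified sketch of the pattern follows, using stand-in class names rather than the real bzrlib ones:

# Illustrative sketch only -- stand-in classes, not the bzrlib API.
class _Factory(object):
    """Stand-in for a groupcompress record factory."""
    def __init__(self, manager):
        self._manager = manager  # back-reference that forms the cycle
        self._bytes = None

class _Manager(object):
    """Stand-in for the object holding a block and its factories."""
    def __init__(self, factories):
        self._factories = factories

    def get_record_stream(self):
        for factory in self._factories:
            yield factory
            # After the caller has consumed the record, drop the cached bytes
            # and the back-reference so the whole group can be freed without
            # waiting for the cycle collector.
            factory._bytes = None
            factory._manager = None
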
=== modified file 'bzrlib/repofmt/pack_repo.py'
--- a/bzrlib/repofmt/pack_repo.py 2009-03-27 16:36:50 +0000
+++ b/bzrlib/repofmt/pack_repo.py 2009-03-30 21:13:24 +0000
@@ -2471,7 +2471,7 @@
revision_ids = frozenset(revision_ids)
file_id_revisions = {}
bytes_to_info = CHKInventory._bytes_to_utf8name_key
- for records, items in chk_map.iter_interesting_nodes(self.chk_bytes,
+ for record, items in chk_map.iter_interesting_nodes(self.chk_bytes,
interesting_root_keys, uninteresting_root_keys,
pb=pb):
# This is cheating a bit to use the last grabbed 'inv', but it
=== modified file 'bzrlib/repository.py'
--- a/bzrlib/repository.py 2009-03-30 12:14:49 +0000
+++ b/bzrlib/repository.py 2009-03-31 07:43:43 +0000
@@ -4287,12 +4287,12 @@
def to_stream_adapter():
"""Adapt the iter_interesting_nodes result to a single stream.
- iter_interesting_nodes returns records as it processes them, which
- can be in batches. But we only want a single stream to be inserted.
+ iter_interesting_nodes returns records as it processes them, along
+ with keys. However, we only want to return the records themselves.
"""
for record, items in interesting:
- for value in record.itervalues():
- yield value
+ if record is not None:
+ yield record
# XXX: We could instead call get_record_stream(records.keys())
# ATM, this will always insert the records as fulltexts, and
# requires that you can hang on to records once you have gone
=== modified file 'bzrlib/tests/per_repository/test_commit_builder.py'
--- a/bzrlib/tests/per_repository/test_commit_builder.py 2009-03-30 12:14:49 +0000
+++ b/bzrlib/tests/per_repository/test_commit_builder.py 2009-03-31 07:43:43 +0000
@@ -185,7 +185,6 @@
except:
builder.abort()
raise
- builder.finish_inventory()
self.assertEqual(revision_id, builder.commit('foo bar'))
finally:
tree.unlock()
=== modified file 'bzrlib/tests/test_chk_map.py'
--- a/bzrlib/tests/test_chk_map.py 2009-03-25 07:54:11 +0000
+++ b/bzrlib/tests/test_chk_map.py 2009-03-30 21:13:24 +0000
@@ -1846,32 +1846,38 @@
store = self.get_chk_bytes()
iter_nodes = chk_map.iter_interesting_nodes(store, interesting_keys,
uninteresting_keys)
- for count, (exp, act) in enumerate(izip(expected, iter_nodes)):
- exp_record_keys, exp_items = exp
- records, items = act
- exp_tuple = (sorted(exp_record_keys), sorted(exp_items))
- act_tuple = (sorted(records.keys()), sorted(items))
- self.assertEqual(exp_tuple, act_tuple)
- self.assertEqual(len(expected), count + 1)
+ nodes = list(iter_nodes)
+ for count, (exp, act) in enumerate(izip(expected, nodes)):
+ exp_record, exp_items = exp
+ record, items = act
+ exp_tuple = (exp_record, sorted(exp_items))
+ if record is None:
+ act_tuple = (None, sorted(items))
+ else:
+ act_tuple = (record.key, sorted(items))
+ self.assertEqual(exp_tuple, act_tuple,
+ 'entry %d did not match expected' % count)
+ self.assertEqual(len(expected), len(nodes))
def test_empty_to_one_keys(self):
target = self.get_map_key({('a',): 'content'})
self.assertIterInteresting(
- [([target], [(('a',), 'content')])],
- [target], [])
+ [(target, [(('a',), 'content')]),
+ ], [target], [])
def test_none_to_one_key(self):
basis = self.get_map_key({})
target = self.get_map_key({('a',): 'content'})
self.assertIterInteresting(
- [([target], [(('a',), 'content')])],
- [target], [basis])
+ [(None, [(('a',), 'content')]),
+ (target, []),
+ ], [target], [basis])
def test_one_to_none_key(self):
basis = self.get_map_key({('a',): 'content'})
target = self.get_map_key({})
self.assertIterInteresting(
- [([target], [])],
+ [(target, [])],
[target], [basis])
def test_common_pages(self):
@@ -1896,8 +1902,8 @@
b_key = target_map._root_node._items['b'].key()
# This should return the root node, and the node for the 'b' key
self.assertIterInteresting(
- [([target], []),
- ([b_key], [(('b',), 'other content')])],
+ [(target, []),
+ (b_key, [(('b',), 'other content')])],
[target], [basis])
def test_common_sub_page(self):
@@ -1924,11 +1930,77 @@
# The key for the leaf aab node
aab_key = target_map._root_node._items['a']._items['aab'].key()
self.assertIterInteresting(
- [([target], []),
- ([a_key], []),
- ([aab_key], [(('aab',), 'new')])],
+ [(target, []),
+ (a_key, []),
+ (aab_key, [(('aab',), 'new')])],
[target], [basis])
+ def test_common_leaf(self):
+ basis = self.get_map_key({})
+ target1 = self.get_map_key({('aaa',): 'common'})
+ target2 = self.get_map_key({('aaa',): 'common',
+ ('bbb',): 'new',
+ })
+ target3 = self.get_map_key({('aaa',): 'common',
+ ('aac',): 'other',
+ ('bbb',): 'new',
+ })
+ # The LeafNode containing 'aaa': 'common' occurs at 3 different levels.
+ # Once as a root node, once as a second layer, and once as a third
+ # layer. It should only be returned one time regardless
+ target1_map = CHKMap(self.get_chk_bytes(), target1)
+ self.assertEqualDiff(
+ "'' LeafNode\n"
+ " ('aaa',) 'common'\n",
+ target1_map._dump_tree())
+ target2_map = CHKMap(self.get_chk_bytes(), target2)
+ self.assertEqualDiff(
+ "'' InternalNode\n"
+ " 'a' LeafNode\n"
+ " ('aaa',) 'common'\n"
+ " 'b' LeafNode\n"
+ " ('bbb',) 'new'\n",
+ target2_map._dump_tree())
+ target3_map = CHKMap(self.get_chk_bytes(), target3)
+ self.assertEqualDiff(
+ "'' InternalNode\n"
+ " 'a' InternalNode\n"
+ " 'aaa' LeafNode\n"
+ " ('aaa',) 'common'\n"
+ " 'aac' LeafNode\n"
+ " ('aac',) 'other'\n"
+ " 'b' LeafNode\n"
+ " ('bbb',) 'new'\n",
+ target3_map._dump_tree())
+ aaa_key = target1_map._root_node.key()
+ b_key = target2_map._root_node._items['b'].key()
+ a_key = target3_map._root_node._items['a'].key()
+ aac_key = target3_map._root_node._items['a']._items['aac'].key()
+ self.assertIterInteresting(
+ [(None, [(('aaa',), 'common')]),
+ (target1, []),
+ (target2, []),
+ (target3, []),
+ (b_key, [(('bbb',), 'new')]),
+ (a_key, []),
+ (aac_key, [(('aac',), 'other')]),
+ ], [target1, target2, target3], [basis])
+
+ self.assertIterInteresting(
+ [(target2, []),
+ (target3, []),
+ (b_key, [(('bbb',), 'new')]),
+ (a_key, []),
+ (aac_key, [(('aac',), 'other')]),
+ ], [target2, target3], [target1])
+
+ # This may be a case that we relax. A root node is a deep child of the
+ # excluded set. The cost is buffering root nodes until we have
+ # determined all possible exclusions. (Because a prefix of '', cannot
+ # be excluded.)
+ self.assertIterInteresting(
+ [], [target1], [target3])
+
def test_multiple_maps(self):
basis1 = self.get_map_key({('aaa',): 'common',
('aab',): 'basis1',
@@ -1976,9 +2048,10 @@
# The key for the leaf bba node
bba_key = target2_map._root_node._items['b']._items['bba'].key()
self.assertIterInteresting(
- [([target1, target2], []),
- ([a_key], []),
- ([b_key], []),
- ([aac_key], [(('aac',), 'target1')]),
- ([bba_key], [(('bba',), 'target2')]),
+ [(target1, []),
+ (target2, []),
+ (a_key, []),
+ (b_key, []),
+ (aac_key, [(('aac',), 'target1')]),
+ (bba_key, [(('bba',), 'target2')]),
], [target1, target2], [basis1, basis2])