Rev 2381: Switch the bisect code to support the fact that we can have in http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate
John Arbash Meinel
john at arbash-meinel.com
Fri Feb 23 21:20:47 GMT 2007
At http://bazaar.launchpad.net/%7Ebzr/bzr/dirstate
------------------------------------------------------------
revno: 2381
revision-id: john at arbash-meinel.com-20070223211937-2reect5p16takng2
parent: john at arbash-meinel.com-20070223174705-n1j4wyfh5jrlku7s
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate
timestamp: Fri 2007-02-23 15:19:37 -0600
message:
Switch the bisect code to support the fact that we can have
repeated (dir, name) pairs; (dir, name, file_id) is the unique key.
Also, fix some edge cases that arise when the page size is small and we don't
get clean divisions of records per page.
modified:
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/errors.py errors.py-20050309040759-20512168c4e14fbd
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
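
A quick illustration of what the message above means for callers of _bisect.
The sketch below uses made-up keys and details, not the real DirState fields;
it only shows that a (dir, name) pair can now map to more than one record, so
each lookup result becomes a list.

    # Hypothetical data only: the unique key is (dir, name, file_id), but the
    # lookup key is the non-unique (dir, name) pair.
    entries = [
        (('', 'a', 'a-id'), 'details for the removed entry'),
        (('', 'a', 'a-id2'), 'details for the re-added entry'),
        (('', 'b', 'b-id'), 'details'),
    ]
    by_dir_name = {}
    for key, details in entries:
        by_dir_name.setdefault((key[0], key[1]), []).append((key, details))
    # ('', 'a') now maps to two entries, so callers must expect a list per path.
    assert len(by_dir_name[('', 'a')]) == 2
    assert len(by_dir_name[('', 'b')]) == 1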
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2007-02-23 17:47:05 +0000
+++ b/bzrlib/dirstate.py 2007-02-23 21:19:37 +0000
@@ -235,7 +235,8 @@
# TODO: jam 20070221 Make sure we handle when there are duplicated records
# (like when we remove + add the same path, or we have a rename)
# TODO: jam 20070221 Figure out what to do if we have a record that exceeds
- # the BISECT_PAGE_SIZE.
+ # the BISECT_PAGE_SIZE. For now, we just have to make it large enough
+ # that we are sure a single record will always fit.
BISECT_PAGE_SIZE = 4096
NOT_IN_MEMORY = 0
@@ -374,28 +375,31 @@
# Because it means we can sync on the '\n'
state_file = self._state_file
file_size = os.fstat(state_file.fileno()).st_size
- # Because of the extra \0 we should have one extra field
- entry_field_count = self._fields_per_row() + 1
+ # We end up with 2 extra fields: we should have a trailing '\n' to
+ # ensure that we read the whole record, and we should have a precursor
+ # '' which ensures that we start after the previous '\n'
+ entry_field_count = self._fields_per_entry() + 1
+ # print '\nfield_count:', entry_field_count
low = self._end_of_header
high = file_size - 1 # Ignore the final '\0'
- # Map from (
+ # Map from (dir, name) => list of entries
found = {}
# Avoid infinite seeking
max_count = 30*len(dir_name_list)
count = 0
+ # pending is a list of places to look.
+ # each entry is a tuple of low, high, dir_names
+ # low -> the first byte offset to read (inclusive)
+ # high -> the last byte offset (inclusive)
+ # dir_names -> The list of (dir, name) pairs that should be found in
+ # the [low, high] range
pending = [(low, high, dir_name_list)]
- page_size = self.BISECT_PAGE_SIZE
+ page_size = self._bisect_page_size
fields_to_entry = self._get_fields_to_entry()
- def add_one_record(num):
- info = entries[num].split('\0')
- # We have to offset by 1 because we are syncing on the earlier '\n'
- # rather than the '\0'
- record = fields_to_entry(info[1:])
- found[(record[0][0], record[0][1])] = record
while pending:
low, high, cur_files = pending.pop()
@@ -414,37 +418,64 @@
if count > max_count:
raise errors.BzrError('Too many seeks, most likely a bug.')
+ # print low, high, cur_files
+
mid = max(low, (low+high-page_size)/2)
state_file.seek(mid)
- block = state_file.read(page_size)
- entries = block.split('\n')
+ # limit the read size, so we don't end up reading data that we have
+ # already read.
+ read_size = min(page_size, (high-mid)+1)
+ block = state_file.read(read_size)
start = mid
after = mid + len(block)
+ entries = block.split('\n')
+
+ if len(entries) < 2:
+ # We didn't find a '\n', so we cannot have found any records.
+ # A single read did not contain a record break, so records must be
+ # larger than page_size; raise rather than silently retrying with a
+ # bigger page size.
+ raise errors.BisectPageSizeTooSmall(page_size)
# Check the first and last entries, in case they are partial, or if
# we don't care about the rest of this page
first_entry_num = 0
- first_info = entries[0].split('\0')
- if len(first_info) < entry_field_count:
+ first_fields = entries[0].split('\0')
+ if len(first_fields) < entry_field_count:
# We didn't get the complete first entry
# so move start, and grab the next, which
# should be a full entry
start += len(entries[0])+1
- first_info = entries[1].split('\0')
+ first_fields = entries[1].split('\0')
first_entry_num = 1
- # Find what entries we are looking for, which occur before and
- # after this first record.
- first_info_dir_and_name = (first_info[1], first_info[2])
- first_loc = bisect.bisect_left(cur_files, first_info_dir_and_name)
-
- # These exist before the current location
- pre = cur_files[:first_loc]
- # These occur after the current location, which may be in the data
- # we read, or might be after the last entry
- middle_files = cur_files[first_loc:]
+ if len(first_fields) <= 2:
+ # We didn't even get a filename here... what do we do?
+ # For now, just fall over
+ raise errors.BisectPageSizeTooSmall(page_size)
+ else:
+ # Find what entries we are looking for, which occur before and
+ # after this first record.
+ first_dir_name = (first_fields[1], first_fields[2])
+ first_loc = bisect.bisect_left(cur_files, first_dir_name)
+
+ # These exist before the current location
+ pre = cur_files[:first_loc]
+ # These occur after the current location, which may be in the
+ # data we read, or might be after the last entry
+ middle_files = cur_files[first_loc:]
+
+ if len(first_fields) < entry_field_count:
+ # We didn't actually get a full record, so just mark
+ # everything as pending and continue
+ if middle_files:
+ pending.append((start, high, middle_files))
+ if pre:
+ pending.append((low, start-1, pre))
+ continue
+
# These are only after the last entry
post = []
@@ -453,19 +484,18 @@
# Parse the last entry
last_entry_num = len(entries)-1
- last_info = entries[last_entry_num].split('\0')
- if len(last_info) < entry_field_count:
+ last_fields = entries[last_entry_num].split('\0')
+ if len(last_fields) < entry_field_count:
# The very last hunk was not complete,
# read the previous hunk
# TODO: jam 20070217 There may be an edge case if there are
# not enough entries that were read.
after -= len(entries[-1])
last_entry_num -= 1
- last_info = entries[last_entry_num].split('\0')
+ last_fields = entries[last_entry_num].split('\0')
- last_info_dir_and_name = (last_info[1], last_info[2])
- last_loc = bisect.bisect_right(middle_files,
- last_info_dir_and_name)
+ last_dir_name = (last_fields[1], last_fields[2])
+ last_loc = bisect.bisect_right(middle_files, last_dir_name)
post = middle_files[last_loc:]
middle_files = middle_files[:last_loc]
@@ -476,21 +506,30 @@
# Either we will find them here, or we can mark them as
# missing.
+ if middle_files[0] == first_dir_name:
+ # We might need to go before this location
+ pre.append(middle_files[0])
+ if middle_files[-1] == last_dir_name:
+ post.insert(0, middle_files[-1])
+
# Find out what paths we have
- # TODO: jam 20070223 There should be faster ways of doing
- # this, but in the short term this should work
- # Also, consider that we already parsed the
- # first_info and last_info, so we don't need to split
- # them again.
- paths = dict((tuple(entries[num].split('\0', 3)[1:3]), num)
- for num in xrange(first_entry_num+1, last_entry_num))
- paths[first_info_dir_and_name] = first_entry_num
- paths[last_info_dir_and_name] = last_entry_num
+ paths = {first_dir_name:[first_fields],
+ last_dir_name:[last_fields]}
+ for num in xrange(first_entry_num+1, last_entry_num):
+ # TODO: jam 20070223 We are already splitting here, so
+ # shouldn't we just split the whole thing rather
+ # than doing the split again in add_one_record?
+ fields = entries[num].split('\0')
+ dir_name = (fields[1], fields[2])
+ paths.setdefault(dir_name, []).append(fields)
- for fname in middle_files:
- num = paths.get(fname, None)
- if num is not None:
- add_one_record(num)
+ for dir_name in middle_files:
+ for fields in paths.get(dir_name, []):
+ # offset by 1 because of the opening '\0'
+ # consider changing fields_to_entry to avoid the
+ # extra list slice
+ entry = fields_to_entry(fields[1:])
+ found.setdefault(dir_name, []).append(entry)
# Now we have split up everything into pre, middle, and post, and
# we have handled everything that fell in 'middle'.
@@ -502,9 +541,7 @@
if pre:
pending.append((low, start-1, pre))
- return [found.get(path_key, None) for path_key in dir_name_list]
-
-
+ return [found.get(dir_name, None) for dir_name in dir_name_list]
def _empty_parent_info(self):
return [DirState.NULL_PARENT_DETAILS] * (len(self._parents) -
@@ -612,7 +649,7 @@
entire_entry[tree_offset + 3] = DirState._to_yesno[tree_data[3]]
return '\0'.join(entire_entry)
- def _fields_per_row(self):
+ def _fields_per_entry(self):
"""How many null separated fields should be in each entry row.
Each line now has an extra '\n' field which is not used
@@ -1047,7 +1084,7 @@
# + newline
num_present_parents = self._num_present_parents()
tree_count = 1 + num_present_parents
- entry_size = self._fields_per_row()
+ entry_size = self._fields_per_entry()
expected_field_count = entry_size * self._num_entries
if len(fields) - cur > expected_field_count:
fields = fields[:expected_field_count + cur]
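
As an aside, the pre/middle/post partitioning that the new _bisect performs
for each page can be read in isolation. The sketch below is a simplified
stand-in with hypothetical names, not the patched method itself: given the
sorted (dir, name) keys still being searched and the first and last keys
parsed from one page, it splits the search list into keys that must lie
earlier in the file, keys that should be inside this page, and keys that must
lie later.

    import bisect

    def partition(cur_files, first_dir_name, last_dir_name):
        # cur_files is sorted; keys before the page's first record go to
        # 'pre', keys after its last record go to 'post', and the rest
        # should be findable inside the page we just read.
        first_loc = bisect.bisect_left(cur_files, first_dir_name)
        pre = cur_files[:first_loc]
        middle = cur_files[first_loc:]
        last_loc = bisect.bisect_right(middle, last_dir_name)
        post = middle[last_loc:]
        middle = middle[:last_loc]
        return pre, middle, post

    # A page whose first parsed record is ('', 'b') and last is ('b', 'd'):
    pre, middle, post = partition(
        [('', 'a'), ('', 'b'), ('b', 'c'), ('b', 'd'), ('b', 'z')],
        ('', 'b'), ('b', 'd'))
    assert pre == [('', 'a')]
    assert middle == [('', 'b'), ('b', 'c'), ('b', 'd')]
    assert post == [('b', 'z')]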
=== modified file 'bzrlib/errors.py'
--- a/bzrlib/errors.py 2007-02-17 03:34:50 +0000
+++ b/bzrlib/errors.py 2007-02-23 21:19:37 +0000
@@ -163,6 +163,18 @@
_fmt = "The tree builder is already building a tree."
+class BisectPageSizeTooSmall(BzrError):
+
+ _fmt = ("The dirstate bisect page size %(size)s is too small."
+ " We were unable to read an entire record.")
+
+ internal_error = True
+
+ def __init__(self, size):
+ BzrError.__init__(self)
+ self.size = size
+
+
class BzrCheckError(BzrError):
_fmt = "Internal check failed: %(message)s"
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2007-02-23 17:47:05 +0000
+++ b/bzrlib/tests/test_dirstate.py 2007-02-23 21:19:37 +0000
@@ -979,12 +979,45 @@
self.addCleanup(state.unlock)
self.assertEqual(dirstate.DirState.NOT_IN_MEMORY,
state._dirblock_state)
+ # This code is only really tested if we actually have to make more
+ # than one read, so set the page size to something smaller.
+ # We want it to contain about 2.2 records, so that we have a couple
+ # records that we can read per attempt
+ state._bisect_page_size = 200
+ return tree, state, expected
+
+ def create_duplicated_dirstate(self):
+ """Create a dirstate with a deleted and added entries.
+
+ This grabs a basic_dirstate, and then removes and re-adds every entry
+ with a new file id.
+ """
+ tree, state, expected = self.create_basic_dirstate()
+ # Now we will just remove and add every file so we get an extra entry
+ # per entry. Unversion in reverse order so we handle subdirs
+ tree.unversion(['f-id', 'e-id', 'd-id', 'c-id', 'b-id', 'a-id'])
+ tree.add(['a', 'b', 'b/c', 'b/d', 'b/d/e', 'f'],
+ ['a-id2', 'b-id2', 'c-id2', 'd-id2', 'e-id2', 'f-id2'])
+ # We will replace the 'dirstate' file underneath 'state', but that is
+ okay as long as we unlock 'state' first.
+ state.unlock()
+ try:
+ new_state = dirstate.DirState.from_tree(tree, 'dirstate')
+ try:
+ new_state.save()
+ finally:
+ new_state.unlock()
+ finally:
+ # But we need to leave state in a read-lock because we already have
+ # a cleanup scheduled
+ state.lock_read()
+ state._bisect_page_size = 150
return tree, state, expected
def assertBisect(self, expected, state, paths):
"""Assert that bisecting for paths returns the right result.
- :param expected: The list of entries we expect.
+ :param expected: A list of expected entry lists (or None), one per path.
:param state: The DirState object.
:param paths: A list of paths, these will automatically be split into
(dir, name) tuples, and sorted according to how _bisect
@@ -992,40 +1025,96 @@
"""
dir_names = sorted(osutils.split(p) for p in paths)
result = state._bisect(dir_names)
- self.assertEqual(expected, result)
+ # For now, results are just returned in whatever order we read them.
+ # We could sort by (dir, name, file_id) or something like that, but in
+ # the end it would still be fairly arbitrary, and we don't want the
+ # extra overhead if we can avoid it.
+ for expected_entries, actual_entries in zip(expected, result):
+ if expected_entries is not None:
+ expected_entries.sort()
+ if actual_entries is not None:
+ actual_entries.sort()
+ self.assertEqual(expected_entries, actual_entries)
+ self.assertEqual(len(expected), len(result))
def test_bisect_each(self):
"""Find a single record using bisect."""
tree, state, expected = self.create_basic_dirstate()
# Bisect should return the rows for the specified files.
- self.assertBisect([expected['']], state, [''])
- self.assertBisect([expected['a']], state, ['a'])
- self.assertBisect([expected['b']], state, ['b'])
- self.assertBisect([expected['b/c']], state, ['b/c'])
- self.assertBisect([expected['b/d']], state, ['b/d'])
- self.assertBisect([expected['b/d/e']], state, ['b/d/e'])
- self.assertBisect([expected['f']], state, ['f'])
+ self.assertBisect([[expected['']]], state, [''])
+ self.assertBisect([[expected['a']]], state, ['a'])
+ self.assertBisect([[expected['b']]], state, ['b'])
+ self.assertBisect([[expected['b/c']]], state, ['b/c'])
+ self.assertBisect([[expected['b/d']]], state, ['b/d'])
+ self.assertBisect([[expected['b/d/e']]], state, ['b/d/e'])
+ self.assertBisect([[expected['f']]], state, ['f'])
def test_bisect_multi(self):
"""Bisect can be used to find multiple records at the same time."""
tree, state, expected = self.create_basic_dirstate()
# Bisect should be capable of finding multiple entries at the same time
- self.assertBisect([expected['a'], expected['b'], expected['f']],
- state, ['a', 'b', 'f'])
- # ('', 'f') sorts before the others
- self.assertBisect([expected['f'], expected['b/d'], expected['b/d/e']],
- state, ['b/d', 'b/d/e', 'f'])
-
- def test_bisect_split_pages(self):
- """Test bisect when crossing page boundaries"""
- tree, state, expected = self.create_basic_dirstate()
- # Make sure we can't read all the records in a single page, but also
- # that we *can* read a singe entry in the page size.
- state._bisect_page_size = 100
- # Bisect should be capable of finding multiple entries at the same time
- self.assertBisect([expected['a'], expected['b'], expected['f']],
- state, ['a', 'b', 'f'])
- # ('', 'f') sorts before the others
- self.assertBisect([expected['f'], expected['b/d'], expected['b/d/e']],
- state, ['b/d', 'b/d/e', 'f'])
+ self.assertBisect([[expected['a']], [expected['b']], [expected['f']]],
+ state, ['a', 'b', 'f'])
+ # ('', 'f') sorts before the others
+ self.assertBisect([[expected['f']], [expected['b/d']],
+ [expected['b/d/e']]],
+ state, ['b/d', 'b/d/e', 'f'])
+
+ def test_bisect_one_page(self):
+ """Test bisect when there is only 1 page to read"""
+ tree, state, expected = self.create_basic_dirstate()
+ state._bisect_page_size = 5000
+ self.assertBisect([[expected['']]], state, [''])
+ self.assertBisect([[expected['a']]], state, ['a'])
+ self.assertBisect([[expected['b']]], state, ['b'])
+ self.assertBisect([[expected['b/c']]], state, ['b/c'])
+ self.assertBisect([[expected['b/d']]], state, ['b/d'])
+ self.assertBisect([[expected['b/d/e']]], state, ['b/d/e'])
+ self.assertBisect([[expected['f']]], state, ['f'])
+ self.assertBisect([[expected['a']], [expected['b']], [expected['f']]],
+ state, ['a', 'b', 'f'])
+ # ('', 'f') sorts before the others
+ self.assertBisect([[expected['f']], [expected['b/d']],
+ [expected['b/d/e']]],
+ state, ['b/d', 'b/d/e', 'f'])
+
+ def test_bisect_duplicate_paths(self):
+ """When bisecting for a path, handle multiple entries."""
+ tree, state, expected = self.create_duplicated_dirstate()
+
+ # Update the expected dictionary.
+ for path in ['a', 'b', 'b/c', 'b/d', 'b/d/e', 'f']:
+ orig = expected[path]
+ path2 = path + '2'
+ # This record was deleted in the current tree
+ expected[path] = (orig[0], [dirstate.DirState.NULL_PARENT_DETAILS,
+ orig[1][1]])
+ new_key = (orig[0][0], orig[0][1], orig[0][2]+'2')
+ # And didn't exist in the basis tree
+ expected[path2] = (new_key, [orig[1][0],
+ dirstate.DirState.NULL_PARENT_DETAILS])
+
+ # Now make sure that both records are properly returned.
+ self.assertBisect([[expected['a'], expected['a2']]], state, ['a'])
+
+ def test_bisect_page_size_too_small(self):
+ """We should raise an error if we detect a field longer than page_size.
+
+ This is a safety check, since we know we are reading in pages, and we
+ expect to fit at least slightly more than 1 record per page.
+ """
+ tree, state, expected = self.create_basic_dirstate()
+ state._bisect_page_size = 50
+ self.assertRaises(errors.BisectPageSizeTooSmall,
+ state._bisect, [('b', 'e')])
+
+ def test_bisect_missing(self):
+ """Test that bisect return None if it cannot find a path."""
+ tree, state, expected = self.create_basic_dirstate()
+ self.assertBisect([None], state, ['foo'])
+ self.assertBisect([None], state, ['b/foo'])
+ self.assertBisect([None], state, ['bar/foo'])
+
+ self.assertBisect([[expected['a']], None, [expected['b/d']]],
+ state, ['a', 'foo', 'b/d'])
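
Finally, a note on the changed result shape that the test updates above
reflect: _bisect used to return one entry (or None) per requested path, and it
now returns a list of entries (or None) per path, with no guaranteed order
inside each list. A toy check in the spirit of the new assertBisect, using
made-up entries:

    expected = [[('', 'a', 'a-id')], None,
                [('b', 'd', 'd-id'), ('b', 'd', 'd-id2')]]
    result = [[('', 'a', 'a-id')], None,
              [('b', 'd', 'd-id2'), ('b', 'd', 'd-id')]]
    for exp, act in zip(expected, result):
        # Sort each per-path list before comparing, since the entries for a
        # duplicated (dir, name) may come back in file order, not sorted.
        if exp is not None:
            exp.sort()
        if act is not None:
            act.sort()
        assert exp == act
    assert len(expected) == len(result)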