Rev 3668: If we read more than 50% of the whole index, in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer
John Arbash Meinel
john at arbash-meinel.com
Fri Aug 29 18:38:48 BST 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer
------------------------------------------------------------
revno: 3668
revision-id: john at arbash-meinel.com-20080829173847-n3h7gwv6hep9glbn
parent: john at arbash-meinel.com-20080829171504-p99qggtlhhvmmzzj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: graph_index_autobuffer
timestamp: Fri 2008-08-29 12:38:47 -0500
message:
If we read more than 50% of the whole index,
go ahead and buffer the whole thing on the next request.
This threshold could be tuned (30%? 75%?), but the old code could easily
reach the point where the total bytes read exceeded
the entire size of the file.
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS 2008-08-29 17:15:04 +0000
+++ b/NEWS 2008-08-29 17:38:47 +0000
@@ -39,7 +39,12 @@
single request treat it as a ``_buffer_all`` request. This happens
most often on small indexes over remote transports, where we default
to reading 64kB. It saves a round trip for each small index during
- fetch operations. (John Arbash Meinel)
+ fetch operations. Also, if we have read more than 50% of an index
+ file, trigger a ``_buffer_all`` on the next request. This works
+ around some inefficiencies because reads don't fall neatly on page
+ boundaries, so we would ignore those bytes, but request them again
+ later. This could trigger a total read size of more than the whole
+ file. (John Arbash Meinel)
BUG FIXES:
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py 2008-08-29 17:13:30 +0000
+++ b/bzrlib/index.py 2008-08-29 17:38:47 +0000
@@ -272,6 +272,8 @@
self._keys_by_offset = None
self._nodes_by_key = None
self._size = size
+ # The number of bytes we've read so far in trying to process this file
+ self._bytes_read = 0
def __eq__(self, other):
"""Equal when self and other were created with the same parameters."""
@@ -477,6 +479,12 @@
return []
if self._size is None and self._nodes is None:
self._buffer_all()
+
+ if self._nodes is None and self._bytes_read * 2 >= self._size:
+ # We've already read more than 50% of the file, go ahead and buffer
+ # the whole thing
+ self._buffer_all()
+
# We fit about 20 keys per minimum-read (4K), so if we are looking for
# more than 1/20th of the index its likely (assuming homogenous key
# spread) that we'll read the entire index. If we're going to do that,
@@ -989,6 +997,7 @@
self._size)
# parse
for offset, data in readv_data:
+ self._bytes_read += len(data)
if offset == 0 and len(data) == self._size:
# We 'accidentally' read the whole range, go straight into
# '_buffer_all'. This could happen because the transport
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py 2008-08-29 17:13:30 +0000
+++ b/bzrlib/tests/test_index.py 2008-08-29 17:38:47 +0000
@@ -666,6 +666,26 @@
# with buffering
self.assertIsNot(None, index._nodes)
+ def test_iter_entries_buffers_by_bytes_read(self):
+ index = self.make_index(nodes=self.make_nodes(64))
+ list(index.iter_entries([self.make_key(10)]))
+ # The first time through isn't enough to trigger a buffer all
+ self.assertIs(None, index._nodes)
+ self.assertEqual(4096, index._bytes_read)
+ # Grabbing a key in that same page won't trigger a buffer all, as we
+ # still haven't read 50% of the file
+ list(index.iter_entries([self.make_key(11)]))
+ self.assertIs(None, index._nodes)
+ self.assertEqual(4096, index._bytes_read)
+ # We haven't read more data, so reading outside the range won't trigger
+ # a buffer all right away
+ list(index.iter_entries([self.make_key(40)]))
+ self.assertIs(None, index._nodes)
+ self.assertEqual(8192, index._bytes_read)
+ # But on the next pass, we will trigger buffer all
+ list(index.iter_entries([self.make_key(32)]))
+ self.assertIsNot(None, index._nodes)
+
def test_iter_entries_references_resolved(self):
index = self.make_index(1, nodes=[
(('name', ), 'data', ([('ref', ), ('ref', )], )),
@@ -789,7 +809,20 @@
(('name', ), '', ()), (('foo', ), '', ())])
self.assertEqual(2, index.key_count())
- def test_readv_all_triggers_buffer_all(self):
+ def test_read_and_parse_tracks_real_read_value(self):
+ index = self.make_index(nodes=self.make_nodes(10))
+ del index._transport._activity[:]
+ index._read_and_parse([(0, 200)])
+ self.assertEqual([
+ ('readv', 'index', [(0, 200)], True, index._size),
+ ],
+ index._transport._activity)
+ # The readv expansion code will expand the initial request to 4096
+ # bytes, which is more than enough to read the entire index, and we
+ # will track the fact that we read that many bytes.
+ self.assertEqual(index._size, index._bytes_read)
+
+ def test_read_and_parse_triggers_buffer_all(self):
index = self.make_index(key_elements=2, nodes=[
(('name', 'fin1'), 'data', ()),
(('name', 'fin2'), 'beta', ()),
More information about the bazaar-commits
mailing list