Rev 3668: If we read more than 50% of the whole index, in http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer

John Arbash Meinel john at arbash-meinel.com
Fri Aug 29 18:38:48 BST 2008


At http://bzr.arbash-meinel.com/branches/bzr/1.7-dev/graph_index_autobuffer

------------------------------------------------------------
revno: 3668
revision-id: john at arbash-meinel.com-20080829173847-n3h7gwv6hep9glbn
parent: john at arbash-meinel.com-20080829171504-p99qggtlhhvmmzzj
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: graph_index_autobuffer
timestamp: Fri 2008-08-29 12:38:47 -0500
message:
  If we read more than 50% of the whole index,
  go ahead and buffer the whole thing on the next request.
  The threshold could be tuned (30%? 75%?), but the old code could
  easily end up reading more than 1x the total bytes of the file.
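
In essence, the change adds one threshold check before each lookup. A
minimal sketch of the heuristic (packaged as a hypothetical standalone
function; the real check sits at the top of iter_entries in
bzrlib/index.py):

def should_buffer_all(bytes_read, size, nodes):
    """Buffer the whole index once we have read at least 50% of it."""
    if nodes is not None:
        # Already fully buffered; nothing more to decide.
        return False
    return bytes_read * 2 >= size

print(should_buffer_all(4096, 16384, None))  # False: only 25% read so far
print(should_buffer_all(8192, 16384, None))  # True: 50% threshold reached

The `bytes_read * 2 >= size` form avoids floating point and matches the
patch exactly; only the packaging as a standalone function is invented
here.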
-------------- next part --------------
=== modified file 'NEWS'
--- a/NEWS	2008-08-29 17:15:04 +0000
+++ b/NEWS	2008-08-29 17:38:47 +0000
@@ -39,7 +39,12 @@
       single request treat it as a ``_buffer_all`` request. This happens
       most often on small indexes over remote transports, where we default
       to reading 64kB. It saves a round trip for each small index during
-      fetch operations. (John Arbash Meinel)
+      fetch operations. Also, once we have read more than 50% of an index
+      file, trigger a ``_buffer_all`` on the next request. This works
+      around an inefficiency: reads rarely fall neatly on page boundaries,
+      so we would discard the spare bytes in a page but then request them
+      again later, which could push the total bytes read past the size of
+      the whole file. (John Arbash Meinel)
 
   BUG FIXES:
 

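To see how the old behaviour could exceed 1x the file size, consider
page-aligned reads over a small index (illustrative numbers only, not
taken from the patch):

PAGE = 4096
size = 10000  # hypothetical index file size in bytes
total = 0
# Each requested range expands to a full 4 KiB page, and bytes outside
# the range we asked for are discarded, so revisiting a page re-reads it.
for start, length in [(100, 20), (5000, 20), (100, 20), (9000, 20)]:
    page_start = start - start % PAGE
    page_end = min(page_start + PAGE, size)
    total += page_end - page_start
print(total, total > size)  # 14096 True: more bytes read than the file holds

With the new heuristic, the third request would already find
bytes_read * 2 >= size and fall back to _buffer_all instead.
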
=== modified file 'bzrlib/index.py'
--- a/bzrlib/index.py	2008-08-29 17:13:30 +0000
+++ b/bzrlib/index.py	2008-08-29 17:38:47 +0000
@@ -272,6 +272,8 @@
         self._keys_by_offset = None
         self._nodes_by_key = None
         self._size = size
+        # The number of bytes we've read so far in trying to process this file
+        self._bytes_read = 0
 
     def __eq__(self, other):
         """Equal when self and other were created with the same parameters."""
@@ -477,6 +479,12 @@
             return []
         if self._size is None and self._nodes is None:
             self._buffer_all()
+
+        if self._nodes is None and self._bytes_read * 2 >= self._size:
+            # We've already read more than 50% of the file, go ahead and buffer
+            # the whole thing
+            self._buffer_all()
+
         # We fit about 20 keys per minimum-read (4K), so if we are looking for
         # more than 1/20th of the index it's likely (assuming homogeneous key
         # spread) that we'll read the entire index. If we're going to do that,
@@ -989,6 +997,7 @@
                 self._size)
             # parse
             for offset, data in readv_data:
+                self._bytes_read += len(data)
                 if offset == 0 and len(data) == self._size:
                     # We 'accidentally' read the whole range, go straight into
                     # '_buffer_all'. This could happen because the transport

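The accounting deliberately counts bytes delivered rather than bytes
requested, because the transport's readv may expand small requests. A
runnable sketch with a stub standing in for the transport (hypothetical;
real bzrlib transports do this expansion internally):

def fake_readv(relpath, offsets, adjust_for_latency, upper_limit):
    # Pretend a (0, 200) request was expanded to the whole 1500-byte file.
    yield (0, b'x' * 1500)

bytes_read = 0
for offset, data in fake_readv('index', [(0, 200)], True, 1500):
    bytes_read += len(data)  # count what arrived, not what was asked for
print(bytes_read)            # 1500

This is what makes the 50% check honest: it compares against the real
transfer cost, including readv expansion overhead.
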
=== modified file 'bzrlib/tests/test_index.py'
--- a/bzrlib/tests/test_index.py	2008-08-29 17:13:30 +0000
+++ b/bzrlib/tests/test_index.py	2008-08-29 17:38:47 +0000
@@ -666,6 +666,26 @@
         # with buffering
         self.assertIsNot(None, index._nodes)
 
+    def test_iter_entries_buffers_by_bytes_read(self):
+        index = self.make_index(nodes=self.make_nodes(64))
+        list(index.iter_entries([self.make_key(10)]))
+        # The first time through isn't enough to trigger a buffer all
+        self.assertIs(None, index._nodes)
+        self.assertEqual(4096, index._bytes_read)
+        # Grabbing a key in that same page won't trigger a buffer all, as we
+        # still haven't read 50% of the file
+        list(index.iter_entries([self.make_key(11)]))
+        self.assertIs(None, index._nodes)
+        self.assertEqual(4096, index._bytes_read)
+        # We haven't read more data, so reading outside the range won't trigger
+        # a buffer all right away
+        list(index.iter_entries([self.make_key(40)]))
+        self.assertIs(None, index._nodes)
+        self.assertEqual(8192, index._bytes_read)
+        # But on the next pass, we will trigger buffer all
+        list(index.iter_entries([self.make_key(32)]))
+        self.assertIsNot(None, index._nodes)
+
     def test_iter_entries_references_resolved(self):
         index = self.make_index(1, nodes=[
             (('name', ), 'data', ([('ref', ), ('ref', )], )),
@@ -789,7 +809,20 @@
             (('name', ), '', ()), (('foo', ), '', ())])
         self.assertEqual(2, index.key_count())
 
-    def test_readv_all_triggers_buffer_all(self):
+    def test_read_and_parse_tracks_real_read_value(self):
+        index = self.make_index(nodes=self.make_nodes(10))
+        del index._transport._activity[:]
+        index._read_and_parse([(0, 200)])
+        self.assertEqual([
+            ('readv', 'index', [(0, 200)], True, index._size),
+            ],
+            index._transport._activity)
+        # The readv expansion code will expand the initial request to 4096
+        # bytes, which is more than enough to read the entire index, and we
+        # will track the fact that we read that many bytes.
+        self.assertEqual(index._size, index._bytes_read)
+
+    def test_read_and_parse_triggers_buffer_all(self):
         index = self.make_index(key_elements=2, nodes=[
             (('name', 'fin1'), 'data', ()),
             (('name', 'fin2'), 'beta', ()),

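The call sequence in test_iter_entries_buffers_by_bytes_read is easier to
follow with the numbers written out. Assuming make_nodes(64) builds an
index somewhat over 8 KiB (the exact size depends on the key format;
anything in the 8193-16384 byte range behaves the same):

size = 9000          # hypothetical stand-in for index._size
bytes_read = 4096    # calls 1 and 2 both land in the first 4 KiB page
print(bytes_read * 2 >= size)  # False: no buffering yet
bytes_read += 4096   # call 3 reads a second page (key 40 is further out)
print(bytes_read * 2 >= size)  # True

Because the check runs at the top of iter_entries, before any new read,
crossing the threshold during call 3 only takes effect on call 4, which
is why the test needs the fourth lookup to observe index._nodes becoming
non-None.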

