Rev 3896: Change name to 'chunks_to_lines', and find an optimized form. in http://bzr.arbash-meinel.com/branches/bzr/1.11/get_record_stream_chunked

Thu Dec 11 02:02:32 GMT 2008

At http://bzr.arbash-meinel.com/branches/bzr/1.11/get_record_stream_chunked

------------------------------------------------------------
revno: 3896
revision-id: john at arbash-meinel.com-20081211020207-rrgdcyqc344zo5q1
parent: john at arbash-meinel.com-20081211011419-vqtdjgpa04woqvm4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: get_record_stream_chunked
timestamp: Wed 2008-12-10 20:02:07 -0600
message:
  Change name to 'chunks_to_lines', and find an optimized form.
  
  It is a little bit ugly, but it is faster than join & split, and means
  we get to leave the strings untouched.
-------------- next part --------------
=== modified file 'bzrlib/osutils.py'

--- a/bzrlib/osutils.py	2008-12-11 01:10:38 +0000
+++ b/bzrlib/osutils.py	2008-12-11 02:02:07 +0000
@@ -820,7 +820,7 @@
     return pathjoin(*p)
 
 
-def chunked_to_lines(chunks):
+def chunks_to_lines(chunks):
     """Ensure that chunks is split cleanly into lines.
 
     Each entry in the result should contain a single newline at the end. Except
@@ -829,6 +829,28 @@
     :param chunks: An iterable of strings
     :return: A list of strings.
     """
+    # Optimize for a very common case when chunks are already lines
+    def fail():
+        raise IndexError
+    try:
+        # This is a bit ugly, but is the fastest way to check if all of the
+        # chunks are individual lines.
+        # You can't use function calls like .count(), .index(), or .endswith()
+        # because they incur too much python overhead.
+        # It works because
+        #   if chunk is an empty string, it will raise IndexError, which will
+        #       be caught.
+        #   if chunk doesn't end with '\n' then we hit fail()
+        #   if there is more than one '\n' then we hit fail()
+        # timing shows this loop to take 2.58ms rather than 3.18ms for
+        # split_lines(''.join(chunks))
+        # Further, it means we get to preserve the original lines, rather than
+        # expanding memory
+        [(chunk[-1] == '\n' and '\n' not in chunk[:-1]) or fail()
+         for chunk in chunks]
+        return chunks
+    except IndexError:
+        pass
     return split_lines(''.join(chunks))
 
 

=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py	2008-12-11 01:14:19 +0000
+++ b/bzrlib/tests/test_osutils.py	2008-12-11 02:02:07 +0000
@@ -756,10 +756,10 @@
         self.assertEndsWith(osutils._mac_getcwd(), u'B\xe5gfors')
 
 
-class TestChunkedToLines(TestCase):
+class TestChunksToLines(TestCase):
 
     def assertChunksToLines(self, lines, chunks):
-        self.assertEqual(lines, osutils.chunked_to_lines(chunks))
+        self.assertEqual(lines, osutils.chunks_to_lines(chunks))
 
     def test_fulltext_chunk_to_lines(self):
         self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz\n'],
@@ -778,6 +778,13 @@
     def test_mixed(self):
         self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
                                  ['foo\n', 'bar\r\nba\r', 'z'])
+        self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
+                                 ['foo\nb', 'a', 'r\r\nba\r', 'z'])
+        self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
+                                 ['foo\nbar\r\nba', '\r', 'z'])
+
+        self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
+                                 ['foo\n', '', 'bar\r\nba', '\r', 'z'])
 
 
 class TestSplitLines(TestCase):