Rev 3896: Change name to 'chunks_to_lines', and find an optimized form. in http://bzr.arbash-meinel.com/branches/bzr/1.11/get_record_stream_chunked
John Arbash Meinel
john at arbash-meinel.com
Thu Dec 11 02:02:32 GMT 2008
At http://bzr.arbash-meinel.com/branches/bzr/1.11/get_record_stream_chunked
------------------------------------------------------------
revno: 3896
revision-id: john at arbash-meinel.com-20081211020207-rrgdcyqc344zo5q1
parent: john at arbash-meinel.com-20081211011419-vqtdjgpa04woqvm4
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: get_record_stream_chunked
timestamp: Wed 2008-12-10 20:02:07 -0600
message:
Change name to 'chunks_to_lines', and find an optimized form.
It is a little bit ugly, but it is faster than join & split, and means
we get to leave the strings untouched.
-------------- next part --------------
=== modified file 'bzrlib/osutils.py'
--- a/bzrlib/osutils.py 2008-12-11 01:10:38 +0000
+++ b/bzrlib/osutils.py 2008-12-11 02:02:07 +0000
@@ -820,7 +820,7 @@
return pathjoin(*p)
-def chunked_to_lines(chunks):
+def chunks_to_lines(chunks):
"""Ensure that chunks is split cleanly into lines.
Each entry in the result should contain a single newline at the end. Except
@@ -829,6 +829,28 @@
:param chunks: An iterable of strings
:return: A list of strings.
"""
+ # Optimize for a very common case when chunks are already lines
+ def fail():
+ raise IndexError
+ try:
+ # This is a bit ugly, but is the fastest way to check if all of the
+ # chunks are individual lines.
+ # You can't use function calls like .count(), .index(), or endswith()
+ # because they incur too much python overhead.
+ # It works because:
+ # - if chunk is an empty string, chunk[-1] raises IndexError, which is
+ # caught below;
+ # - if chunk doesn't end with '\n' then we hit fail();
+ # - if there is more than one '\n' then we hit fail().
+ # timing shows this loop to take 2.58ms rather than 3.18ms for
+ # split_lines(''.join(chunks))
+ # Further, it means we get to preserve the original lines, rather than
+ # expanding memory
+ [(chunk[-1] == '\n' and '\n' not in chunk[:-1]) or fail()
+ for chunk in chunks]
+ return chunks
+ except IndexError:
+ pass
return split_lines(''.join(chunks))
=== modified file 'bzrlib/tests/test_osutils.py'
--- a/bzrlib/tests/test_osutils.py 2008-12-11 01:14:19 +0000
+++ b/bzrlib/tests/test_osutils.py 2008-12-11 02:02:07 +0000
@@ -756,10 +756,10 @@
self.assertEndsWith(osutils._mac_getcwd(), u'B\xe5gfors')
-class TestChunkedToLines(TestCase):
+class TestChunksToLines(TestCase):
def assertChunksToLines(self, lines, chunks):
- self.assertEqual(lines, osutils.chunked_to_lines(chunks))
+ self.assertEqual(lines, osutils.chunks_to_lines(chunks))
def test_fulltext_chunk_to_lines(self):
self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz\n'],
@@ -778,6 +778,13 @@
def test_mixed(self):
self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
['foo\n', 'bar\r\nba\r', 'z'])
+ self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
+ ['foo\nb', 'a', 'r\r\nba\r', 'z'])
+ self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
+ ['foo\nbar\r\nba', '\r', 'z'])
+
+ self.assertChunksToLines(['foo\n', 'bar\r\n', 'ba\rz'],
+ ['foo\n', '', 'bar\r\nba', '\r', 'z'])
class TestSplitLines(TestCase):
More information about the bazaar-commits mailing list