Rev 53: Add log acceleration. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Wed Aug 27 09:37:32 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 53
revision-id: robertc at robertcollins.net-20080827083730-pcxlpnwaga8y67lt
parent: robertc at robertcollins.net-20080827075837-8ahm43e8ifrftqht
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Wed 2008-08-27 18:37:30 +1000
message:
Add log acceleration.
modified:
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
__init__.py __init__.py-20080608052041-z5bahsl8kwl0uf4x-4
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
setup.py setup.py-20080608052041-z5bahsl8kwl0uf4x-6
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'NEWS'
--- a/NEWS 2008-08-27 07:58:37 +0000
+++ b/NEWS 2008-08-27 08:37:30 +0000
@@ -4,6 +4,35 @@
.. contents::
+IN DEVELOPMENT
+--------------
+
+ NOTES WHEN UPGRADING:
+
+ CHANGES:
+
+ * Requires bzr.dev with the log filtering patch for full functionality.
+ users not using bzr 1.6b3 or later. (Robert Collins)
+
+ FEATURES:
+
+ * ``bzr log -m`` will now use a search index for message searches that
+ can be converted into a bzr search. While not yet optimised it is
+ about three times faster than normal log with a message regex.
+ Currently only ``\bword\b`` searches can be accelerated this way -
+ which is a word matching regex search. (Robert Collins)
+
+ IMPROVEMENTS:
+
+ BUGFIXES:
+
+ API BREAKS:
+
+ TESTING:
+
+ INTERNALS:
+
+
1.6
---
=== modified file '__init__.py'
--- a/__init__.py 2008-08-27 07:58:37 +0000
+++ b/__init__.py 2008-08-27 08:37:30 +0000
@@ -34,8 +34,10 @@
# Relative because at __init__ time the module does not exist.
from bzrlib.branch import Branch
+from bzrlib import log
import commands
import errors
+import index
for command in [
@@ -45,7 +47,7 @@
bzrlib.commands.register_command(getattr(commands, 'cmd_' + command))
-version_info = (1, 6, 0, 'final', 0)
+version_info = (1, 7, 0, 'dev', 0)
def auto_index_branch(result):
@@ -65,6 +67,17 @@
_install_hooks()
+if getattr(log, 'log_adapters', None):
+ # disable the regex search when bzr-search is active
+ index._original_make_search_filter = log.make_search_filter
+ log.log_adapters.insert(log.log_adapters.index(log.make_search_filter),
+ index.make_disable_search_filter)
+ log.log_adapters.remove(index._original_make_search_filter)
+ log.make_search_filter = index.make_disable_search_filter
+ # provide bzr-search based searches
+ log.log_adapters.insert(log.log_adapters.index(log.make_revision_objects),
+ index.make_log_search_filter)
+
def test_suite():
# Thunk across to load_tests for niceness with older bzr versions
=== modified file 'index.py'
--- a/index.py 2008-08-01 01:42:30 +0000
+++ b/index.py 2008-08-27 08:37:30 +0000
@@ -1292,3 +1292,91 @@
else:
value = self._bisect_nodes[candidate_key]
yield (self, candidate_key, value)
+
+_original_make_search_filter = None
+
+
+def make_disable_search_filter(branch, generate_delta, search, log_rev_iterator):
+ """Disable search filtering if bzr-search will be active.
+
+ This filter replaces the default search filter, using the original filter
+ if a bzr-search filter cannot be used.
+
+ :param branch: The branch being logged.
+ :param generate_delta: Whether to generate a delta for each revision.
+ :param search: A user text search string.
+ :param log_rev_iterator: An input iterator containing all revisions that
+ could be displayed, in lists.
+ :return: An iterator over ((rev_id, revno, merge_depth), rev, delta).
+ """
+ try:
+ open_index_branch(branch)
+ query = query_from_regex(search)
+ if query:
+ return log_rev_iterator
+ except errors.NoSearchIndex:
+ pass
+ return _original_make_search_filter(branch, generate_delta, search,
+ log_rev_iterator)
+
+
+def make_log_search_filter(branch, generate_delta, search, log_rev_iterator):
+ """Filter revisions by using a search index.
+
+ This filter looks up revids in the search index along with the search
+ string, if the search string regex can be converted into a bzr-search
+ query.
+
+ :param branch: The branch being logged.
+ :param generate_delta: Whether to generate a delta for each revision.
+ :param search: A user text search string.
+ :param log_rev_iterator: An input iterator containing all revisions that
+ could be displayed, in lists.
+ :return: An iterator over ((rev_id, revno, merge_depth), rev, delta).
+ """
+ # Can we possibly search on this regex?
+ query = query_from_regex(search)
+ if not query:
+ return log_rev_iterator
+ try:
+ index = open_index_branch(branch)
+ except errors.NoSearchIndex:
+ return log_rev_iterator
+ return _filter_log(index, query, log_rev_iterator)
+
+
+def _filter_log(index, query, log_rev_iterator):
+ """Filter log_rev_iterator's revision ids on query in index."""
+ rev_ids = set()
+ # TODO: we could lazy evaluate the search, for each revision we see - this
+ # would allow searches that hit everything to be less-than-completely
+ # evaluated before the first result is shown. OTOH knowing a miss will
+ # require reading the entire search anyhow. Note that we can do better -
+ # if we looked up the document id of the revision, we could search explicitly
+ # for the document id in the search up front, and do many small searches. This is
+ # likely better in terms of memory use. Needs refactoring etc.
+ for result in index.search(query):
+ if type(result) != RevisionHit:
+ continue
+ rev_ids.add(result.revision_key[0])
+ for batch in log_rev_iterator:
+ new_revs = []
+ for item in batch:
+ if item[0][0] in rev_ids:
+ new_revs.append(item)
+ yield new_revs
+
+
+def query_from_regex(regex):
+ """Convert a regex into a bzr-search query."""
+ # Most trivial implementation ever
+ if regex.count("\\b") != 2:
+ return None
+ regex = regex[2:-2]
+ if regex.count("\\b") != 0:
+ return None
+ # Any additional whitespace implies something we can't search on:
+ _ensure_regexes()
+ if _tokeniser_re.search(regex):
+ return None
+ return [(regex,)]
=== modified file 'setup.py'
--- a/setup.py 2008-08-27 07:58:37 +0000
+++ b/setup.py 2008-08-27 08:37:30 +0000
@@ -3,13 +3,13 @@
bzr_plugin_name = 'search'
-bzr_plugin_version = (1, 6, 0, 'final', 0)
+bzr_plugin_version = (1, 7, 0, 'dev', 0)
bzr_commands = ['index', 'search']
bzr_minimum_version = (1, 6, 0)
if __name__ == '__main__':
setup(name="bzr search",
- version="1.6.0final0",
+ version="1.7.0dev0",
description="bzr search plugin.",
author="Robert Collins",
author_email="bazaar at lists.canonical.com",
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-08-01 01:35:31 +0000
+++ b/tests/test_index.py 2008-08-27 08:37:30 +0000
@@ -19,6 +19,7 @@
from bzrlib.errors import NotBranchError, UnknownFormatError
from bzrlib.index import GraphIndex
+from bzrlib import log
from bzrlib.plugins import search
from bzrlib.plugins.search import errors, index
from bzrlib.tests import TestCaseWithTransport
@@ -607,3 +608,71 @@
(query_index, ('pref', 'preferential'), ''),
],
sorted(query_index.iter_entries_starts_with(('pref', 'pref'))))
+
+
+class TestLogFilter(TestCaseWithTransport):
+
+ def test_registered(self):
+ self.assertTrue(index.make_disable_search_filter in log.log_adapters)
+ self.assertTrue(index.make_log_search_filter in log.log_adapters)
+ self.assertFalse(index._original_make_search_filter in log.log_adapters)
+
+ def test_get_filter_no_index(self):
+ tree = self.make_branch_and_tree('foo')
+ base_iterator = 'base'
+ # bzr-search won't kick in
+ self.assertEqual(base_iterator, index.make_log_search_filter(
+ tree.branch, False, "\\bword\\b", base_iterator))
+ # so the disabling wrapper must.
+ self.assertNotEqual(base_iterator, index.make_disable_search_filter(
+ tree.branch, False, "\\bword\\b", base_iterator))
+
+ def test_get_filter_too_complex(self):
+ """A too complex regex becomes a baseline search."""
+ # We test this by searching for something that a index search would
+ # miss but a regex search finds
+ tree = self.make_branch_and_tree('foo')
+ revid = tree.commit('first post')
+ index.index_url(self.get_url('foo'))
+ rev = tree.branch.repository.get_revision(revid)
+ input_iterator = [[((revid, 1, 0), rev, None)]]
+ rev_log_iterator = index.make_disable_search_filter(
+ tree.branch, False, "st po", input_iterator)
+ self.assertNotEqual(input_iterator, rev_log_iterator)
+ # everything matches
+ self.assertEqual(input_iterator, list(rev_log_iterator))
+ # bzr-search won't kick in
+ self.assertEqual(input_iterator, index.make_log_search_filter(
+ tree.branch, False, "st po", input_iterator))
+
+ def test_get_filter_searchable_regex(self):
+ """A parsable regex becomes a index search."""
+ # We test this by searching for something that a index search would
+ # miss hit, and crippling the baseline search reference.
+ self.saved_orig = index._original_make_search_filter
+ def restore():
+ index._original_make_search_filter = self.saved_orig
+ self.addCleanup(restore)
+ index._original_make_search_filter = None
+ tree = self.make_branch_and_tree('foo')
+ revid = tree.commit('first post')
+ revid2 = tree.commit('second post')
+ index.index_url(self.get_url('foo'))
+ input_iterator = [
+ [((revid2, 2, 0), None, None), ((revid, 1, 0), None, None)]]
+ # the disabled filter must not kick in
+ self.assertEqual(input_iterator, index.make_disable_search_filter(
+ tree.branch, False, "\\bfirst\\b", input_iterator))
+ # we must get a functional search from the log search filter.
+ rev_log_iterator = index.make_log_search_filter(
+ tree.branch, False, "\\bfirst\\b", input_iterator)
+ self.assertNotEqual(input_iterator, rev_log_iterator)
+ # rev id 2 should be filtered out.
+ expected_result = [[((revid, 1, 0), None, None)]]
+ self.assertEqual(expected_result, list(rev_log_iterator))
+
+ def test_query_from_regex(self):
+ self.assertEqual(None, index.query_from_regex("foo"))
+ self.assertEqual(None, index.query_from_regex("fo o"))
+ self.assertEqual(None, index.query_from_regex("\\bfoo \\b"))
+ self.assertEqual([("foo",)], index.query_from_regex("\\bfoo\\b"))
More information about the bazaar-commits
mailing list