Rev 38: Coming fresh to you, unoptimised and with code duplication, may I present search suggestions. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Tue Jun 17 13:25:39 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 38
revision-id: robertc at robertcollins.net-20080617122537-eyv8tt7s1dki5dyw
parent: robertc at robertcollins.net-20080614094510-tow920w1dq381gz2
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2008-06-17 22:25:37 +1000
message:
Coming fresh to you, unoptimised and with code duplication, may I present search suggestions. (Robert Collins)
modified:
NEWS news-20080608052041-z5bahsl8kwl0uf4x-2
commands.py commands.py-20080608052041-z5bahsl8kwl0uf4x-5
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/test_blackbox.py test_blackbox.py-20080608052041-z5bahsl8kwl0uf4x-9
tests/test_index.py test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'NEWS'
--- a/NEWS 2008-06-14 09:45:10 +0000
+++ b/NEWS 2008-06-17 12:25:37 +0000
@@ -18,6 +18,8 @@
(Robert Collins)
* New command ``search`` to search a search index from within bzr.
+ When given -s this will perform a suggestion search, looking for
+ possible search terms starting with the current search.
(Robert Collins)
IMPROVEMENTS:
=== modified file 'commands.py'
--- a/commands.py 2008-06-14 09:45:10 +0000
+++ b/commands.py 2008-06-17 12:25:37 +0000
@@ -18,6 +18,7 @@
"""The command console user interface for bzr search."""
import bzrlib.commands
+from bzrlib.option import Option
from bzrlib.plugins.search import errors
from bzrlib.plugins.search import index as _mod_index
from bzrlib.transport import get_transport
@@ -48,21 +49,30 @@
"""
_see_also = ['index']
+ takes_options = [Option('suggest', short_name='s',
+ help="Suggest possible terms to complete the search.")
+ ]
takes_args = ['query+']
- def run(self, query_list=[]):
+ def run(self, query_list=[], suggest=False):
trans = get_transport('.')
index = _mod_index.open_index_url(trans.base)
# XXX: Have a query translator etc.
query = [(query_item,) for query_item in query_list]
- seen_count = 0
index._branch.lock_read()
try:
- for result in index.search(query):
- self.outf.write(result.document_name())
- self.outf.write(" Summary: '%s'\n" % result.summary())
- seen_count += 1
+ if suggest:
+ terms = index.suggest(query)
+ terms = list(terms)
+ terms.sort()
+ self.outf.write("Suggestions: %s\n" % terms)
+ else:
+ seen_count = 0
+ for result in index.search(query):
+ self.outf.write(result.document_name())
+ self.outf.write(" Summary: '%s'\n" % result.summary())
+ seen_count += 1
+ if seen_count == 0:
+ raise errors.NoMatch(query_list)
finally:
index._branch.unlock()
- if seen_count == 0:
- raise errors.NoMatch(query_list)
=== modified file 'index.py'
--- a/index.py 2008-06-14 09:45:10 +0000
+++ b/index.py 2008-06-17 12:25:37 +0000
@@ -421,6 +421,14 @@
self._revision_indices.remove(index.revision_index)
del self._current_names[name]
+ def _search_work(self, termlist):
+ """Core worker logic for performing searches.
+
+ :param termlist: An iterable of terms to search for.
+ :return: An iterator over search results in each component.
+ """
+ _ensure_regexes()
+
def search(self, termlist):
"""Trivial set-based search of the index.
@@ -428,7 +436,7 @@
:return: An iterator of SearchResults for documents indexed by all
terms in the termlist.
"""
- _ensure_regexes()
+ self._search_work(None)
self._refresh_indices()
found_documents = []
# Use a set to remove duplicates
@@ -454,6 +462,7 @@
# One or more terms missing - no hits are possible.
continue
# load the first document list:
+ term_info.sort()
_, term_id, posting_start, posting_length = term_info.pop(0)
posting_stop = posting_start + posting_length
post_name = "term_list." + term_id
@@ -492,6 +501,100 @@
else:
raise Exception("unknown doc type %r" % (doc_key,))
+ def suggest(self, termlist):
+ """Generate suggestions for extending a search.
+
+ :param termlist: A list of terms.
+ :return: An iterator of terms that start with the last search term in
+ termlist, and match the rest of the search.
+ """
+ _ensure_regexes()
+ self._refresh_indices()
+ found_documents = []
+ if not termlist:
+ return
+ suggest_term = termlist[-1]
+ suggestions = set()
+ # Use a set to remove duplicates
+ termlist = set(termlist[:-1])
+ term_keys = [None, set(), set()]
+ for term in termlist:
+ term_keys[len(term)].add(term)
+
+ for value, component in self._current_names.values():
+ term_index = component.term_index
+ # TODO: push into Component
+ # TODO: use a dequeue?
+ term_info = []
+ for node in chain(term_index.iter_entries(term_keys[1]),
+ component.term_2_index.iter_entries(term_keys[2])):
+ term_id, posting_count, posting_start, posting_length = \
+ node[2].split(" ")
+ term_info.append((int(posting_count), term_id,
+ int(posting_start), int(posting_length)))
+ if not termlist:
+ # no terms to search for other than the suggestion:
+ found_documents = []
+ elif len(term_info) != len(termlist):
+ # One or more terms missing - no hits are possible.
+ continue
+ else:
+ # filter down to some specific document ids
+ # load the first document list:
+ term_info.sort()
+ _, term_id, posting_start, posting_length = term_info.pop(0)
+ posting_stop = posting_start + posting_length
+ post_name = "term_list." + term_id
+ filemap = {post_name:(posting_start, posting_stop)}
+ view = FileView(self._indices_transport, component.name + '.pack',
+ filemap)
+ post_index = GraphIndex(view, post_name, posting_length)
+ common_doc_keys = set([node[1] for node in
+ post_index.iter_all_entries()])
+ # Now we whittle down the nodes we need - still going in sorted
+ # order. (possibly doing concurrent reduction would be better).
+ while common_doc_keys and term_info:
+ _, term_id, posting_start, posting_length = term_info.pop(0)
+ posting_stop = posting_start + posting_length
+ post_name = "term_list." + term_id
+ filemap = {post_name:(posting_start, posting_stop)}
+ view = FileView(self._indices_transport,
+ component.name + '.pack', filemap)
+ post_index = GraphIndex(view, post_name, posting_length)
+ common_doc_keys = set([node[1] for node in
+ post_index.iter_entries(common_doc_keys)])
+ common_doc_ids = [key[0] for key in common_doc_keys]
+ found_documents = [(component, doc_id) for doc_id in
+ common_doc_ids]
+ if len(suggest_term) == 1:
+ suggest_index = term_index
+ else:
+ suggest_index = component.term_2_index
+ for node in suggest_index.iter_entries_starts_with(suggest_term):
+ suggestion = node[1]
+ if found_documents:
+ # Friction: why don't we keep them as keys
+ common_doc_keys = [(doc[1],) for doc in found_documents]
+ term_id, posting_count, posting_start, posting_length = \
+ node[2].split(" ")
+ posting_count = int(posting_count)
+ posting_start = int(posting_start)
+ posting_length = int(posting_length)
+ posting_stop = posting_start + posting_length
+ post_name = "term_list." + term_id
+ filemap = {post_name:(posting_start, posting_stop)}
+ view = FileView(self._indices_transport,
+ component.name + '.pack', filemap)
+ post_index = GraphIndex(view, post_name, posting_length)
+ common_doc_keys = set([node[1] for node in
+ post_index.iter_entries(common_doc_keys)])
+ if len(common_doc_keys):
+ # This suggestion matches other terms in the qury:
+ suggestions.add(suggestion)
+ else:
+ suggestions.add(suggestion)
+ return suggestions
+
def _terms_for_file_terms(self, repository, file_terms, order_dict):
"""Generate terms for the path of every file_id, revision_id in terms.
@@ -712,7 +815,7 @@
}
view = FileView(transport, name + '.pack', filemap)
rev_index = GraphIndex(view, "revisions", lengths[1])
- term_index = GraphIndex(view, "terms", lengths[3])
+ term_index = SuggestableGraphIndex(view, "terms", lengths[3])
term_2_index = GraphIndex(view, "terms_2", lengths[7])
doc_index = GraphIndex(view, "documents", lengths[5])
self.revision_index = rev_index
@@ -1051,3 +1154,27 @@
"""Thunk for use by Index._add_index."""
self.transport = upload_transport
return self.combine()
+
+
+class SuggestableGraphIndex(GraphIndex):
+ """A subclass of GraphIndex which adds starts_with searches.
+
+ These searches are used for providing suggestions.
+ """
+
+ def iter_entries_starts_with(self, key):
+ """Iterate over nodes which match key.
+
+ The first length()-1 terms in key must match exactly, and the last term
+ in key is used as a starts_with test.
+
+ :param key: The key to search with.
+ """
+ # Make it work:
+ if len(key) > 1:
+ candidates = self.iter_entries_prefix([key[:-1] + (None,)])
+ else:
+ candidates = self.iter_all_entries()
+ for candidate in candidates:
+ if candidate[1][-1].startswith(key[-1]):
+ yield candidate
=== modified file 'tests/test_blackbox.py'
--- a/tests/test_blackbox.py 2008-06-13 11:48:12 +0000
+++ b/tests/test_blackbox.py 2008-06-17 12:25:37 +0000
@@ -58,6 +58,24 @@
self.assertEqual('', err)
self.assertEqual("Revision id '%s'. Summary: 'this message'\n" % rev_id1, out)
+ def test_search_suggestions_works(self):
+ # Bare bones - no ui love as yet:
+ tree = self.make_branch_and_tree('.')
+ init_index(tree.branch)
+ rev_id1 = tree.commit('this message\nhas two lines')
+ rev_id2 = tree.commit('this message does not\n')
+ index_url(self.get_url('.'))
+ index = open_index_url(self.get_url('.'))
+ out, err = self.run_bzr(['search', '-s', 'tw'])
+ self.assertEqual('', err)
+ self.assertEqual("Suggestions: [('two',)]\n", out)
+ out, err = self.run_bzr(['search', '-s', 't'])
+ self.assertEqual('', err)
+ self.assertEqual("Suggestions: [('this',), ('two',)]\n", out)
+ out, err = self.run_bzr(['search', '-s', 'too'])
+ self.assertEqual('', err)
+ self.assertEqual("Suggestions: []\n", out)
+
class TestIndex(TestCaseWithTransport):
=== modified file 'tests/test_index.py'
--- a/tests/test_index.py 2008-06-14 09:45:10 +0000
+++ b/tests/test_index.py 2008-06-17 12:25:37 +0000
@@ -226,6 +226,33 @@
self.assertIsInstance(results[0], index.RevisionHit)
self.assertEqual((revid,), results[0].revision_key)
+ def test_suggestions_trivial(self):
+ tree = self.make_branch_and_tree('tree')
+ rev_index = index.init_index(tree.branch)
+ revid = tree.commit('first')
+ rev_index.index_branch(tree.branch, revid)
+ # f matches
+ self.assertEqual([('first',)], list(rev_index.suggest([('f',)])))
+ self.assertEqual([('first',)], list(rev_index.suggest([('fi',)])))
+ self.assertEqual([('first',)], list(rev_index.suggest([('fir',)])))
+ self.assertEqual([('first',)], list(rev_index.suggest([('fir',)])))
+ self.assertEqual([('first',)], list(rev_index.suggest([('firs',)])))
+ self.assertEqual([('first',)], list(rev_index.suggest([('first',)])))
+ self.assertEqual([], list(rev_index.suggest([('firste',)])))
+
+ def test_suggestions_two_terms(self):
+ """With two terms only matching suggestions are made."""
+ tree = self.make_branch_and_tree('tree')
+ rev_index = index.init_index(tree.branch)
+ revid = tree.commit('term suggestion')
+ rev_index.index_branch(tree.branch, revid)
+ # suggesting ('term',), ('suggest',) matches suggestion,
+ # and suggestion ('missing',), ('suggest',) matches nothing.
+ self.assertEqual([('suggestion',)],
+ list(rev_index.suggest([('term',), ('suggest',)])))
+ self.assertEqual([],
+ list(rev_index.suggest([('missing',), ('suggest',)])))
+
class TestResults(TestCaseWithTransport):
@@ -418,3 +445,79 @@
revid1 = tree.commit('foo')
self.assertEqual(set([(revid1,)]),
set(search_index.indexed_revisions()))
+
+
+class TestGraphIndexSuggestions(TestCaseWithTransport):
+ """Tests for the SuggestableGraphIndex subclass."""
+
+ def test_key_length_1_no_hits(self):
+ builder = index.InMemoryGraphIndex(0, 1)
+ # We want nodes before and after the suggestions, to check boundaries.
+ builder.add_node(('pre',), '', ())
+ builder.add_node(('prep',), '', ())
+ transport = self.get_transport()
+ length = transport.put_file('index', builder.finish())
+ query_index = index.SuggestableGraphIndex(transport, 'index', length)
+ # Now, searching for suggestions for 'pref' should find nothing.
+ self.assertEqual([],
+ list(query_index.iter_entries_starts_with(('pref',))))
+
+ def test_key_length_1_iteration(self):
+ builder = index.InMemoryGraphIndex(0, 1)
+ # We want nodes before and after the suggestions, to check boundaries.
+ builder.add_node(('pre',), '', ())
+ builder.add_node(('prep',), '', ())
+ # We want some entries to find.
+ builder.add_node(('pref',), '', ())
+ builder.add_node(('preferential',), '', ())
+ transport = self.get_transport()
+ length = transport.put_file('index', builder.finish())
+ query_index = index.SuggestableGraphIndex(transport, 'index', length)
+ # Now, searching for suggestions for 'pref' should find 'pref' and
+ # 'preferential'.
+ self.assertEqual([
+ (query_index, ('pref',), ''),
+ (query_index, ('preferential',), ''),
+ ],
+ list(query_index.iter_entries_starts_with(('pref',))))
+
+ def test_key_length_2_no_hits(self):
+ builder = index.InMemoryGraphIndex(0, 2)
+ # We want nodes before and after the suggestions, to check boundaries.
+ # As there are two elements in each key, we want to check this for each
+ # element, which implies 4 boundaries:
+ builder.add_node(('pre', 'pref'), '', ())
+ builder.add_node(('pref', 'pre'), '', ())
+ builder.add_node(('pref', 'prep'), '', ())
+ builder.add_node(('prep', 'pref'), '', ())
+ transport = self.get_transport()
+ length = transport.put_file('index', builder.finish())
+ query_index = index.SuggestableGraphIndex(transport, 'index', length)
+ # Now, searching for suggestions for 'pref', 'pref' should find
+ # nothing.
+ self.assertEqual([],
+ list(query_index.iter_entries_starts_with(('pref', 'pref'))))
+
+ def test_key_length_2_iteration(self):
+ builder = index.InMemoryGraphIndex(0, 2)
+ # We want nodes before and after the suggestions, to check boundaries.
+ # - the first element of the key must be an exact match, the second is
+ # a startswith match, so provide non-match entries that match the second
+ # in case of bugs there.
+ builder.add_node(('pre', 'pref'), '', ())
+ builder.add_node(('pref', 'pre'), '', ())
+ builder.add_node(('pref', 'prep'), '', ())
+ builder.add_node(('prep', 'pref'), '', ())
+ # We want some entries to find.
+ builder.add_node(('pref', 'pref'), '', ())
+ builder.add_node(('pref', 'preferential'), '', ())
+ transport = self.get_transport()
+ length = transport.put_file('index', builder.finish())
+ query_index = index.SuggestableGraphIndex(transport, 'index', length)
+ # Now, searching for suggestions for 'pref' should find 'pref' and
+ # 'preferential'.
+ self.assertEqual([
+ (query_index, ('pref', 'pref'), ''),
+ (query_index, ('pref', 'preferential'), ''),
+ ],
+ list(query_index.iter_entries_starts_with(('pref', 'pref'))))
More information about the bazaar-commits
mailing list