Rev 38: Coming fresh to you, unoptimised and with code duplication, may I present search suggestions. (Robert Collins) in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Robert Collins robertc at robertcollins.net
Tue Jun 17 13:25:39 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 38
revision-id: robertc at robertcollins.net-20080617122537-eyv8tt7s1dki5dyw
parent: robertc at robertcollins.net-20080614094510-tow920w1dq381gz2
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Tue 2008-06-17 22:25:37 +1000
message:
  Coming fresh to you, unoptimised and with code duplication, may I present search suggestions. (Robert Collins)
modified:
  NEWS                           news-20080608052041-z5bahsl8kwl0uf4x-2
  commands.py                    commands.py-20080608052041-z5bahsl8kwl0uf4x-5
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/test_blackbox.py         test_blackbox.py-20080608052041-z5bahsl8kwl0uf4x-9
  tests/test_index.py            test_index.py-20080608055509-hnimeek7q8tctkqf-4
=== modified file 'NEWS'
--- a/NEWS	2008-06-14 09:45:10 +0000
+++ b/NEWS	2008-06-17 12:25:37 +0000
@@ -18,6 +18,8 @@
       (Robert Collins)
 
     * New command ``search`` to search a search index from within bzr.
+      When given -s this will perform a suggestion search, looking for
+      possible search terms starting with the current search.
       (Robert Collins)
 
   IMPROVEMENTS:

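For the user-visible behaviour, see the new blackbox test below: running
"bzr search -s tw" against a small two-commit branch is expected to print

    Suggestions: [('two',)]

i.e. suggestions come back as a sorted list of term tuples.
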
=== modified file 'commands.py'
--- a/commands.py	2008-06-14 09:45:10 +0000
+++ b/commands.py	2008-06-17 12:25:37 +0000
@@ -18,6 +18,7 @@
 """The command console user interface for bzr search."""
 
 import bzrlib.commands
+from bzrlib.option import Option
 from bzrlib.plugins.search import errors
 from bzrlib.plugins.search import index as _mod_index
 from bzrlib.transport import get_transport
@@ -48,21 +49,30 @@
     """
 
     _see_also = ['index']
+    takes_options = [Option('suggest', short_name='s',
+        help="Suggest possible terms to complete the search.")
+        ]
     takes_args = ['query+']
 
-    def run(self, query_list=[]):
+    def run(self, query_list=[], suggest=False):
         trans = get_transport('.')
         index = _mod_index.open_index_url(trans.base)
         # XXX: Have a query translator etc.
         query = [(query_item,) for query_item in query_list]
-        seen_count = 0
         index._branch.lock_read()
         try:
-            for result in index.search(query):
-                self.outf.write(result.document_name())
-                self.outf.write(" Summary: '%s'\n" % result.summary())
-                seen_count += 1
+            if suggest:
+                terms = index.suggest(query)
+                terms = list(terms)
+                terms.sort()
+                self.outf.write("Suggestions: %s\n" % terms)
+            else:
+                seen_count = 0
+                for result in index.search(query):
+                    self.outf.write(result.document_name())
+                    self.outf.write(" Summary: '%s'\n" % result.summary())
+                    seen_count += 1
+                if seen_count == 0:
+                    raise errors.NoMatch(query_list)
         finally:
             index._branch.unlock()
-        if seen_count == 0:
-            raise errors.NoMatch(query_list)

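A minimal sketch of the code path the new option takes, assuming a branch in
the current directory that already has a search index; this mirrors run()
above, and the query word 'tw' is illustrative only:

    from bzrlib.transport import get_transport
    from bzrlib.plugins.search import index as _mod_index

    trans = get_transport('.')
    idx = _mod_index.open_index_url(trans.base)
    # The command layer turns each query word into a one-element term tuple.
    query = [(word,) for word in ['tw']]
    idx._branch.lock_read()
    try:
        # suggest() returns the matching terms; the command sorts them and
        # prints them as "Suggestions: ...".
        suggestions = sorted(idx.suggest(query))
    finally:
        idx._branch.unlock()
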
=== modified file 'index.py'
--- a/index.py	2008-06-14 09:45:10 +0000
+++ b/index.py	2008-06-17 12:25:37 +0000
@@ -421,6 +421,14 @@
         self._revision_indices.remove(index.revision_index)
         del self._current_names[name]
 
+    def _search_work(self, termlist):
+        """Core worker logic for performing searches.
+        
+        :param termlist: An iterable of terms to search for.
+        :return: An iterator over search results in each component.
+        """
+        _ensure_regexes()
+
     def search(self, termlist):
         """Trivial set-based search of the index.
 
@@ -428,7 +436,7 @@
         :return: An iterator of SearchResults for documents indexed by all
             terms in the termlist.
         """
-        _ensure_regexes()
+        self._search_work(None)
         self._refresh_indices()
         found_documents = []
         # Use a set to remove duplicates
@@ -454,6 +462,7 @@
                 # One or more terms missing - no hits are possible.
                 continue
             # load the first document list: 
+            term_info.sort()
             _, term_id, posting_start, posting_length = term_info.pop(0)
             posting_stop = posting_start + posting_length
             post_name = "term_list." + term_id
@@ -492,6 +501,100 @@
                 else:
                     raise Exception("unknown doc type %r" % (doc_key,))
 
+    def suggest(self, termlist):
+        """Generate suggestions for extending a search.
+
+        :param termlist: A list of terms.
+        :return: A set of terms that start with the last search term in
+            termlist and match the rest of the search.
+        """
+        _ensure_regexes()
+        self._refresh_indices()
+        found_documents = []
+        if not termlist:
+            return
+        suggest_term = termlist[-1]
+        suggestions = set()
+        # Use a set to remove duplicates
+        termlist = set(termlist[:-1])
+        term_keys = [None, set(), set()]
+        for term in termlist:
+            term_keys[len(term)].add(term)
+
+        for value, component in self._current_names.values():
+            term_index = component.term_index
+            # TODO: push into Component
+            # TODO: use a dequeue?
+            term_info = []
+            for node in chain(term_index.iter_entries(term_keys[1]),
+                component.term_2_index.iter_entries(term_keys[2])):
+                term_id, posting_count, posting_start, posting_length = \
+                    node[2].split(" ")
+                term_info.append((int(posting_count), term_id,
+                    int(posting_start), int(posting_length)))
+            if not termlist:
+                # no terms to search for other than the suggestion:
+                found_documents = []
+            elif len(term_info) != len(termlist):
+                # One or more terms missing - no hits are possible.
+                continue
+            else:
+                # filter down to some specific document ids
+                # load the first document list: 
+                term_info.sort()
+                _, term_id, posting_start, posting_length = term_info.pop(0)
+                posting_stop = posting_start + posting_length
+                post_name = "term_list." + term_id
+                filemap = {post_name:(posting_start, posting_stop)}
+                view = FileView(self._indices_transport, component.name + '.pack',
+                    filemap)
+                post_index = GraphIndex(view, post_name, posting_length)
+                common_doc_keys = set([node[1] for node in
+                    post_index.iter_all_entries()])
+                # Now we whittle down the nodes we need - still going in sorted
+                # order. (possibly doing concurrent reduction would be better).
+                while common_doc_keys and term_info:
+                    _, term_id, posting_start, posting_length = term_info.pop(0)
+                    posting_stop = posting_start + posting_length
+                    post_name = "term_list." + term_id
+                    filemap = {post_name:(posting_start, posting_stop)}
+                    view = FileView(self._indices_transport,
+                        component.name + '.pack', filemap)
+                    post_index = GraphIndex(view, post_name, posting_length)
+                    common_doc_keys = set([node[1] for node in
+                        post_index.iter_entries(common_doc_keys)])
+                common_doc_ids = [key[0] for key in common_doc_keys]
+                found_documents = [(component, doc_id) for doc_id in
+                    common_doc_ids]
+            if len(suggest_term) == 1:
+                suggest_index = term_index
+            else:
+                suggest_index = component.term_2_index
+            for node in suggest_index.iter_entries_starts_with(suggest_term):
+                suggestion = node[1]
+                if found_documents:
+                    # Friction: why don't we keep them as keys
+                    common_doc_keys = [(doc[1],) for doc in found_documents]
+                    term_id, posting_count, posting_start, posting_length = \
+                        node[2].split(" ")
+                    posting_count = int(posting_count)
+                    posting_start = int(posting_start)
+                    posting_length = int(posting_length)
+                    posting_stop = posting_start + posting_length
+                    post_name = "term_list." + term_id
+                    filemap = {post_name:(posting_start, posting_stop)}
+                    view = FileView(self._indices_transport,
+                        component.name + '.pack', filemap)
+                    post_index = GraphIndex(view, post_name, posting_length)
+                    common_doc_keys = set([node[1] for node in
+                        post_index.iter_entries(common_doc_keys)])
+                    if len(common_doc_keys):
+                        # This suggestion matches other terms in the query:
+                        suggestions.add(suggestion)
+                else:
+                    suggestions.add(suggestion)
+        return suggestions
+
     def _terms_for_file_terms(self, repository, file_terms, order_dict):
         """Generate terms for the path of every file_id, revision_id in terms.
 
@@ -712,7 +815,7 @@
             }
         view = FileView(transport, name + '.pack', filemap)
         rev_index = GraphIndex(view, "revisions", lengths[1])
-        term_index = GraphIndex(view, "terms", lengths[3])
+        term_index = SuggestableGraphIndex(view, "terms", lengths[3])
         term_2_index = GraphIndex(view, "terms_2", lengths[7])
         doc_index = GraphIndex(view, "documents", lengths[5])
         self.revision_index = rev_index
@@ -1051,3 +1154,27 @@
         """Thunk for use by Index._add_index."""
         self.transport = upload_transport
         return self.combine()
+
+
+class SuggestableGraphIndex(GraphIndex):
+    """A subclass of GraphIndex which adds starts_with searches.
+
+    These searches are used for providing suggestions.
+    """
+
+    def iter_entries_starts_with(self, key):
+        """Iterate over nodes which match key.
+
+        The first len(key)-1 elements of key must match exactly, and the last
+        element of key is used as a starts_with test.
+
+        :param key: The key to search with.
+        """
+        # Make it work:
+        if len(key) > 1:
+            candidates = self.iter_entries_prefix([key[:-1] + (None,)])
+        else:
+            candidates = self.iter_all_entries()
+        for candidate in candidates:
+            if candidate[1][-1].startswith(key[-1]):
+                yield candidate

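The prefix matching in SuggestableGraphIndex.iter_entries_starts_with can be
read independently of the pack and GraphIndex machinery. A rough standalone
sketch of the same rule (the helper name and node layout are illustrative,
not part of the plugin API):

    def starts_with(nodes, key):
        """Yield nodes whose key matches ``key`` exactly on the first
        len(key) - 1 elements and by prefix on the last element.

        nodes is an iterable of (index, key_tuple, value) entries, the same
        shape that GraphIndex.iter_all_entries() produces.
        """
        for node in nodes:
            node_key = node[1]
            if (node_key[:-1] == key[:-1]
                and node_key[-1].startswith(key[-1])):
                yield node

Under this rule ('pref', 'pref') and ('pref', 'preferential') match the query
key ('pref', 'pref') while ('prep', 'pref') does not, which is exactly what
the new tests in tests/test_index.py below exercise.
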
=== modified file 'tests/test_blackbox.py'
--- a/tests/test_blackbox.py	2008-06-13 11:48:12 +0000
+++ b/tests/test_blackbox.py	2008-06-17 12:25:37 +0000
@@ -58,6 +58,24 @@
         self.assertEqual('', err)
         self.assertEqual("Revision id '%s'. Summary: 'this message'\n" % rev_id1, out)
 
+    def test_search_suggestions_works(self):
+        # Bare bones - no ui love as yet:
+        tree = self.make_branch_and_tree('.')
+        init_index(tree.branch)
+        rev_id1 = tree.commit('this message\nhas two lines')
+        rev_id2 = tree.commit('this message does not\n')
+        index_url(self.get_url('.'))
+        index = open_index_url(self.get_url('.'))
+        out, err = self.run_bzr(['search', '-s', 'tw'])
+        self.assertEqual('', err)
+        self.assertEqual("Suggestions: [('two',)]\n", out)
+        out, err = self.run_bzr(['search', '-s', 't'])
+        self.assertEqual('', err)
+        self.assertEqual("Suggestions: [('this',), ('two',)]\n", out)
+        out, err = self.run_bzr(['search', '-s', 'too'])
+        self.assertEqual('', err)
+        self.assertEqual("Suggestions: []\n", out)
+
 
 class TestIndex(TestCaseWithTransport):
 

=== modified file 'tests/test_index.py'
--- a/tests/test_index.py	2008-06-14 09:45:10 +0000
+++ b/tests/test_index.py	2008-06-17 12:25:37 +0000
@@ -226,6 +226,33 @@
         self.assertIsInstance(results[0], index.RevisionHit)
         self.assertEqual((revid,), results[0].revision_key)
 
+    def test_suggestions_trivial(self):
+        tree = self.make_branch_and_tree('tree')
+        rev_index = index.init_index(tree.branch)
+        revid = tree.commit('first')
+        rev_index.index_branch(tree.branch, revid)
+        # f matches
+        self.assertEqual([('first',)], list(rev_index.suggest([('f',)])))
+        self.assertEqual([('first',)], list(rev_index.suggest([('fi',)])))
+        self.assertEqual([('first',)], list(rev_index.suggest([('fir',)])))
+        self.assertEqual([('first',)], list(rev_index.suggest([('fir',)])))
+        self.assertEqual([('first',)], list(rev_index.suggest([('firs',)])))
+        self.assertEqual([('first',)], list(rev_index.suggest([('first',)])))
+        self.assertEqual([], list(rev_index.suggest([('firste',)])))
+
+    def test_suggestions_two_terms(self):
+        """With two terms only matching suggestions are made."""
+        tree = self.make_branch_and_tree('tree')
+        rev_index = index.init_index(tree.branch)
+        revid = tree.commit('term suggestion')
+        rev_index.index_branch(tree.branch, revid)
+        # suggesting ('term',), ('suggest',) matches suggestion,
+        # and suggestion ('missing',), ('suggest',) matches nothing.
+        self.assertEqual([('suggestion',)],
+            list(rev_index.suggest([('term',), ('suggest',)])))
+        self.assertEqual([],
+            list(rev_index.suggest([('missing',), ('suggest',)])))
+
 
 class TestResults(TestCaseWithTransport):
 
@@ -418,3 +445,79 @@
         revid1 = tree.commit('foo')
         self.assertEqual(set([(revid1,)]),
             set(search_index.indexed_revisions()))
+
+
+class TestGraphIndexSuggestions(TestCaseWithTransport):
+    """Tests for the SuggestableGraphIndex subclass."""
+
+    def test_key_length_1_no_hits(self):
+        builder = index.InMemoryGraphIndex(0, 1)
+        # We want nodes before and after the suggestions, to check boundaries.
+        builder.add_node(('pre',), '', ())
+        builder.add_node(('prep',), '', ())
+        transport = self.get_transport()
+        length = transport.put_file('index', builder.finish())
+        query_index = index.SuggestableGraphIndex(transport, 'index', length)
+        # Now, searching for suggestions for 'pref' should find nothing.
+        self.assertEqual([],
+            list(query_index.iter_entries_starts_with(('pref',))))
+
+    def test_key_length_1_iteration(self):
+        builder = index.InMemoryGraphIndex(0, 1)
+        # We want nodes before and after the suggestions, to check boundaries.
+        builder.add_node(('pre',), '', ())
+        builder.add_node(('prep',), '', ())
+        # We want some entries to find.
+        builder.add_node(('pref',), '', ())
+        builder.add_node(('preferential',), '', ())
+        transport = self.get_transport()
+        length = transport.put_file('index', builder.finish())
+        query_index = index.SuggestableGraphIndex(transport, 'index', length)
+        # Now, searching for suggestions for 'pref' should find 'pref' and
+        # 'preferential'.
+        self.assertEqual([
+            (query_index, ('pref',), ''),
+            (query_index, ('preferential',), ''),
+            ],
+            list(query_index.iter_entries_starts_with(('pref',))))
+
+    def test_key_length_2_no_hits(self):
+        builder = index.InMemoryGraphIndex(0, 2)
+        # We want nodes before and after the suggestions, to check boundaries.
+        # As there are two elements in each key, we want to check this for each
+        # element, which implies 4 boundaries:
+        builder.add_node(('pre', 'pref'), '', ())
+        builder.add_node(('pref', 'pre'), '', ())
+        builder.add_node(('pref', 'prep'), '', ())
+        builder.add_node(('prep', 'pref'), '', ())
+        transport = self.get_transport()
+        length = transport.put_file('index', builder.finish())
+        query_index = index.SuggestableGraphIndex(transport, 'index', length)
+        # Now, searching for suggestions for 'pref', 'pref' should find
+        # nothing.
+        self.assertEqual([],
+            list(query_index.iter_entries_starts_with(('pref', 'pref'))))
+
+    def test_key_length_2_iteration(self):
+        builder = index.InMemoryGraphIndex(0, 2)
+        # We want nodes before and after the suggestions, to check boundaries.
+        # - the first element of the key must be an exact match, the second is
+        # a startswith match, so provide non-match entries that match the second
+        # in case of bugs there.
+        builder.add_node(('pre', 'pref'), '', ())
+        builder.add_node(('pref', 'pre'), '', ())
+        builder.add_node(('pref', 'prep'), '', ())
+        builder.add_node(('prep', 'pref'), '', ())
+        # We want some entries to find.
+        builder.add_node(('pref', 'pref'), '', ())
+        builder.add_node(('pref', 'preferential'), '', ())
+        transport = self.get_transport()
+        length = transport.put_file('index', builder.finish())
+        query_index = index.SuggestableGraphIndex(transport, 'index', length)
+        # Now, searching for suggestions for 'pref' should find 'pref' and
+        # 'preferential'.
+        self.assertEqual([
+            (query_index, ('pref', 'pref'), ''),
+            (query_index, ('pref', 'preferential'), ''),
+            ],
+            list(query_index.iter_entries_starts_with(('pref', 'pref'))))



