Rev 34: Improve indexing performance of file paths five-fold by a custom extraction module for handling the inventory xml. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

Sat Jun 14 06:07:56 BST 2008

At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk

------------------------------------------------------------
revno: 34
revision-id: robertc at robertcollins.net-20080614050754-z5us1ebmdqbbegg7
parent: robertc at robertcollins.net-20080614014142-qrfj2rgsqeqjdxv5
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-14 15:07:54 +1000
message:
  Improve indexing performance of file paths five-fold by a custom extraction module for handling the inventory xml.
added:
  inventory.py                   inventory.py-20080614043027-u57i4xjlc5ginuft-1
  tests/test_inventory.py        test_inventory.py-20080614043027-u57i4xjlc5ginuft-2
modified:
  index.py                       index.py-20080608055509-hnimeek7q8tctkqf-2
  tests/__init__.py              __init__.py-20080608052041-z5bahsl8kwl0uf4x-8
=== modified file 'index.py'

--- a/index.py	2008-06-14 01:41:42 +0000
+++ b/index.py	2008-06-14 05:07:54 +0000
@@ -29,6 +29,7 @@
 from bzrlib.lockdir import LockDir
 from bzrlib.pack import ContainerWriter
 from bzrlib.plugins.search import errors
+from bzrlib.plugins.search.inventory import paths_from_ids
 from bzrlib.plugins.search.transport import FileView
 from bzrlib.revision import NULL_REVISION
 from bzrlib.tsort import topo_sort
@@ -497,18 +498,22 @@
         group_size = 100
         for offset in range(len(order) / group_size + 1):
             inventory_group = order[offset * group_size:(offset + 1) * group_size]
-            #texts = repository.get_inventory_weave().get_texts(inventory_group)
-            #for text in inventory_group:
-            #    pass
-            #            for text, revision_id in zip(texts, revision_ids):
+            lines_list = repository.get_inventory_weave().get_line_list(inventory_group)
+            serializer = repository._serializer
             # For VersionedFiles:
             # for xml in repository._iter_inventory_xmls(inventory_group):
             #     pass
-            for inventory in repository.iter_inventories(inventory_group):
-                revision_id = inventory.revision_id
-                for file_id in revision_ids[revision_id]:
-                    path = inventory.id2path(file_id)
+            for lines, revision_id in zip(lines_list, inventory_group):
+                path_dict = paths_from_ids(lines, serializer,
+                    revision_ids[revision_id])
+                for file_id, path in path_dict.iteritems():
                     terms[(file_id, revision_id)] = [('p', '', path)]
+            # Public api way - 5+ times slower:
+            # for inventory in repository.iter_inventories(inventory_group):
+            # #    revision_id = inventory.revision_id
+            #    for file_id in revision_ids[revision_id]:
+            #        path = inventory.id2path(file_id)
+            #        terms[(file_id, revision_id)] = [('p', '', path)]
         return terms.iteritems()
 
     def _terms_for_revs(self, repository, revision_ids):

=== added file 'inventory.py'
--- a/inventory.py	1970-01-01 00:00:00 +0000
+++ b/inventory.py	2008-06-14 05:07:54 +0000
@@ -0,0 +1,109 @@
+# search, a bzr plugin for searching within bzr branches/repositories.
+# Copyright (C) 2008 Robert Collins
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+# 
+
+"""Inventory related helpers for indexing."""
+
+
+from bzrlib import lazy_regex
+
+_file_ids_name_regex = lazy_regex.lazy_compile(
+        r'file_id="(?P<file_id>[^"]+)"'
+        r'.* name="(?P<name>[^"]*)"'
+        r'(?:.* parent_id="(?P<parent_id>[^"]+)")?'
+        )
+
+def paths_from_ids(xml_inventory, serializer, file_ids):
+    """Extract the paths for some file_ids from xml_inventory."""
+    if not serializer.support_altered_by_hack:
+        raise ValueError("Cannot process with serializer %r" % serializer)
+    search = _file_ids_name_regex.search
+    unresolved_ids = set(file_ids)
+    # TODO: only examine lines we need to, break early, track unprocessed
+    found_ids = {}
+    id_paths = {}
+    result = {}
+    for line in xml_inventory:
+        match = search(line)
+        if match is None:
+            continue
+        file_id, name, parent_id = match.group('file_id', 'name', 'parent_id')
+        found_ids[file_id] = (name, parent_id)
+        if parent_id is None:
+            # no parent, stash its name now to avoid special casing
+            # later.
+            id_paths[file_id] = name
+            if file_id in unresolved_ids:
+                result[file_id] = name
+    needed_ids = set(unresolved_ids)
+    while needed_ids:
+        # ---
+        # lookup_ids_here
+        # ---
+        missing_ids = set()
+        for file_id in needed_ids:
+            name, parent_id = found_ids.get(file_id, (None, None))
+            if name is None:
+                # Unresolved id itself
+                missing_ids.add(file_id)
+            else:
+                # We have resolved it, do we have its parent
+                if parent_id is not None and parent_id not in found_ids:
+                    # No, search for it
+                    missing_ids.add(parent_id)
+        if missing_ids == needed_ids:
+            # We didn't find anything on this pass
+            raise Exception("Did not find ids %s" % missing_ids)
+        needed_ids = missing_ids
+    # We have looked up the path-to-root for all asked ids,
+    # now to resolve it
+    while unresolved_ids:
+        wanted_file_id = unresolved_ids.pop()
+        path = id_paths.get(wanted_file_id)
+        if path is not None:
+            result[wanted_file_id] = path
+            continue
+        lookup_stack = [wanted_file_id]
+        lookup_names = []
+        # May be looked up already
+        while lookup_stack:
+            file_id = lookup_stack[-1]
+            name, parent_id = found_ids[file_id]
+            parent_path = id_paths.get(parent_id, None)
+            if parent_path is None:
+                # recurse:
+                lookup_stack.append(parent_id)
+                lookup_names.append(name)
+            else:
+                # resolve:
+                if parent_path:
+                    parent_path = parent_path + '/' + name
+                else:
+                    parent_path = name
+                id_paths[file_id] = parent_path
+                if file_id == wanted_file_id:
+                    result[file_id] = parent_path
+                lookup_stack.pop(-1)
+                while lookup_stack:
+                    file_id = lookup_stack.pop(-1)
+                    if parent_path:
+                        parent_path = parent_path + '/' + lookup_names.pop(-1)
+                    else:
+                        parent_path = lookup_names.pop(-1)
+                    id_paths[file_id] = parent_path
+                    if file_id == wanted_file_id:
+                        result[file_id] = parent_path
+    return result

=== modified file 'tests/__init__.py'
--- a/tests/__init__.py	2008-06-12 01:30:02 +0000
+++ b/tests/__init__.py	2008-06-14 05:07:54 +0000
@@ -24,6 +24,7 @@
         'blackbox',
         'errors',
         'index',
+        'inventory',
         'transport',
         ]
     standard_tests.addTests(loader.loadTestsFromModuleNames(

=== added file 'tests/test_inventory.py'
--- a/tests/test_inventory.py	1970-01-01 00:00:00 +0000
+++ b/tests/test_inventory.py	2008-06-14 05:07:54 +0000
@@ -0,0 +1,89 @@
+# search, a bzr plugin for searching within bzr branches/repositories.
+# Copyright (C) 2008 Robert Collins
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+# 
+
+"""Tests for the inventory specific logic."""
+
+from bzrlib.plugins.search import inventory
+from bzrlib.tests import TestCaseWithTransport
+
+
+class TestPathFromInventory(TestCaseWithTransport):
+
+    def test_paths_from_ids_basic(self):
+        tree = self.make_branch_and_tree('foo')
+        t = self.get_transport('foo')
+        t.mkdir('subdir')
+        t.mkdir('subdir/nested-dir')
+        t.put_bytes('subdir/nested-dir/the file', 'content')
+        # The ids are in reverse sort order to try to exercise corner cases in
+        # xml processing.
+        tree.add(['subdir', 'subdir/nested-dir', 'subdir/nested-dir/the file'],
+            ['3dir', '2dir', '1file'])
+        revid = tree.commit('first post')
+        tree.lock_read()
+        try:
+            inventories = tree.branch.repository.get_inventory_weave()
+            xml = inventories.get_lines(revid)
+        finally:
+            tree.unlock()
+        serializer = tree.branch.repository._serializer
+        # All individually:
+        self.assertEqual({'1file':'subdir/nested-dir/the file'},
+            inventory.paths_from_ids(xml, serializer, ['1file']))
+        self.assertEqual({'2dir':'subdir/nested-dir'},
+            inventory.paths_from_ids(xml, serializer, ['2dir']))
+        self.assertEqual({'3dir':'subdir'},
+            inventory.paths_from_ids(xml, serializer, ['3dir']))
+        # In twos:
+        self.assertEqual({'1file':'subdir/nested-dir/the file',
+            '2dir':'subdir/nested-dir'},
+            inventory.paths_from_ids(xml, serializer, ['1file', '2dir']))
+        self.assertEqual({'1file':'subdir/nested-dir/the file',
+            '3dir':'subdir'},
+            inventory.paths_from_ids(xml, serializer, ['1file', '3dir']))
+        self.assertEqual({'2dir':'subdir/nested-dir',
+            '3dir':'subdir'},
+            inventory.paths_from_ids(xml, serializer, ['3dir', '2dir']))
+        # All together
+        self.assertEqual({'1file':'subdir/nested-dir/the file',
+            '2dir':'subdir/nested-dir',
+            '3dir':'subdir'},
+            inventory.paths_from_ids(xml, serializer, ['1file', '2dir', '3dir']))
+
+    def test_paths_from_ids_rich_root(self):
+        tree = self.make_branch_and_tree('foo', format="rich-root-pack")
+        tree.set_root_id('a-root')
+        t = self.get_transport('foo')
+        t.mkdir('subdir')
+        tree.add(['subdir'], ['3dir'])
+        revid = tree.commit('first post')
+        tree.lock_read()
+        try:
+            inventories = tree.branch.repository.get_inventory_weave()
+            xml = inventories.get_lines(revid)
+        finally:
+            tree.unlock()
+        serializer = tree.branch.repository._serializer
+        # All individually:
+        self.assertEqual({'3dir':'subdir'},
+            inventory.paths_from_ids(xml, serializer, ['3dir']))
+        self.assertEqual({'a-root':''},
+            inventory.paths_from_ids(xml, serializer, ['a-root']))
+        # In twos:
+        self.assertEqual({'3dir':'subdir',
+            'a-root':''},
+            inventory.paths_from_ids(xml, serializer, ['3dir', 'a-root']))