Rev 34: Improve indexing performance of file paths five-fold by a custom extraction module for handling the inventory xml. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Sat Jun 14 06:07:56 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 34
revision-id: robertc at robertcollins.net-20080614050754-z5us1ebmdqbbegg7
parent: robertc at robertcollins.net-20080614014142-qrfj2rgsqeqjdxv5
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Sat 2008-06-14 15:07:54 +1000
message:
Improve indexing performance of file paths five-fold by a custom extraction module for handling the inventory xml.
added:
inventory.py inventory.py-20080614043027-u57i4xjlc5ginuft-1
tests/test_inventory.py test_inventory.py-20080614043027-u57i4xjlc5ginuft-2
modified:
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/__init__.py __init__.py-20080608052041-z5bahsl8kwl0uf4x-8
=== modified file 'index.py'
--- a/index.py 2008-06-14 01:41:42 +0000
+++ b/index.py 2008-06-14 05:07:54 +0000
@@ -29,6 +29,7 @@
from bzrlib.lockdir import LockDir
from bzrlib.pack import ContainerWriter
from bzrlib.plugins.search import errors
+from bzrlib.plugins.search.inventory import paths_from_ids
from bzrlib.plugins.search.transport import FileView
from bzrlib.revision import NULL_REVISION
from bzrlib.tsort import topo_sort
@@ -497,18 +498,22 @@
group_size = 100
for offset in range(len(order) / group_size + 1):
inventory_group = order[offset * group_size:(offset + 1) * group_size]
- #texts = repository.get_inventory_weave().get_texts(inventory_group)
- #for text in inventory_group:
- # pass
- # for text, revision_id in zip(texts, revision_ids):
+ lines_list = repository.get_inventory_weave().get_line_list(inventory_group)
+ serializer = repository._serializer
# For VersionedFiles:
# for xml in repository._iter_inventory_xmls(inventory_group):
# pass
- for inventory in repository.iter_inventories(inventory_group):
- revision_id = inventory.revision_id
- for file_id in revision_ids[revision_id]:
- path = inventory.id2path(file_id)
+ for lines, revision_id in zip(lines_list, inventory_group):
+ path_dict = paths_from_ids(lines, serializer,
+ revision_ids[revision_id])
+ for file_id, path in path_dict.iteritems():
terms[(file_id, revision_id)] = [('p', '', path)]
+ # Public api way - 5+ times slower:
+ # for inventory in repository.iter_inventories(inventory_group):
+ # # revision_id = inventory.revision_id
+ # for file_id in revision_ids[revision_id]:
+ # path = inventory.id2path(file_id)
+ # terms[(file_id, revision_id)] = [('p', '', path)]
return terms.iteritems()
def _terms_for_revs(self, repository, revision_ids):
=== added file 'inventory.py'
--- a/inventory.py 1970-01-01 00:00:00 +0000
+++ b/inventory.py 2008-06-14 05:07:54 +0000
@@ -0,0 +1,109 @@
+# search, a bzr plugin for searching within bzr branches/repositories.
+# Copyright (C) 2008 Robert Collins
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+"""Inventory related helpers for indexing."""
+
+
+from bzrlib import lazy_regex
+
+_file_ids_name_regex = lazy_regex.lazy_compile(
+ r'file_id="(?P<file_id>[^"]+)"'
+ r'.* name="(?P<name>[^"]*)"'
+ r'(?:.* parent_id="(?P<parent_id>[^"]+)")?'
+ )
+
+def paths_from_ids(xml_inventory, serializer, file_ids):
+ """Extract the paths for some file_ids from xml_inventory."""
+ if not serializer.support_altered_by_hack:
+ raise ValueError("Cannot process with serializer %r" % serializer)
+ search = _file_ids_name_regex.search
+ unresolved_ids = set(file_ids)
+ # TODO: only examine lines we need to, break early, track unprocessed
+ found_ids = {}
+ id_paths = {}
+ result = {}
+ for line in xml_inventory:
+ match = search(line)
+ if match is None:
+ continue
+ file_id, name, parent_id = match.group('file_id', 'name', 'parent_id')
+ found_ids[file_id] = (name, parent_id)
+ if parent_id is None:
+ # no parent, stash its name now to avoid special casing
+ # later.
+ id_paths[file_id] = name
+ if file_id in unresolved_ids:
+ result[file_id] = name
+ needed_ids = set(unresolved_ids)
+ while needed_ids:
+ # ---
+ # lookup_ids_here
+ # ---
+ missing_ids = set()
+ for file_id in needed_ids:
+ name, parent_id = found_ids.get(file_id, (None, None))
+ if name is None:
+ # Unresolved id itself
+ missing_ids.add(file_id)
+ else:
+ # We have resolved it, do we have its parent
+ if parent_id is not None and parent_id not in found_ids:
+ # No, search for it
+ missing_ids.add(parent_id)
+ if missing_ids == needed_ids:
+ # We didn't find anything on this pass
+ raise Exception("Did not find ids %s" % missing_ids)
+ needed_ids = missing_ids
+ # We have looked up the path-to-root for all asked ids,
+ # now to resolve it
+ while unresolved_ids:
+ wanted_file_id = unresolved_ids.pop()
+ path = id_paths.get(wanted_file_id)
+ if path is not None:
+ result[wanted_file_id] = path
+ continue
+ lookup_stack = [wanted_file_id]
+ lookup_names = []
+ # May be looked up already
+ while lookup_stack:
+ file_id = lookup_stack[-1]
+ name, parent_id = found_ids[file_id]
+ parent_path = id_paths.get(parent_id, None)
+ if parent_path is None:
+ # recurse:
+ lookup_stack.append(parent_id)
+ lookup_names.append(name)
+ else:
+ # resolve:
+ if parent_path:
+ parent_path = parent_path + '/' + name
+ else:
+ parent_path = name
+ id_paths[file_id] = parent_path
+ if file_id == wanted_file_id:
+ result[file_id] = parent_path
+ lookup_stack.pop(-1)
+ while lookup_stack:
+ file_id = lookup_stack.pop(-1)
+ if parent_path:
+ parent_path = parent_path + '/' + lookup_names.pop(-1)
+ else:
+ parent_path = lookup_names.pop(-1)
+ id_paths[file_id] = parent_path
+ if file_id == wanted_file_id:
+ result[file_id] = parent_path
+ return result
=== modified file 'tests/__init__.py'
--- a/tests/__init__.py 2008-06-12 01:30:02 +0000
+++ b/tests/__init__.py 2008-06-14 05:07:54 +0000
@@ -24,6 +24,7 @@
'blackbox',
'errors',
'index',
+ 'inventory',
'transport',
]
standard_tests.addTests(loader.loadTestsFromModuleNames(
=== added file 'tests/test_inventory.py'
--- a/tests/test_inventory.py 1970-01-01 00:00:00 +0000
+++ b/tests/test_inventory.py 2008-06-14 05:07:54 +0000
@@ -0,0 +1,89 @@
+# search, a bzr plugin for searching within bzr branches/repositories.
+# Copyright (C) 2008 Robert Collins
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as published
+# by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+"""Tests for the inventory specific logic."""
+
+from bzrlib.plugins.search import inventory
+from bzrlib.tests import TestCaseWithTransport
+
+
+class TestPathFromInventory(TestCaseWithTransport):
+
+ def test_paths_from_ids_basic(self):
+ tree = self.make_branch_and_tree('foo')
+ t = self.get_transport('foo')
+ t.mkdir('subdir')
+ t.mkdir('subdir/nested-dir')
+ t.put_bytes('subdir/nested-dir/the file', 'content')
+ # The ids are in reverse sort order to try to exercise corner cases in
+ # xml processing.
+ tree.add(['subdir', 'subdir/nested-dir', 'subdir/nested-dir/the file'],
+ ['3dir', '2dir', '1file'])
+ revid = tree.commit('first post')
+ tree.lock_read()
+ try:
+ inventories = tree.branch.repository.get_inventory_weave()
+ xml = inventories.get_lines(revid)
+ finally:
+ tree.unlock()
+ serializer = tree.branch.repository._serializer
+ # All individually:
+ self.assertEqual({'1file':'subdir/nested-dir/the file'},
+ inventory.paths_from_ids(xml, serializer, ['1file']))
+ self.assertEqual({'2dir':'subdir/nested-dir'},
+ inventory.paths_from_ids(xml, serializer, ['2dir']))
+ self.assertEqual({'3dir':'subdir'},
+ inventory.paths_from_ids(xml, serializer, ['3dir']))
+ # In twos:
+ self.assertEqual({'1file':'subdir/nested-dir/the file',
+ '2dir':'subdir/nested-dir'},
+ inventory.paths_from_ids(xml, serializer, ['1file', '2dir']))
+ self.assertEqual({'1file':'subdir/nested-dir/the file',
+ '3dir':'subdir'},
+ inventory.paths_from_ids(xml, serializer, ['1file', '3dir']))
+ self.assertEqual({'2dir':'subdir/nested-dir',
+ '3dir':'subdir'},
+ inventory.paths_from_ids(xml, serializer, ['3dir', '2dir']))
+ # All together
+ self.assertEqual({'1file':'subdir/nested-dir/the file',
+ '2dir':'subdir/nested-dir',
+ '3dir':'subdir'},
+ inventory.paths_from_ids(xml, serializer, ['1file', '2dir', '3dir']))
+
+ def test_paths_from_ids_rich_root(self):
+ tree = self.make_branch_and_tree('foo', format="rich-root-pack")
+ tree.set_root_id('a-root')
+ t = self.get_transport('foo')
+ t.mkdir('subdir')
+ tree.add(['subdir'], ['3dir'])
+ revid = tree.commit('first post')
+ tree.lock_read()
+ try:
+ inventories = tree.branch.repository.get_inventory_weave()
+ xml = inventories.get_lines(revid)
+ finally:
+ tree.unlock()
+ serializer = tree.branch.repository._serializer
+ # All individually:
+ self.assertEqual({'3dir':'subdir'},
+ inventory.paths_from_ids(xml, serializer, ['3dir']))
+ self.assertEqual({'a-root':''},
+ inventory.paths_from_ids(xml, serializer, ['a-root']))
+ # In twos:
+ self.assertEqual({'3dir':'subdir',
+ 'a-root':''},
+ inventory.paths_from_ids(xml, serializer, ['3dir', 'a-root']))
More information about the bazaar-commits
mailing list