Rev 24: Change the disk format to create a single pack with the posting lists for a component. in http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
Robert Collins
robertc at robertcollins.net
Thu Jun 12 01:48:01 BST 2008
At http://people.ubuntu.com/~robertc/baz2.0/plugins/search/trunk
------------------------------------------------------------
revno: 24
revision-id: robertc at robertcollins.net-20080612004800-q16pldrsql5120zi
parent: robertc at robertcollins.net-20080611134613-r91btjp60gizwlpt
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Thu 2008-06-12 10:48:00 +1000
message:
Change the disk format to create a single pack with the posting lists for a component.
modified:
BUGS bugs-20080609101902-m23i5z2ojdgkeyof-1
DESIGN design-20080608072426-vjoj110dtykfyb7g-1
index.py index.py-20080608055509-hnimeek7q8tctkqf-2
tests/__init__.py __init__.py-20080608052041-z5bahsl8kwl0uf4x-8
=== modified file 'BUGS'
--- a/BUGS 2008-06-11 08:29:50 +0000
+++ b/BUGS 2008-06-12 00:48:00 +0000
@@ -3,9 +3,10 @@
Some key caveats though (not bugs per se):
- - disk scaling: The current disk format creates many thousands of small files
- on disk. This may impede performance on some file systems. When these are
- put into a single pack this will be rectified.
- - memory scaling: Full text indexing currently requires very large amounts of
- memory. To index the history of 'bzr' requires nearly 600MB of memory (revno
- 3494). Larger trees are exceedingly likely to require as-much or more.
+ - disk scaling: The current disk format creates a number of files per index
+ component, but does not combine components. Each component has 2500
+ revisions indexed within it. This places a lower limit on the latency
+ involved in a search.
+ - memory scaling: Full text indexing currently requires very significant memory.
+ To index the history of 'bzr' requires nearly 200MB of memory (revno 3494).
+ Larger trees are exceedingly likely to require as much or more.
=== modified file 'DESIGN'
--- a/DESIGN 2008-06-11 08:29:50 +0000
+++ b/DESIGN 2008-06-12 00:48:00 +0000
@@ -76,8 +76,8 @@
A names file listing the indices.
-Second cut - TODO
-=================
+Second cut - in progress
+========================
Each name in the names file is a pack, its length, and two read vectors for the
terms and revisions lists. The pack contains a series of posting lists trailed
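The concrete serialisation of a names-file entry is not spelled out in this hunk. A minimal stand-alone Python sketch follows, assuming (purely for illustration) a whitespace-separated layout of name, pack length, and two (start, length) read vectors; the field order and separators are assumptions, not taken from this commit.

# Hypothetical sketch only: the real names-file layout is not shown here.

def parse_names_entry(line):
    """Parse 'name pack_length t_start t_len r_start r_len' into a dict."""
    name, pack_length, t_start, t_len, r_start, r_len = line.split()
    return {
        'name': name,
        'pack_length': int(pack_length),
        # read vector (offset, length) for the terms list within the pack
        'terms': (int(t_start), int(t_len)),
        # read vector (offset, length) for the revisions list within the pack
        'revisions': (int(r_start), int(r_len)),
    }

# e.g. an 8192-byte pack with terms at [100, 100+1900) and revisions at
# [2000, 2000+500):
entry = parse_names_entry("a1b2c3 8192 100 1900 2000 500")
assert entry['terms'] == (100, 1900)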
=== modified file 'index.py'
--- a/index.py 2008-06-11 13:46:13 +0000
+++ b/index.py 2008-06-12 00:48:00 +0000
@@ -25,7 +25,9 @@
from bzrlib.errors import NotBranchError, NoSuchFile, UnknownFormatError
from bzrlib.index import CombinedGraphIndex, GraphIndex, InMemoryGraphIndex
from bzrlib.lockdir import LockDir
+from bzrlib.pack import ContainerWriter
from bzrlib.plugins.search import errors
+from bzrlib.plugins.search.transport import FileView
from bzrlib.revision import NULL_REVISION
from bzrlib.tsort import topo_sort
@@ -172,12 +174,17 @@
for node in index.iter_all_entries():
# XXX: Duplicated logic with search().
term = node[1][0]
- term_id, posting_count, posting_length = node[2].split(" ")
+ term_id, posting_count, posting_start, posting_length = \
+ node[2].split(" ")
posting_count = int(posting_count)
+ posting_start = int(posting_start)
posting_length = int(posting_length)
+ posting_stop = posting_start + posting_length
post_name = component.name + "." + term_id
- post_index = GraphIndex(self._indices_transport, post_name,
- posting_length)
+ filemap = {post_name:(posting_start, posting_stop)}
+ view = FileView(self._indices_transport,
+ component.name + '.pack', filemap)
+ post_index = GraphIndex(view, post_name, posting_length)
doc_ids = set([node[1][0] for node in
post_index.iter_all_entries()])
doc_ids = [(index, doc_id) for doc_id in doc_ids]
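FileView comes from bzrlib.plugins.search.transport (imported above) and is not shown in this diff. As used in the hunk, it serves a logical posting-list file out of a byte range of the component's .pack file, so GraphIndex can read a single posting list without opening a file of its own. A minimal stand-alone sketch of that idea (not the real class, and with a plain file object instead of a bzrlib transport):

# Sketch only: illustrates the byte-range mapping that the filemap above
# expresses; the actual FileView wraps a bzrlib transport.

from io import BytesIO


class ByteRangeView(object):
    """Expose named slices of one underlying file as if they were files."""

    def __init__(self, fileobj, filemap):
        # filemap maps logical name -> (start, stop) offsets in fileobj
        self._file = fileobj
        self._filemap = filemap

    def get_bytes(self, name):
        start, stop = self._filemap[name]
        self._file.seek(start)
        return self._file.read(stop - start)


# A 'pack' holding two posting lists back to back.
pack = BytesIO(b"<posting list A><posting list B>")
view = ByteRangeView(pack, {"comp.A": (0, 16), "comp.B": (16, 32)})
assert view.get_bytes("comp.B") == b"<posting list B>"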
@@ -342,26 +349,33 @@
# TODO: use a dequeue?
term_info = []
for node in term_index.iter_entries(term_keys):
- term_id, posting_count, posting_length = node[2].split(" ")
+ term_id, posting_count, posting_start, posting_length = \
+ node[2].split(" ")
term_info.append((int(posting_count), term_id,
- int(posting_length)))
+ int(posting_start), int(posting_length)))
if len(term_info) != len(term_keys):
# One or more terms missing - no hits are possible.
continue
# load the first document list:
- _, term_id, posting_length = term_info.pop(0)
+ _, term_id, posting_start, posting_length = term_info.pop(0)
+ posting_stop = posting_start + posting_length
post_name = component.name + "." + term_id
- post_index = GraphIndex(self._indices_transport, post_name,
- posting_length)
+ filemap = {post_name:(posting_start, posting_stop)}
+ view = FileView(self._indices_transport, component.name + '.pack',
+ filemap)
+ post_index = GraphIndex(view, post_name, posting_length)
common_doc_keys = set([node[1] for node in
post_index.iter_all_entries()])
# Now we whittle down the nodes we need - still going in sorted
# order. (possibly doing concurrent reduction would be better).
while common_doc_keys and term_info:
- _, term_id, posting_length = term_info.pop(0)
+ _, term_id, posting_start, posting_length = term_info.pop(0)
+ posting_stop = posting_start + posting_length
post_name = component.name + "." + term_id
- post_index = GraphIndex(self._indices_transport, post_name,
- posting_length)
+ filemap = {post_name:(posting_start, posting_stop)}
+ view = FileView(self._indices_transport,
+ component.name + '.pack', filemap)
+ post_index = GraphIndex(view, post_name, posting_length)
common_doc_keys = set([node[1] for node in
post_index.iter_entries(common_doc_keys)])
common_doc_ids = [key[0] for key in common_doc_keys]
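The loop above whittles the candidate documents down by intersecting each term's posting list in turn, and the while condition lets it stop as soon as nothing is left in common. A stand-alone sketch of that approach, using plain sets in place of GraphIndex objects:

# Sketch of the whittling approach used in search(): start from the first
# term's posting list, intersect with each remaining term's list, and stop
# early once the common set is empty.

def intersect_posting_lists(posting_lists):
    """posting_lists: list of sets of doc keys, one per query term."""
    if not posting_lists:
        return set()
    remaining = list(posting_lists)
    common = set(remaining.pop(0))
    while common and remaining:
        common &= remaining.pop(0)
    return common


hits = intersect_posting_lists([
    {("d1",), ("d2",), ("d3",)},   # postings for term 1
    {("d2",), ("d3",)},            # postings for term 2
    {("d3",), ("d9",)},            # postings for term 3
])
assert hits == {("d3",)}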
@@ -601,6 +615,10 @@
# lists.
elements = []
term_index = InMemoryGraphIndex(0, 1)
+ write_stream = upload_transport.open_write_stream(index_name + ".pack")
+ elements.append(index_name + ".pack")
+ writer = ContainerWriter(write_stream.write)
+ writer.begin()
for term, term_id in self.terms.iteritems():
posting_list = self.posting_lists[term_id]
post_index = InMemoryGraphIndex(0, 1)
@@ -608,14 +626,20 @@
post_index.add_node((doc_id,), "", ())
posting_bytes = post_index.finish().read()
posting_name = index_name + "." + term_id
- upload_transport.put_bytes_non_atomic(posting_name, posting_bytes)
- elements.append(posting_name)
- # How many document ids, and how long the file is (for use when we
- # put these in packs).
- term_value = "%s %d %d" % (term_id, len(str(posting_list)),
- len(posting_bytes))
+ pos, size = writer.add_bytes_record(posting_bytes, [(posting_name,)])
+ length = len(posting_bytes)
+ offset = size - length
+ start = pos + offset
+ ### upload_transport.put_bytes_non_atomic(posting_name, posting_bytes)
+ ### elements.append(posting_name)
+ # How many document ids, and the range for the file view when we
+ # read the pack later.
+ term_value = "%s %d %d %d" % (term_id, len(str(posting_list)),
+ start, length)
+ del posting_bytes
term_index.add_node((term,), term_value, ())
- del posting_bytes
+ writer.end()
+ write_stream.close()
index_bytes = term_index.finish().read()
term_length = len(index_bytes)
upload_transport.put_bytes_non_atomic(
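The read vector stored in term_value follows from the arithmetic in the hunk above: add_bytes_record() returns the offset and total size of the whole container record, which (as the offset calculation implies) includes some header bytes before the payload, so the posting bytes themselves start at pos plus that header length. A small sketch of the calculation, with made-up numbers for illustration only:

# Sketch of the read-vector arithmetic: start = pos + (size - len(payload)).

def posting_read_vector(record_pos, record_size, payload_length):
    """Return (start, length) of the payload within the pack file."""
    header_length = record_size - payload_length
    start = record_pos + header_length
    return start, payload_length


# e.g. a record written at offset 1000, 2048 bytes long in total, carrying a
# 2000-byte posting list: the posting bytes occupy [1048, 3048).
start, length = posting_read_vector(1000, 2048, 2000)
assert (start, length) == (1048, 2000)
assert start + length == 3048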
=== modified file 'tests/__init__.py'
--- a/tests/__init__.py 2008-06-08 05:57:24 +0000
+++ b/tests/__init__.py 2008-06-12 00:48:00 +0000
@@ -24,6 +24,7 @@
'blackbox',
'errors',
'index',
+ 'transport',
]
standard_tests.addTests(loader.loadTestsFromModuleNames(
['bzrlib.plugins.search.tests.test_' + name for