Rev 5376: (jameinel) Decrease peak memory during 'bzr send', in file:///home/pqm/archives/thelove/bzr/%2Btrunk/
Canonical.com Patch Queue Manager
pqm at pqm.ubuntu.com
Wed Aug 11 03:49:28 BST 2010
At file:///home/pqm/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 5376 [merge]
revision-id: pqm at pqm.ubuntu.com-20100811024926-irizklc3vvd9h45p
parent: pqm at pqm.ubuntu.com-20100810011615-0dm93d6fa4mxov6o
parent: john at arbash-meinel.com-20100811013111-wdrv1hp03kir7gf2
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Wed 2010-08-11 03:49:26 +0100
message:
(jameinel) Decrease peak memory during 'bzr send',
by releasing texts that are no longer needed. (John A Meinel)
added:
bzrlib/tests/test_versionedfile.py test_versionedfile.p-20100810180118-kszg2mcbpsoybwpt-1
modified:
NEWS NEWS-20050323055033-4e00b5db738777ff
bzrlib/bundle/serializer/v4.py v10.py-20070611062757-5ggj7k18s9dej0fr-1
bzrlib/multiparent.py __init__.py-20070410133617-n1jdhcc1n1mibarp-1
bzrlib/tests/__init__.py selftest.py-20050531073622-8d0e3c8845c97a64
bzrlib/versionedfile.py versionedfile.py-20060222045106-5039c71ee3b65490
=== modified file 'NEWS'
--- a/NEWS 2010-08-09 10:13:33 +0000
+++ b/NEWS 2010-08-11 02:49:26 +0000
@@ -47,6 +47,12 @@
* Check if both --diff-options and --using are set, and exit with error
in this case. (Matthäus G. Chajdas, #234708)
+* Decrease peak memory during ``bzr send``. The old code was caching all
+ text content and all inventory strings for all revisions before
+ computing the diffs. Now we only cache as long as there is a child that
+ will need them. Sending 2000 bzr revisions drops from 1.2GB peak to
+ 256MB peak. (John Arbash Meinel, #614576)
+
* Don't print the internal object name when printing an invalid revision
spec error. (Neil Martinsen-Burrell, #598701)
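
The caching strategy described in the NEWS entry above comes down to
reference counting: a text stays in memory only while some child still has
to be diffed against it. A minimal standalone sketch of the idea (the
helper names are illustrative only, not bzrlib API; the real implementation
is the _MPDiffGenerator class added to bzrlib/versionedfile.py below):

    def iter_diffs(keys_in_topo_order, parent_map, get_text, make_diff):
        """Yield (key, diff) pairs while holding as few texts as possible.

        Assumes every parent appears in keys_in_topo_order before its
        children; parent_map maps key -> tuple of parent keys; get_text
        and make_diff stand in for the repository fetch and the
        multiparent diff computation.
        """
        # Count how many children will still need each parent text.
        refcounts = {}
        for key in keys_in_topo_order:
            for parent in parent_map.get(key, ()):
                refcounts[parent] = refcounts.get(parent, 0) + 1
        cache = {}
        for key in keys_in_topo_order:
            text = get_text(key)
            parent_texts = []
            for parent in parent_map.get(key, ()):
                if refcounts[parent] == 1:
                    # Last child that needs this parent: evict after use.
                    del refcounts[parent]
                    parent_texts.append(cache.pop(parent))
                else:
                    refcounts[parent] -= 1
                    parent_texts.append(cache[parent])
            yield key, make_diff(text, parent_texts)
            if key in refcounts:
                # A later child still needs this text, so keep it cached.
                cache[key] = text

The old code instead held every text and every serialized inventory in a
dict for the whole operation, which is where the 1.2GB peak came from.
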
=== modified file 'bzrlib/bundle/serializer/v4.py'
--- a/bzrlib/bundle/serializer/v4.py 2009-08-04 14:08:32 +0000
+++ b/bzrlib/bundle/serializer/v4.py 2010-08-10 20:03:44 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -30,11 +30,49 @@
serializer,
trace,
ui,
+ versionedfile as _mod_versionedfile,
)
from bzrlib.bundle import bundle_data, serializer as bundle_serializer
from bzrlib import bencode
+class _MPDiffInventoryGenerator(_mod_versionedfile._MPDiffGenerator):
+ """Generate Inventory diffs for serialized inventories."""
+
+ def __init__(self, repo, inventory_keys):
+ super(_MPDiffInventoryGenerator, self).__init__(repo.inventories,
+ inventory_keys)
+ self.repo = repo
+ self.sha1s = {}
+
+ def iter_diffs(self):
+ """Compute the diffs one at a time."""
+ # This is used instead of compute_diffs(): because we guarantee the
+ # ordering of inventories, we don't have to do any buffering
+ self._find_needed_keys()
+ # We actually use a slightly different ordering. We grab all of the
+ # parents first, and then grab the ordered requests.
+ needed_ids = [k[-1] for k in self.present_parents]
+ needed_ids.extend([k[-1] for k in self.ordered_keys])
+ inv_to_str = self.repo._serializer.write_inventory_to_string
+ for inv in self.repo.iter_inventories(needed_ids):
+ revision_id = inv.revision_id
+ key = (revision_id,)
+ if key in self.present_parents:
+ # Not a key we will transmit, which is a shame, since it means
+ # that bundles don't work with stacked branches
+ parent_ids = None
+ else:
+ parent_ids = [k[-1] for k in self.parent_map[key]]
+ as_bytes = inv_to_str(inv)
+ self._process_one_record(key, (as_bytes,))
+ if parent_ids is None:
+ continue
+ diff = self.diffs.pop(key)
+ sha1 = osutils.sha_string(as_bytes)
+ yield revision_id, parent_ids, sha1, diff
+
+
class BundleWriter(object):
"""Writer for bundle-format files.
@@ -348,48 +386,10 @@
the other side.
"""
inventory_key_order = [(r,) for r in revision_order]
- parent_map = self.repository.inventories.get_parent_map(
- inventory_key_order)
- missing_keys = set(inventory_key_order).difference(parent_map)
- if missing_keys:
- raise errors.RevisionNotPresent(list(missing_keys)[0],
- self.repository.inventories)
- inv_to_str = self.repository._serializer.write_inventory_to_string
- # Make sure that we grab the parent texts first
- just_parents = set()
- map(just_parents.update, parent_map.itervalues())
- just_parents.difference_update(parent_map)
- # Ignore ghost parents
- present_parents = self.repository.inventories.get_parent_map(
- just_parents)
- ghost_keys = just_parents.difference(present_parents)
- needed_inventories = list(present_parents) + inventory_key_order
- needed_inventories = [k[-1] for k in needed_inventories]
- all_lines = {}
- for inv in self.repository.iter_inventories(needed_inventories):
- revision_id = inv.revision_id
- key = (revision_id,)
- as_bytes = inv_to_str(inv)
- # The sha1 is validated as the xml/textual form, not as the
- # form-in-the-repository
- sha1 = osutils.sha_string(as_bytes)
- as_lines = osutils.split_lines(as_bytes)
- del as_bytes
- all_lines[key] = as_lines
- if key in just_parents:
- # We don't transmit those entries
- continue
- # Create an mpdiff for this text, and add it to the output
- parent_keys = parent_map[key]
- # See the comment in VF.make_mpdiffs about how this effects
- # ordering when there are ghosts present. I think we have a latent
- # bug
- parent_lines = [all_lines[p_key] for p_key in parent_keys
- if p_key not in ghost_keys]
- diff = multiparent.MultiParent.from_lines(
- as_lines, parent_lines)
+ generator = _MPDiffInventoryGenerator(self.repository,
+ inventory_key_order)
+ for revision_id, parent_ids, sha1, diff in generator.iter_diffs():
text = ''.join(diff.to_patch())
- parent_ids = [k[-1] for k in parent_keys]
self.bundle.add_multiparent_record(text, sha1, parent_ids,
'inventory', revision_id, None)
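
The ordering comment in iter_diffs() above deserves a concrete example.
Suppose revisions B and C are being sent and B's parent A is not (the ids
here are made up purely for illustration):

    present_parents = set([('A',)])   # fetched for diffing, never sent
    ordered_keys = (('B',), ('C',))   # the revisions being sent, in order
    needed_ids = [k[-1] for k in present_parents]
    needed_ids.extend(k[-1] for k in ordered_keys)
    # needed_ids == ['A', 'B', 'C']

Because iter_inventories() is asked for the parents first, 'A' has already
been serialized and cached by _process_one_record() when 'B' arrives, so
'B' and 'C' can each be diffed, yielded and dropped as soon as they are
deserialized; nothing needs to be buffered beyond parents that a later
child still requires.
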
=== modified file 'bzrlib/multiparent.py'
--- a/bzrlib/multiparent.py 2009-06-10 03:56:49 +0000
+++ b/bzrlib/multiparent.py 2010-08-11 01:31:11 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -76,6 +76,8 @@
class MultiParent(object):
"""A multi-parent diff"""
+ __slots__ = ['hunks']
+
def __init__(self, hunks=None):
if hunks is not None:
self.hunks = hunks
@@ -258,6 +260,8 @@
class NewText(object):
"""The contents of text that is introduced by this text"""
+ __slots__ = ['lines']
+
def __init__(self, lines):
self.lines = lines
@@ -279,24 +283,30 @@
class ParentText(object):
"""A reference to text present in a parent text"""
+ __slots__ = ['parent', 'parent_pos', 'child_pos', 'num_lines']
+
def __init__(self, parent, parent_pos, child_pos, num_lines):
self.parent = parent
self.parent_pos = parent_pos
self.child_pos = child_pos
self.num_lines = num_lines
+ def _as_dict(self):
+ return dict(parent=self.parent, parent_pos=self.parent_pos,
+ child_pos=self.child_pos, num_lines=self.num_lines)
+
def __repr__(self):
- return 'ParentText(%(parent)r, %(parent_pos)r, %(child_pos)r,'\
- ' %(num_lines)r)' % self.__dict__
+ return ('ParentText(%(parent)r, %(parent_pos)r, %(child_pos)r,'
+ ' %(num_lines)r)' % self._as_dict())
def __eq__(self, other):
if self.__class__ is not other.__class__:
return False
- return (self.__dict__ == other.__dict__)
+ return self._as_dict() == other._as_dict()
def to_patch(self):
- yield 'c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n'\
- % self.__dict__
+ yield ('c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n'
+ % self._as_dict())
class BaseVersionedFile(object):
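
The __slots__ additions matter here because computing bundle diffs can
create a very large number of hunk objects; without a per-instance
__dict__, each MultiParent, ParentText and NewText costs noticeably less
memory. For orientation, a small example of what MultiParent.from_lines()
produces in the simplest case (the hunk list shown matches what the new
tests below expect; the exact patch bytes are whatever to_patch() emits):

    from bzrlib import multiparent

    parent_lines = ['first\n']
    child_lines = ['first\n', 'second\n']
    diff = multiparent.MultiParent.from_lines(child_lines, [parent_lines])
    # diff.hunks -> [ParentText(0, 0, 0, 1), NewText(['second\n'])]
    #   one line taken from parent 0, plus one newly introduced line
    patch_text = ''.join(diff.to_patch())
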
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py 2010-08-07 16:42:09 +0000
+++ b/bzrlib/tests/__init__.py 2010-08-10 18:07:54 +0000
@@ -3828,6 +3828,7 @@
'bzrlib.tests.test_urlutils',
'bzrlib.tests.test_version',
'bzrlib.tests.test_version_info',
+ 'bzrlib.tests.test_versionedfile',
'bzrlib.tests.test_weave',
'bzrlib.tests.test_whitebox',
'bzrlib.tests.test_win32utils',
=== added file 'bzrlib/tests/test_versionedfile.py'
--- a/bzrlib/tests/test_versionedfile.py 1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_versionedfile.py 2010-08-10 20:03:44 +0000
@@ -0,0 +1,139 @@
+# Copyright (C) 2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for VersionedFile classes"""
+
+from bzrlib import (
+ errors,
+ groupcompress,
+ multiparent,
+ tests,
+ versionedfile,
+ )
+
+
+class Test_MPDiffGenerator(tests.TestCaseWithMemoryTransport):
+ # Should this be a per vf test?
+
+ def make_vf(self):
+ t = self.get_transport('')
+ factory = groupcompress.make_pack_factory(True, True, 1)
+ return factory(t)
+
+ def make_three_vf(self):
+ vf = self.make_vf()
+ vf.add_lines(('one',), (), ['first\n'])
+ vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+ vf.add_lines(('three',), [('one',), ('two',)],
+ ['first\n', 'second\n', 'third\n'])
+ return vf
+
+ def test_finds_parents(self):
+ vf = self.make_three_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('three',)])
+ needed_keys, refcount = gen._find_needed_keys()
+ self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+ sorted(needed_keys))
+ self.assertEqual({('one',): 1, ('two',): 1}, refcount)
+
+ def test_ignores_ghost_parents(self):
+ # If a parent is a ghost, it is just ignored
+ vf = self.make_vf()
+ vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+ gen = versionedfile._MPDiffGenerator(vf, [('two',)])
+ needed_keys, refcount = gen._find_needed_keys()
+ self.assertEqual(sorted([('two',)]), sorted(needed_keys))
+ # It is returned, but we don't really care as we won't extract it
+ self.assertEqual({('one',): 1}, refcount)
+ self.assertEqual([('one',)], sorted(gen.ghost_parents))
+ self.assertEqual([], sorted(gen.present_parents))
+
+ def test_raises_on_ghost_keys(self):
+ # If the requested key is a ghost, then we have a problem
+ vf = self.make_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('one',)])
+ self.assertRaises(errors.RevisionNotPresent,
+ gen._find_needed_keys)
+
+ def test_refcount_multiple_children(self):
+ vf = self.make_three_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+ needed_keys, refcount = gen._find_needed_keys()
+ self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+ sorted(needed_keys))
+ self.assertEqual({('one',): 2, ('two',): 1}, refcount)
+ self.assertEqual([('one',)], sorted(gen.present_parents))
+
+ def test_process_contents(self):
+ vf = self.make_three_vf()
+ gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+ gen._find_needed_keys()
+ self.assertEqual({('two',): (('one',),),
+ ('three',): (('one',), ('two',))},
+ gen.parent_map)
+ self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+ self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+ sorted(gen.needed_keys))
+ stream = vf.get_record_stream(gen.needed_keys, 'topological', True)
+ record = stream.next()
+ self.assertEqual(('one',), record.key)
+ # one is not needed in the output, but it is needed by children. As
+ # such, it should end up in the various caches
+ gen._process_one_record(record.key, record.get_bytes_as('chunked'))
+ # The chunks should be cached, the refcount untouched
+ self.assertEqual([('one',)], gen.chunks.keys())
+ self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+ self.assertEqual([], gen.diffs.keys())
+ # Next we get 'two', which is something we output, but also needed for
+ # three
+ record = stream.next()
+ self.assertEqual(('two',), record.key)
+ gen._process_one_record(record.key, record.get_bytes_as('chunked'))
+ # Both are now cached, and the diff for two has been extracted, and
+ # one's refcount has been updated. two has been removed from the
+ # parent_map
+ self.assertEqual(sorted([('one',), ('two',)]),
+ sorted(gen.chunks.keys()))
+ self.assertEqual({('one',): 1, ('two',): 1}, gen.refcounts)
+ self.assertEqual([('two',)], gen.diffs.keys())
+ self.assertEqual({('three',): (('one',), ('two',))},
+ gen.parent_map)
+ # Finally 'three', which allows us to remove all parents from the
+ # caches
+ record = stream.next()
+ self.assertEqual(('three',), record.key)
+ gen._process_one_record(record.key, record.get_bytes_as('chunked'))
+ # All parent texts have now been released from the caches, and the
+ # diffs for 'two' and 'three' have been extracted
+ self.assertEqual([], gen.chunks.keys())
+ self.assertEqual({}, gen.refcounts)
+ self.assertEqual(sorted([('two',), ('three',)]),
+ sorted(gen.diffs.keys()))
+
+ def test_compute_diffs(self):
+ vf = self.make_three_vf()
+ # The diffs come back in the order requested, even if it isn't topological
+ gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',),
+ ('one',)])
+ diffs = gen.compute_diffs()
+ expected_diffs = [
+ multiparent.MultiParent([multiparent.ParentText(0, 0, 0, 1),
+ multiparent.NewText(['second\n'])]),
+ multiparent.MultiParent([multiparent.ParentText(1, 0, 0, 2),
+ multiparent.NewText(['third\n'])]),
+ multiparent.MultiParent([multiparent.NewText(['first\n'])]),
+ ]
+ self.assertEqual(expected_diffs, diffs)
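
The tests above drive _MPDiffGenerator directly; callers normally reach it
through VersionedFiles.make_mpdiffs(), which (see the versionedfile.py hunk
below) now just delegates to the generator. A sketch of one more test in
the same style, not part of this patch, showing that delegation and the
requested-order result:

    class TestMakeMPDiffs(tests.TestCaseWithMemoryTransport):
        # Hypothetical additional test, reusing the imports above.

        def test_make_mpdiffs_requested_order(self):
            t = self.get_transport('')
            vf = groupcompress.make_pack_factory(True, True, 1)(t)
            vf.add_lines(('one',), (), ['first\n'])
            vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
            diffs = vf.make_mpdiffs([('two',), ('one',)])
            self.assertEqual(
                [multiparent.MultiParent(
                    [multiparent.ParentText(0, 0, 0, 1),
                     multiparent.NewText(['second\n'])]),
                 multiparent.MultiParent(
                    [multiparent.NewText(['first\n'])])],
                diffs)
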
=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py 2010-07-30 20:44:45 +0000
+++ b/bzrlib/versionedfile.py 2010-08-11 01:27:46 +0000
@@ -206,6 +206,138 @@
yield record
+class _MPDiffGenerator(object):
+ """Generate multi-parent diffs (mp_diffs) for a set of keys."""
+
+ def __init__(self, vf, keys):
+ self.vf = vf
+ # This is the order the keys were requested in
+ self.ordered_keys = tuple(keys)
+ # keys + their parents, what we need to compute the diffs
+ self.needed_keys = ()
+ # Map from key: mp_diff
+ self.diffs = {}
+ # Map from key: parents_needed (may have ghosts)
+ self.parent_map = {}
+ # Parents that aren't present
+ self.ghost_parents = ()
+ # Map from parent_key => number of children for this text
+ self.refcounts = {}
+ # Content chunks that are cached while we still need them
+ self.chunks = {}
+
+ def _find_needed_keys(self):
+ """Find the set of keys we need to request.
+
+ This includes all the original keys passed in, and the non-ghost
+ parents of those keys.
+
+ :return: (needed_keys, refcounts)
+ needed_keys is the set of all texts we need to extract
+ refcounts is a dict of {key: num_children} letting us know when we
+ no longer need to cache a given parent text
+ """
+ # All the keys and their parents
+ needed_keys = set(self.ordered_keys)
+ parent_map = self.vf.get_parent_map(needed_keys)
+ self.parent_map = parent_map
+ # TODO: Should we be using a different construct here? I think this
+ # uses difference_update internally, and we expect the result to
+ # be tiny
+ missing_keys = needed_keys.difference(parent_map)
+ if missing_keys:
+ raise errors.RevisionNotPresent(list(missing_keys)[0], self.vf)
+ # Parents that might be missing. They are allowed to be ghosts, but we
+ # should check for them
+ refcounts = {}
+ setdefault = refcounts.setdefault
+ just_parents = set()
+ for child_key, parent_keys in parent_map.iteritems():
+ if not parent_keys:
+ # parent_keys may be None if a given VersionedFile claims to
+ # not support graph operations.
+ continue
+ just_parents.update(parent_keys)
+ needed_keys.update(parent_keys)
+ for p in parent_keys:
+ refcounts[p] = setdefault(p, 0) + 1
+ just_parents.difference_update(parent_map)
+ # Remove any parents that are actually ghosts from the needed set
+ self.present_parents = set(self.vf.get_parent_map(just_parents))
+ self.ghost_parents = just_parents.difference(self.present_parents)
+ needed_keys.difference_update(self.ghost_parents)
+ self.needed_keys = needed_keys
+ self.refcounts = refcounts
+ return needed_keys, refcounts
+
+ def _compute_diff(self, key, parent_lines, lines):
+ """Compute a single mp_diff, and store it in self._diffs"""
+ if len(parent_lines) > 0:
+ # XXX: _extract_blocks is not usefully defined anywhere...
+ # It was meant to extract the left-parent diff without
+ # having to recompute it for Knit content (pack-0.92,
+ # etc). That seems to have regressed somewhere
+ left_parent_blocks = self.vf._extract_blocks(key,
+ parent_lines[0], lines)
+ else:
+ left_parent_blocks = None
+ diff = multiparent.MultiParent.from_lines(lines,
+ parent_lines, left_parent_blocks)
+ self.diffs[key] = diff
+
+ def _process_one_record(self, key, this_chunks):
+ parent_keys = None
+ if key in self.parent_map:
+ # This record should be ready to diff, since we requested
+ # content in 'topological' order
+ parent_keys = self.parent_map.pop(key)
+ # If a VersionedFile claims 'no-graph' support, then it may return
+ # None for any parent request, so we replace it with an empty tuple
+ if parent_keys is None:
+ parent_keys = ()
+ parent_lines = []
+ for p in parent_keys:
+ # Alternatively we could check 'p not in self.needed_keys', but
+ # ghost_parents should be a much smaller set than needed_keys
+ if p in self.ghost_parents:
+ continue
+ refcount = self.refcounts[p]
+ if refcount == 1: # Last child reference
+ self.refcounts.pop(p)
+ parent_chunks = self.chunks.pop(p)
+ else:
+ self.refcounts[p] = refcount - 1
+ parent_chunks = self.chunks[p]
+ p_lines = osutils.chunks_to_lines(parent_chunks)
+ # TODO: Should we cache the line form? We did the
+ # computation to get it, but storing it this way will
+ # be less memory efficient...
+ parent_lines.append(p_lines)
+ del p_lines
+ lines = osutils.chunks_to_lines(this_chunks)
+ # Since we needed the lines, we'll go ahead and cache them this way
+ this_chunks = lines
+ self._compute_diff(key, parent_lines, lines)
+ del lines
+ # Is this content required for any more children?
+ if key in self.refcounts:
+ self.chunks[key] = this_chunks
+
+ def _extract_diffs(self):
+ needed_keys, refcounts = self._find_needed_keys()
+ for record in self.vf.get_record_stream(needed_keys,
+ 'topological', True):
+ if record.storage_kind == 'absent':
+ raise errors.RevisionNotPresent(record.key, self.vf)
+ self._process_one_record(record.key,
+ record.get_bytes_as('chunked'))
+
+ def compute_diffs(self):
+ self._extract_diffs()
+ dpop = self.diffs.pop
+ return [dpop(k) for k in self.ordered_keys]
+
+
class VersionedFile(object):
"""Versioned text file storage.
@@ -348,6 +480,10 @@
def make_mpdiffs(self, version_ids):
"""Create multiparent diffs for specified versions."""
+ # XXX: Can't use _MPDiffGenerator just yet. This is because version_ids
+ # is a list of strings, not keys. And while self.get_record_stream
+ # is supported, it takes *keys*, while self.get_parent_map() takes
+ # strings... *sigh*
knit_versions = set()
knit_versions.update(version_ids)
parent_map = self.get_parent_map(version_ids)
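
A note on the 'chunked' representation used by _process_one_record()
above: record.get_bytes_as('chunked') returns a list of byte strings with
no guarantee that each piece ends at a line boundary, which is why the
code converts with osutils.chunks_to_lines() before diffing. A tiny
illustration (the values are made up; chunks_to_lines is the real helper):

    from bzrlib import osutils

    chunks = ['first\nsec', 'ond\n']
    lines = osutils.chunks_to_lines(chunks)
    # lines == ['first\n', 'second\n']
    # Lines always break exactly at '\n', so they can be fed straight to
    # multiparent.MultiParent.from_lines().
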
@@ -1047,43 +1183,8 @@
def make_mpdiffs(self, keys):
"""Create multiparent diffs for specified keys."""
- keys_order = tuple(keys)
- keys = frozenset(keys)
- knit_keys = set(keys)
- parent_map = self.get_parent_map(keys)
- for parent_keys in parent_map.itervalues():
- if parent_keys:
- knit_keys.update(parent_keys)
- missing_keys = keys - set(parent_map)
- if missing_keys:
- raise errors.RevisionNotPresent(list(missing_keys)[0], self)
- # We need to filter out ghosts, because we can't diff against them.
- maybe_ghosts = knit_keys - keys
- ghosts = maybe_ghosts - set(self.get_parent_map(maybe_ghosts))
- knit_keys.difference_update(ghosts)
- lines = {}
- chunks_to_lines = osutils.chunks_to_lines
- for record in self.get_record_stream(knit_keys, 'topological', True):
- lines[record.key] = chunks_to_lines(record.get_bytes_as('chunked'))
- # line_block_dict = {}
- # for parent, blocks in record.extract_line_blocks():
- # line_blocks[parent] = blocks
- # line_blocks[record.key] = line_block_dict
- diffs = []
- for key in keys_order:
- target = lines[key]
- parents = parent_map[key] or []
- # Note that filtering knit_keys can lead to a parent difference
- # between the creation and the application of the mpdiff.
- parent_lines = [lines[p] for p in parents if p in knit_keys]
- if len(parent_lines) > 0:
- left_parent_blocks = self._extract_blocks(key, parent_lines[0],
- target)
- else:
- left_parent_blocks = None
- diffs.append(multiparent.MultiParent.from_lines(target,
- parent_lines, left_parent_blocks))
- return diffs
+ generator = _MPDiffGenerator(self, keys)
+ return generator.compute_diffs()
def get_annotator(self):
return annotate.Annotator(self)