Rev 5376: (jameinel) Decrease peak memory during 'bzr send', in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Wed Aug 11 03:49:28 BST 2010


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5376 [merge]
revision-id: pqm at pqm.ubuntu.com-20100811024926-irizklc3vvd9h45p
parent: pqm at pqm.ubuntu.com-20100810011615-0dm93d6fa4mxov6o
parent: john at arbash-meinel.com-20100811013111-wdrv1hp03kir7gf2
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Wed 2010-08-11 03:49:26 +0100
message:
  (jameinel) Decrease peak memory during 'bzr send',
   by releasing texts that are no longer needed. (John A Meinel)
added:
  bzrlib/tests/test_versionedfile.py test_versionedfile.p-20100810180118-kszg2mcbpsoybwpt-1
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/bundle/serializer/v4.py v10.py-20070611062757-5ggj7k18s9dej0fr-1
  bzrlib/multiparent.py          __init__.py-20070410133617-n1jdhcc1n1mibarp-1
  bzrlib/tests/__init__.py       selftest.py-20050531073622-8d0e3c8845c97a64
  bzrlib/versionedfile.py        versionedfile.py-20060222045106-5039c71ee3b65490
=== modified file 'NEWS'
--- a/NEWS	2010-08-09 10:13:33 +0000
+++ b/NEWS	2010-08-11 02:49:26 +0000
@@ -47,6 +47,12 @@
 * Check if both --diff-options and --using are set, and exit with error
   in this case. (Matthäus G. Chajdas, #234708)
 
+* Decrease peak memory during ``bzr send``. The old code was caching all
+  text content and all inventory strings for all revisions before
+  computing the diffs. Now we only cache as long as there is a child that
+  will need them. Sending 2000 bzr revisions drops from 1.2GB peak to
+  256MB peak. (John Arbash Meinel, #614576)
+
 * Don't print internal object name when print an invalid revision spec
   error.  (Neil Martinsen-Burrell, #598701)
 

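The caching strategy described in the NEWS entry above boils down to reference
counting parent texts: a parent text is held in memory only until its last
child has been diffed against it, and each diff is handed out as soon as it is
computed. The following standalone sketch shows that idea (names and
signatures are illustrative only, not the bzrlib API); it assumes topo_order
lists every key with parents before children, and that parent_map maps each
key to its parent keys.

    def stream_diffs(get_text, parent_map, topo_order, make_diff):
        """Yield (key, diff), dropping each parent text after its last use."""
        # Number of not-yet-diffed children that still need each parent text.
        refcounts = {}
        for parents in parent_map.values():
            for p in parents:
                refcounts[p] = refcounts.get(p, 0) + 1
        cache = {}
        for key in topo_order:             # parents arrive before children
            text = get_text(key)
            parent_texts = []
            for p in parent_map.get(key, ()):
                parent_texts.append(cache[p])
                refcounts[p] -= 1
                if refcounts[p] == 0:      # last child seen: free the text
                    del refcounts[p]
                    del cache[p]
            yield key, make_diff(text, parent_texts)
            if key in refcounts:           # a later child still needs this
                cache[key] = text

The real implementation (_MPDiffGenerator in bzrlib/versionedfile.py, below)
does the same bookkeeping on content chunks and additionally skips ghost
parents.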
=== modified file 'bzrlib/bundle/serializer/v4.py'
--- a/bzrlib/bundle/serializer/v4.py	2009-08-04 14:08:32 +0000
+++ b/bzrlib/bundle/serializer/v4.py	2010-08-10 20:03:44 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -30,11 +30,49 @@
     serializer,
     trace,
     ui,
+    versionedfile as _mod_versionedfile,
     )
 from bzrlib.bundle import bundle_data, serializer as bundle_serializer
 from bzrlib import bencode
 
 
+class _MPDiffInventoryGenerator(_mod_versionedfile._MPDiffGenerator):
+    """Generate Inventory diffs for serialized inventories."""
+
+    def __init__(self, repo, inventory_keys):
+        super(_MPDiffInventoryGenerator, self).__init__(repo.inventories,
+            inventory_keys)
+        self.repo = repo
+        self.sha1s = {}
+
+    def iter_diffs(self):
+        """Compute the diffs one at a time."""
+        # This is used instead of compute_diffs(): because we control the
+        # ordering of the inventories, we don't have to do any buffering
+        self._find_needed_keys()
+        # We actually use a slightly different ordering. We grab all of the
+        # parents first, and then grab the ordered requests.
+        needed_ids = [k[-1] for k in self.present_parents]
+        needed_ids.extend([k[-1] for k in self.ordered_keys])
+        inv_to_str = self.repo._serializer.write_inventory_to_string
+        for inv in self.repo.iter_inventories(needed_ids):
+            revision_id = inv.revision_id
+            key = (revision_id,)
+            if key in self.present_parents:
+                # Not a key we will transmit, which is a shame, since this is
+                # why bundles don't work with stacked branches
+                parent_ids = None
+            else:
+                parent_ids = [k[-1] for k in self.parent_map[key]]
+            as_bytes = inv_to_str(inv)
+            self._process_one_record(key, (as_bytes,))
+            if parent_ids is None:
+                continue
+            diff = self.diffs.pop(key)
+            sha1 = osutils.sha_string(as_bytes)
+            yield revision_id, parent_ids, sha1, diff
+
+
 class BundleWriter(object):
     """Writer for bundle-format files.
 
@@ -348,48 +386,10 @@
         the other side.
         """
         inventory_key_order = [(r,) for r in revision_order]
-        parent_map = self.repository.inventories.get_parent_map(
-                            inventory_key_order)
-        missing_keys = set(inventory_key_order).difference(parent_map)
-        if missing_keys:
-            raise errors.RevisionNotPresent(list(missing_keys)[0],
-                                            self.repository.inventories)
-        inv_to_str = self.repository._serializer.write_inventory_to_string
-        # Make sure that we grab the parent texts first
-        just_parents = set()
-        map(just_parents.update, parent_map.itervalues())
-        just_parents.difference_update(parent_map)
-        # Ignore ghost parents
-        present_parents = self.repository.inventories.get_parent_map(
-                            just_parents)
-        ghost_keys = just_parents.difference(present_parents)
-        needed_inventories = list(present_parents) + inventory_key_order
-        needed_inventories = [k[-1] for k in needed_inventories]
-        all_lines = {}
-        for inv in self.repository.iter_inventories(needed_inventories):
-            revision_id = inv.revision_id
-            key = (revision_id,)
-            as_bytes = inv_to_str(inv)
-            # The sha1 is validated as the xml/textual form, not as the
-            # form-in-the-repository
-            sha1 = osutils.sha_string(as_bytes)
-            as_lines = osutils.split_lines(as_bytes)
-            del as_bytes
-            all_lines[key] = as_lines
-            if key in just_parents:
-                # We don't transmit those entries
-                continue
-            # Create an mpdiff for this text, and add it to the output
-            parent_keys = parent_map[key]
-            # See the comment in VF.make_mpdiffs about how this effects
-            # ordering when there are ghosts present. I think we have a latent
-            # bug
-            parent_lines = [all_lines[p_key] for p_key in parent_keys
-                            if p_key not in ghost_keys]
-            diff = multiparent.MultiParent.from_lines(
-                as_lines, parent_lines)
+        generator = _MPDiffInventoryGenerator(self.repository,
+                                              inventory_key_order)
+        for revision_id, parent_ids, sha1, diff in generator.iter_diffs():
             text = ''.join(diff.to_patch())
-            parent_ids = [k[-1] for k in parent_keys]
             self.bundle.add_multiparent_record(text, sha1, parent_ids,
                                                'inventory', revision_id, None)
 

=== modified file 'bzrlib/multiparent.py'
--- a/bzrlib/multiparent.py	2009-06-10 03:56:49 +0000
+++ b/bzrlib/multiparent.py	2010-08-11 01:31:11 +0000
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007-2010 Canonical Ltd
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -76,6 +76,8 @@
 class MultiParent(object):
     """A multi-parent diff"""
 
+    __slots__ = ['hunks']
+
     def __init__(self, hunks=None):
         if hunks is not None:
             self.hunks = hunks
@@ -258,6 +260,8 @@
 class NewText(object):
     """The contents of text that is introduced by this text"""
 
+    __slots__ = ['lines']
+
     def __init__(self, lines):
         self.lines = lines
 
@@ -279,24 +283,30 @@
 class ParentText(object):
     """A reference to text present in a parent text"""
 
+    __slots__ = ['parent', 'parent_pos', 'child_pos', 'num_lines']
+
     def __init__(self, parent, parent_pos, child_pos, num_lines):
         self.parent = parent
         self.parent_pos = parent_pos
         self.child_pos = child_pos
         self.num_lines = num_lines
 
+    def _as_dict(self):
+        return dict(parent=self.parent, parent_pos=self.parent_pos,
+                    child_pos=self.child_pos, num_lines=self.num_lines)
+
     def __repr__(self):
-        return 'ParentText(%(parent)r, %(parent_pos)r, %(child_pos)r,'\
-            ' %(num_lines)r)' % self.__dict__
+        return ('ParentText(%(parent)r, %(parent_pos)r, %(child_pos)r,'
+                ' %(num_lines)r)' % self._as_dict())
 
     def __eq__(self, other):
         if self.__class__ is not other.__class__:
             return False
-        return (self.__dict__ == other.__dict__)
+        return self._as_dict() == other._as_dict()
 
     def to_patch(self):
-        yield 'c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n'\
-            % self.__dict__
+        yield ('c %(parent)d %(parent_pos)d %(child_pos)d %(num_lines)d\n'
+               % self._as_dict())
 
 
 class BaseVersionedFile(object):

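The __slots__ additions above remove the per-instance __dict__ from these
small, very numerous diff objects, which saves memory when many hunk objects
are alive at once. It is also why the string formatting had to change: once a
class defines __slots__ (and no base class provides a __dict__), accessing
self.__dict__ raises AttributeError, hence the explicit _as_dict() helper. A
minimal sketch of the same pattern on a made-up class:

    class Hunk(object):
        # With __slots__ each instance stores only these two attributes and
        # has no __dict__, so %-formatting must go through an explicit dict.
        __slots__ = ['start', 'num_lines']

        def __init__(self, start, num_lines):
            self.start = start
            self.num_lines = num_lines

        def _as_dict(self):
            return dict(start=self.start, num_lines=self.num_lines)

        def __repr__(self):
            return 'Hunk(%(start)r, %(num_lines)r)' % self._as_dict()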
=== modified file 'bzrlib/tests/__init__.py'
--- a/bzrlib/tests/__init__.py	2010-08-07 16:42:09 +0000
+++ b/bzrlib/tests/__init__.py	2010-08-10 18:07:54 +0000
@@ -3828,6 +3828,7 @@
         'bzrlib.tests.test_urlutils',
         'bzrlib.tests.test_version',
         'bzrlib.tests.test_version_info',
+        'bzrlib.tests.test_versionedfile',
         'bzrlib.tests.test_weave',
         'bzrlib.tests.test_whitebox',
         'bzrlib.tests.test_win32utils',

=== added file 'bzrlib/tests/test_versionedfile.py'
--- a/bzrlib/tests/test_versionedfile.py	1970-01-01 00:00:00 +0000
+++ b/bzrlib/tests/test_versionedfile.py	2010-08-10 20:03:44 +0000
@@ -0,0 +1,139 @@
+# Copyright (C) 2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""Tests for VersionedFile classes"""
+
+from bzrlib import (
+    errors,
+    groupcompress,
+    multiparent,
+    tests,
+    versionedfile,
+    )
+
+
+class Test_MPDiffGenerator(tests.TestCaseWithMemoryTransport):
+    # Should this be a per vf test?
+
+    def make_vf(self):
+        t = self.get_transport('')
+        factory = groupcompress.make_pack_factory(True, True, 1)
+        return factory(t)
+
+    def make_three_vf(self):
+        vf = self.make_vf()
+        vf.add_lines(('one',), (), ['first\n'])
+        vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+        vf.add_lines(('three',), [('one',), ('two',)],
+                    ['first\n', 'second\n', 'third\n'])
+        return vf
+
+    def test_finds_parents(self):
+        vf = self.make_three_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('three',)])
+        needed_keys, refcount = gen._find_needed_keys()
+        self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+                         sorted(needed_keys))
+        self.assertEqual({('one',): 1, ('two',): 1}, refcount)
+
+    def test_ignores_ghost_parents(self):
+        # If a parent is a ghost, it is just ignored
+        vf = self.make_vf()
+        vf.add_lines(('two',), [('one',)], ['first\n', 'second\n'])
+        gen = versionedfile._MPDiffGenerator(vf, [('two',)])
+        needed_keys, refcount = gen._find_needed_keys()
+        self.assertEqual(sorted([('two',)]), sorted(needed_keys))
+        # It is returned, but we don't really care as we won't extract it
+        self.assertEqual({('one',): 1}, refcount)
+        self.assertEqual([('one',)], sorted(gen.ghost_parents))
+        self.assertEqual([], sorted(gen.present_parents))
+
+    def test_raises_on_ghost_keys(self):
+        # If the requested key is a ghost, then we have a problem
+        vf = self.make_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('one',)])
+        self.assertRaises(errors.RevisionNotPresent,
+                          gen._find_needed_keys)
+
+    def test_refcount_multiple_children(self):
+        vf = self.make_three_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+        needed_keys, refcount = gen._find_needed_keys()
+        self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+                         sorted(needed_keys))
+        self.assertEqual({('one',): 2, ('two',): 1}, refcount)
+        self.assertEqual([('one',)], sorted(gen.present_parents))
+
+    def test_process_contents(self):
+        vf = self.make_three_vf()
+        gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',)])
+        gen._find_needed_keys()
+        self.assertEqual({('two',): (('one',),),
+                          ('three',): (('one',), ('two',))},
+                         gen.parent_map)
+        self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+        self.assertEqual(sorted([('one',), ('two',), ('three',)]),
+                         sorted(gen.needed_keys))
+        stream = vf.get_record_stream(gen.needed_keys, 'topological', True)
+        record = stream.next()
+        self.assertEqual(('one',), record.key)
+        # one is not needed in the output, but it is needed by children. As
+        # such, it should end up in the various caches
+        gen._process_one_record(record.key, record.get_bytes_as('chunked'))
+        # The chunks should be cached, the refcount untouched
+        self.assertEqual([('one',)], gen.chunks.keys())
+        self.assertEqual({('one',): 2, ('two',): 1}, gen.refcounts)
+        self.assertEqual([], gen.diffs.keys())
+        # Next we get 'two', which is something we output, but also needed for
+        # three
+        record = stream.next()
+        self.assertEqual(('two',), record.key)
+        gen._process_one_record(record.key, record.get_bytes_as('chunked'))
+        # Both are now cached, and the diff for two has been extracted, and
+        # one's refcount has been updated. two has been removed from the
+        # parent_map
+        self.assertEqual(sorted([('one',), ('two',)]),
+                         sorted(gen.chunks.keys()))
+        self.assertEqual({('one',): 1, ('two',): 1}, gen.refcounts)
+        self.assertEqual([('two',)], gen.diffs.keys())
+        self.assertEqual({('three',): (('one',), ('two',))},
+                         gen.parent_map)
+        # Finally 'three', which allows us to remove all parents from the
+        # caches
+        record = stream.next()
+        self.assertEqual(('three',), record.key)
+        gen._process_one_record(record.key, record.get_bytes_as('chunked'))
+        # All parent texts have now been released, so both caches are empty,
+        # and the diff for three has been extracted
+        self.assertEqual([], gen.chunks.keys())
+        self.assertEqual({}, gen.refcounts)
+        self.assertEqual(sorted([('two',), ('three',)]),
+                         sorted(gen.diffs.keys()))
+
+    def test_compute_diffs(self):
+        vf = self.make_three_vf()
+        # The content is in the order requested, even if it isn't topological
+        gen = versionedfile._MPDiffGenerator(vf, [('two',), ('three',),
+                                                  ('one',)])
+        diffs = gen.compute_diffs()
+        expected_diffs = [
+            multiparent.MultiParent([multiparent.ParentText(0, 0, 0, 1),
+                                     multiparent.NewText(['second\n'])]),
+            multiparent.MultiParent([multiparent.ParentText(1, 0, 0, 2),
+                                     multiparent.NewText(['third\n'])]),
+            multiparent.MultiParent([multiparent.NewText(['first\n'])]),
+            ]
+        self.assertEqual(expected_diffs, diffs)

=== modified file 'bzrlib/versionedfile.py'
--- a/bzrlib/versionedfile.py	2010-07-30 20:44:45 +0000
+++ b/bzrlib/versionedfile.py	2010-08-11 01:27:46 +0000
@@ -206,6 +206,138 @@
             yield record
 
 
+class _MPDiffGenerator(object):
+    """Pull out the functionality for generating mp_diffs."""
+
+    def __init__(self, vf, keys):
+        self.vf = vf
+        # This is the order the keys were requested in
+        self.ordered_keys = tuple(keys)
+        # keys + their parents, what we need to compute the diffs
+        self.needed_keys = ()
+        # Map from key: mp_diff
+        self.diffs = {}
+        # Map from key: parents_needed (may have ghosts)
+        self.parent_map = {}
+        # Parents that aren't present
+        self.ghost_parents = ()
+        # Map from parent_key => number of children for this text
+        self.refcounts = {}
+        # Content chunks that are cached while we still need them
+        self.chunks = {}
+
+    def _find_needed_keys(self):
+        """Find the set of keys we need to request.
+
+        This includes all the original keys passed in, and the non-ghost
+        parents of those keys.
+
+        :return: (needed_keys, refcounts)
+            needed_keys is the set of all texts we need to extract
+            refcounts is a dict of {key: num_children} letting us know when we
+                no longer need to cache a given parent text
+        """
+        # All the keys and their parents
+        needed_keys = set(self.ordered_keys)
+        parent_map = self.vf.get_parent_map(needed_keys)
+        self.parent_map = parent_map
+        # TODO: Should we be using a different construct here? I think this
+        #       uses difference_update internally, and we expect the result to
+        #       be tiny
+        missing_keys = needed_keys.difference(parent_map)
+        if missing_keys:
+            raise errors.RevisionNotPresent(list(missing_keys)[0], self.vf)
+        # Parents that might be missing. They are allowed to be ghosts, but we
+        # should check for them
+        refcounts = {}
+        setdefault = refcounts.setdefault
+        just_parents = set()
+        for child_key, parent_keys in parent_map.iteritems():
+            if not parent_keys:
+                # parent_keys may be None if a given VersionedFile claims to
+                # not support graph operations.
+                continue
+            just_parents.update(parent_keys)
+            needed_keys.update(parent_keys)
+            for p in parent_keys:
+                refcounts[p] = setdefault(p, 0) + 1
+        just_parents.difference_update(parent_map)
+        # Remove any parents that are actually ghosts from the needed set
+        self.present_parents = set(self.vf.get_parent_map(just_parents))
+        self.ghost_parents = just_parents.difference(self.present_parents)
+        needed_keys.difference_update(self.ghost_parents)
+        self.needed_keys = needed_keys
+        self.refcounts = refcounts
+        return needed_keys, refcounts
+
+    def _compute_diff(self, key, parent_lines, lines):
+        """Compute a single mp_diff, and store it in self._diffs"""
+        if len(parent_lines) > 0:
+            # XXX: _extract_blocks is not usefully defined anywhere...
+            #      It was meant to extract the left-parent diff without
+            #      having to recompute it for Knit content (pack-0.92,
+            #      etc). That seems to have regressed somewhere
+            left_parent_blocks = self.vf._extract_blocks(key,
+                parent_lines[0], lines)
+        else:
+            left_parent_blocks = None
+        diff = multiparent.MultiParent.from_lines(lines,
+                    parent_lines, left_parent_blocks)
+        self.diffs[key] = diff
+
+    def _process_one_record(self, key, this_chunks):
+        parent_keys = None
+        if key in self.parent_map:
+            # This record should be ready to diff, since we requested
+            # content in 'topological' order
+            parent_keys = self.parent_map.pop(key)
+            # If a VersionedFile claims 'no-graph' support, then it may return
+            # None for any parent request, so we replace it with an empty tuple
+            if parent_keys is None:
+                parent_keys = ()
+            parent_lines = []
+            for p in parent_keys:
+                # Alternatively we could check p not in self.needed_keys, but
+                # ghost_parents should be tiny while needed_keys may be huge
+                if p in self.ghost_parents:
+                    continue
+                refcount = self.refcounts[p]
+                if refcount == 1: # Last child reference
+                    self.refcounts.pop(p)
+                    parent_chunks = self.chunks.pop(p)
+                else:
+                    self.refcounts[p] = refcount - 1
+                    parent_chunks = self.chunks[p]
+                p_lines = osutils.chunks_to_lines(parent_chunks)
+                # TODO: Should we cache the line form? We did the
+                #       computation to get it, but storing it this way will
+                #       be less memory efficient...
+                parent_lines.append(p_lines)
+                del p_lines
+            lines = osutils.chunks_to_lines(this_chunks)
+            # Since we needed the lines, we'll go ahead and cache them this way
+            this_chunks = lines
+            self._compute_diff(key, parent_lines, lines)
+            del lines
+        # Is this content required for any more children?
+        if key in self.refcounts:
+            self.chunks[key] = this_chunks
+
+    def _extract_diffs(self):
+        needed_keys, refcounts = self._find_needed_keys()
+        for record in self.vf.get_record_stream(needed_keys,
+                                                'topological', True):
+            if record.storage_kind == 'absent':
+                raise errors.RevisionNotPresent(record.key, self.vf)
+            self._process_one_record(record.key,
+                                     record.get_bytes_as('chunked'))
+        
+    def compute_diffs(self):
+        self._extract_diffs()
+        dpop = self.diffs.pop
+        return [dpop(k) for k in self.ordered_keys]
+
+
 class VersionedFile(object):
     """Versioned text file storage.
 
@@ -348,6 +480,10 @@
 
     def make_mpdiffs(self, version_ids):
         """Create multiparent diffs for specified versions."""
+        # XXX: Can't use _MPDiffGenerator just yet. This is because version_ids
+        #      is a list of strings, not keys. And while self.get_record_stream
+        #      is supported, it takes *keys*, while self.get_parent_map() takes
+        #      strings... *sigh*
         knit_versions = set()
         knit_versions.update(version_ids)
         parent_map = self.get_parent_map(version_ids)
@@ -1047,43 +1183,8 @@
 
     def make_mpdiffs(self, keys):
         """Create multiparent diffs for specified keys."""
-        keys_order = tuple(keys)
-        keys = frozenset(keys)
-        knit_keys = set(keys)
-        parent_map = self.get_parent_map(keys)
-        for parent_keys in parent_map.itervalues():
-            if parent_keys:
-                knit_keys.update(parent_keys)
-        missing_keys = keys - set(parent_map)
-        if missing_keys:
-            raise errors.RevisionNotPresent(list(missing_keys)[0], self)
-        # We need to filter out ghosts, because we can't diff against them.
-        maybe_ghosts = knit_keys - keys
-        ghosts = maybe_ghosts - set(self.get_parent_map(maybe_ghosts))
-        knit_keys.difference_update(ghosts)
-        lines = {}
-        chunks_to_lines = osutils.chunks_to_lines
-        for record in self.get_record_stream(knit_keys, 'topological', True):
-            lines[record.key] = chunks_to_lines(record.get_bytes_as('chunked'))
-            # line_block_dict = {}
-            # for parent, blocks in record.extract_line_blocks():
-            #   line_blocks[parent] = blocks
-            # line_blocks[record.key] = line_block_dict
-        diffs = []
-        for key in keys_order:
-            target = lines[key]
-            parents = parent_map[key] or []
-            # Note that filtering knit_keys can lead to a parent difference
-            # between the creation and the application of the mpdiff.
-            parent_lines = [lines[p] for p in parents if p in knit_keys]
-            if len(parent_lines) > 0:
-                left_parent_blocks = self._extract_blocks(key, parent_lines[0],
-                    target)
-            else:
-                left_parent_blocks = None
-            diffs.append(multiparent.MultiParent.from_lines(target,
-                parent_lines, left_parent_blocks))
-        return diffs
+        generator = _MPDiffGenerator(self, keys)
+        return generator.compute_diffs()
 
     def get_annotator(self):
         return annotate.Annotator(self)

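With the refactoring above, VersionedFiles.make_mpdiffs() is now a thin
wrapper around _MPDiffGenerator.compute_diffs(), so callers keep the same
interface and still get the diffs back in request order while peak memory
drops. A usage sketch, assuming repo is an already-opened bzrlib Repository
(the keys below are invented for illustration):

    vf = repo.texts                       # any VersionedFiles will do
    keys = [('some-file-id', 'rev-2'), ('some-file-id', 'rev-3')]
    diffs = vf.make_mpdiffs(keys)         # MultiParent objects, one per key,
                                          # returned in the order requested
    patches = [''.join(d.to_patch()) for d in diffs]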


