Rev 2930: * Commit updates the state of the working tree via a delta rather than in http://people.ubuntu.com/~robertc/baz2.0/dirstate.update_basis

Robert Collins robertc at robertcollins.net
Tue Oct 23 23:14:50 BST 2007


At http://people.ubuntu.com/~robertc/baz2.0/dirstate.update_basis

------------------------------------------------------------
revno: 2930
revision-id:robertc at robertcollins.net-20071023221432-j8zndh1oiegql3cu
parent: pqm at pqm.ubuntu.com-20071023082111-h6u34i4gvlb2nwch
committer: Robert Collins <robertc at robertcollins.net>
branch nick: dirstate.update_basis
timestamp: Wed 2007-10-24 08:14:32 +1000
message:
  * Commit updates the state of the working tree via a delta rather than
    supplying entirely new basis trees. For commit of a single specified file
    this reduces the wall clock time for commit by roughly a 30%.
    (Robert Collins, Martin Pool)
modified:
  NEWS                           NEWS-20050323055033-4e00b5db738777ff
  bzrlib/commit.py               commit.py-20050511101309-79ec1a0168e0e825
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/mutabletree.py          mutabletree.py-20060906023413-4wlkalbdpsxi2r4y-2
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/tests/workingtree_implementations/test_parents.py test_set_parents.py-20060807231740-yicmnlci1mj8smu1-1
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
=== modified file 'NEWS'
--- a/NEWS	2007-10-23 08:21:11 +0000
+++ b/NEWS	2007-10-23 22:14:32 +0000
@@ -52,6 +52,11 @@
      changed to determine if the tree is unchanged rather than recalculating
      it at the end of the commit process. (Robert Collins)
 
+   * Commit updates the state of the working tree via a delta rather than
+     supplying entirely new basis trees. For commit of a single specified file
+     this reduces the wall clock time for commit by roughly a 30%.
+     (Robert Collins, Martin Pool)
+
    * Inventory serialisation no longer double-sha's the content.
      (Robert Collins)
 

=== modified file 'bzrlib/commit.py'
--- a/bzrlib/commit.py	2007-10-15 05:02:05 +0000
+++ b/bzrlib/commit.py	2007-10-23 22:14:32 +0000
@@ -383,17 +383,8 @@
 
             # Make the working tree up to date with the branch
             self._set_progress_stage("Updating the working tree")
-            rev_tree = self.builder.revision_tree()
-            # XXX: This will need to be changed if we support doing a
-            # selective commit while a merge is still pending - then we'd
-            # still have multiple parents after the commit.
-            #
-            # XXX: update_basis_by_delta is slower at present because it works
-            # on inventories, so this is not active until there's a native
-            # dirstate implementation.
-            ## self.work_tree.update_basis_by_delta(self.rev_id,
-            ##      self._basis_delta)
-            self.work_tree.set_parent_trees([(self.rev_id, rev_tree)])
+            self.work_tree.update_basis_by_delta(self.rev_id,
+                 self._basis_delta)
             self.reporter.completed(new_revno, self.rev_id)
             self._process_post_hooks(old_revno, new_revno)
         finally:

=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-10-22 20:05:12 +0000
+++ b/bzrlib/dirstate.py	2007-10-23 22:14:32 +0000
@@ -212,6 +212,7 @@
 import zlib
 
 from bzrlib import (
+    cache_utf8,
     debug,
     errors,
     inventory,
@@ -305,15 +306,6 @@
     def __init__(self, path):
         """Create a  DirState object.
 
-        Attributes of note:
-
-        :attr _root_entrie: The root row of the directory/file information,
-            - contains the path to / - '', ''
-            - kind of 'directory',
-            - the file id of the root in utf8
-            - size of 0
-            - a packed state
-            - and no sha information.
         :param path: The path at which the dirstate file on disk should live.
         """
         # _header_state and _dirblock_state represent the current state
@@ -897,6 +889,48 @@
             processed_dirs.update(pending_dirs)
         return found
 
+    def _discard_merge_parents(self):
+        """Discard any parents trees beyond the first.
+        
+        Note that if this fails the dirstate is corrupted.
+
+        After this function returns the dirstate contains 2 trees, neither of
+        which are ghosted.
+        """
+        self._read_header_if_needed()
+        parents = self.get_parent_ids()
+        if len(parents) < 1:
+            return
+        # only require all dirblocks if we are doing a full-pass removal.
+        self._read_dirblocks_if_needed()
+        dead_patterns = set([('a', 'r'), ('a', 'a'), ('r', 'r'), ('r', 'a')])
+        def iter_entries_removable():
+            for block in self._dirblocks:
+                deleted_positions = []
+                for pos, entry in enumerate(block[1]):
+                    yield entry
+                    if (entry[1][0][0], entry[1][1][0]) in dead_patterns:
+                        deleted_positions.append(pos)
+                if deleted_positions:
+                    if len(deleted_positions) == len(block):
+                        del block[1][:]
+                    else:
+                        for pos in reversed(deleted_positions):
+                            del block[1][pos]
+        # if the first parent is a ghost:
+        if parents[0] in self.get_ghosts():
+            empty_parent = [DirState.NULL_PARENT_DETAILS]
+            for entry in iter_entries_removable():
+                entry[1][1:] = empty_parent
+        else:
+            for entry in iter_entries_removable():
+                del entry[1][2:]
+
+        self._ghosts = []
+        self._parents = [parents[0]]
+        self._dirblock_state = DirState.IN_MEMORY_MODIFIED
+        self._header_state = DirState.IN_MEMORY_MODIFIED
+
     def _empty_parent_info(self):
         return [DirState.NULL_PARENT_DETAILS] * (len(self._parents) -
                                                     len(self._ghosts))
@@ -1127,6 +1161,184 @@
             raise
         return result
 
+    def update_basis_by_delta(self, delta, new_revid):
+        """Update the parents of this tree after a commit.
+
+        This gives the tree one parent, with revision id new_revid. The
+        inventory delta is applied to the current basis tree to generate the
+        inventory for the parent new_revid, and all other parent trees are
+        discarded.
+
+        Note that an exception during the operation of this method will leave
+        the dirstate in a corrupt state where it should not be saved.
+
+        Finally, we expect all changes to be synchronising the basis tree with
+        the working tree.
+
+        :param new_revid: The new revision id for the trees parent.
+        :param delta: An inventory delta (see apply_inventory_delta) describing
+            the changes from the current left most parent revision to new_revid.
+        """
+        self._read_dirblocks_if_needed()
+        self._discard_merge_parents()
+        if self._ghosts != []:
+            raise NotImplementedError(self.update_basis_by_delta)
+        if len(self._parents) == 0:
+            # setup a blank tree, the most simple way.
+            empty_parent = DirState.NULL_PARENT_DETAILS
+            for entry in self._iter_entries():
+                entry[1].append(empty_parent)
+            self._parents.append(new_revid)
+
+        self._parents[0] = new_revid
+
+        delta = sorted(delta, reverse=True)
+        adds = []
+        changes = []
+        deletes = []
+        # The paths this function accepts are unicode and must be encoded as we
+        # go.
+        encode = cache_utf8.encode
+        inv_to_entry = self._inv_entry_to_details
+        # delta is now (deletes, changes), (adds) in reverse lexographical
+        # order.
+        # deletes in reverse lexographic order are safe to process in situ.
+        # renames are not, as a rename from any path could go to a path
+        # lexographically lower, so we transform renames into delete, add pairs,
+        # expanding them recursively as needed.
+        # At the same time, to reduce interface friction we convert the input
+        # inventory entries to dirstate.
+        
+        for old_path, new_path, file_id, inv_entry in delta:
+            if old_path is None:
+                adds.append((None, encode(new_path), file_id,
+                    inv_to_entry(inv_entry)))
+            elif new_path is None:
+                deletes.append((encode(old_path), None, file_id, None))
+            elif old_path != new_path:
+                # Renames:
+                # Because renames must preserve their children we must have
+                # processed all reloations and removes before hand. The sort
+                # order ensures we've examined the child paths, but we also
+                # have to execute the removals, or the split to an add/delete
+                # pair will result in the deleted item being reinserted, or
+                # renamed items being reinserted twice - and possibly at the
+                # wrong place. Splitting into a delete/add pair also simplifies
+                # the handling of entries with ('f', ...), ('r' ...) because
+                # the target of the 'r' is old_path here, and we add that to
+                # deletes, meaning that the add handler does not need to check
+                # for 'r' items on every pass.
+                self._update_basis_apply_deletes(deletes)
+                deletes = []
+                new_path_utf8 = encode(new_path)
+                # Split into an add/delete pair recursively.
+                adds.append((None, new_path_utf8, file_id,
+                    inv_to_entry(inv_entry)))
+                # Remove the current contents of the tree at orig_path, and
+                # reinsert at the correct new path.
+                new_deletes = list(reversed(list(self._iter_child_entries(1,
+                    encode(old_path)))))
+                for entry in new_deletes:
+                    source_path = '/'.join(entry[0][0:2])
+                    target_path = new_path_utf8 + source_path[len(old_path):]
+                    adds.append((None, target_path, entry[0][2], entry[1][1]))
+                    deletes.append((source_path,  None, entry[0][2], None))
+                deletes.append((encode(old_path), None, file_id, None))
+            else:
+                changes.append((encode(old_path), encode(new_path), file_id,
+                    inv_to_entry(inv_entry)))
+
+        self._update_basis_apply_deletes(deletes)
+        self._update_basis_apply_changes(changes)
+        self._update_basis_apply_adds(adds)
+
+        # remove all deletes
+        self._dirblock_state = DirState.IN_MEMORY_MODIFIED
+        self._header_state = DirState.IN_MEMORY_MODIFIED
+        return
+
+    def _update_basis_apply_adds(self, adds):
+        """Apply a sequence of adds to tree 1 during update_basis_by_delta.
+
+        They may be adds, or renames that have been split into add/delete
+        pairs.
+
+        :param adds: A sequence of adds. Each add is a tuple:
+            (None, new_path_utf8, file_id, (entry_details))
+        """
+        # Adds are accumulated partly from renames, so can be in any input
+        # order - sort it.
+        adds.sort()
+        # adds is now in lexographic order, which places all parents before
+        # their children, so we can process it linearly.
+        absent = ('r', 'a')
+        for old_path, new_path, file_id, new_details in adds:
+            assert old_path is None
+            # the entry for this file_id must be in tree 0.
+            entry = self._get_entry(0, file_id, new_path)
+            if entry[0][2] != file_id:
+                raise errors.BzrError('dirstate: cannot apply delta, working'
+                    ' tree does not contain new entry %r %r' %
+                    (new_path, file_id))
+            if entry[1][1][0] not in absent:
+                raise errors.BzrError('dirstate: inconsistent delta, with '
+                    'tree 0. %r %r' % (new_path, file_id))
+            # We don't need to update the target of an 'r' because the handling
+            # of renames turns all 'r' situations into a delete at the original
+            # location.
+            entry[1][1] = new_details
+
+    def _update_basis_apply_changes(self, changes):
+        """Apply a sequence of changes to tree 1 during update_basis_by_delta.
+
+        :param adds: A sequence of changes. Each change is a tuple:
+            (path_utf8, path_utf8, file_id, (entry_details))
+        """
+        absent = ('a', 'r')
+        for old_path, new_path, file_id, new_details in changes:
+            assert old_path == new_path
+            # the entry for this file_id must be in tree 0.
+            entry = self._get_entry(0, file_id, new_path)
+            if entry[0][2] != file_id:
+                raise errors.BzrError('dirstate: cannot apply delta, working'
+                    ' tree does not contain new entry %r %r' %
+                    (new_path, file_id))
+            if (entry[1][0][0] in absent or
+                entry[1][1][0] in absent):
+                raise errors.BzrError('dirstate: inconsistent delta, with '
+                    'tree 0. %r %r' % (new_path, file_id))
+            entry[1][1] = new_details
+
+    def _update_basis_apply_deletes(self, deletes):
+        """Apply a sequence of deletes to tree 1 during update_basis_by_delta.
+
+        They may be deletes, or renames that have been split into add/delete
+        pairs.
+
+        :param adds: A sequence of deletes. Each delete is a tuple:
+            (old_path_utf8, None, file_id, None)
+        """
+        absent = ('r', 'a')
+        # Deletes are accumulated in lexographical order.
+        for old_path, new_path, file_id, _ in deletes:
+            assert new_path is None
+            # the entry for this file_id must be in tree 1.
+            dirname, basename = osutils.split(old_path)
+            block_index, entry_index, dir_present, file_present = \
+                self._get_block_entry_index(dirname, basename, 1)
+            if not file_present:
+                raise errors.BzrError('dirstate: cannot apply delta, basis'
+                    ' tree does not contain new entry %r %r' %
+                    (old_path, file_id))
+            entry = self._dirblocks[block_index][1][entry_index]
+            if entry[0][2] != file_id:
+                raise errors.BzrError('mismatched file_id in tree 1 %r %r' %
+                    (old_path, file_id))
+            if entry[1][0][0] not in absent:
+                raise errors.BzrError('dirstate: inconsistent delta, with '
+                    'tree 0. %r %r' % (old_path, file_id))
+            del self._dirblocks[block_index][1][entry_index]
+
     def update_entry(self, entry, abspath, stat_value,
                      _stat_to_minikind=_stat_to_minikind,
                      _pack_stat=pack_stat):
@@ -1526,6 +1738,37 @@
             raise Exception("can't pack %s" % inv_entry)
         return (minikind, fingerprint, size, executable, tree_data)
 
+    def _iter_child_entries(self, tree_index, path_utf8):
+        """Iterate over all the entries that are children of path_utf.
+
+        This only returns entries that are present (not in 'a', 'r') in 
+        tree_index. tree_index data is not refreshed, so if tree 0 is used,
+        results may differ from that obtained if paths were statted to
+        determine what ones were directories.
+
+        Asking for the children of a non-directory will return an empty
+        iterator.
+        """
+        pending_dirs = []
+        next_pending_dirs = [path_utf8]
+        absent = ('r', 'a')
+        while next_pending_dirs:
+            pending_dirs = next_pending_dirs
+            next_pending_dirs = []
+            for path in pending_dirs:
+                block_index, present = self._find_block_index_from_key(
+                    (path, '', ''))
+                if not present:
+                    # children of a non-directory asked for.
+                    continue
+                block = self._dirblocks[block_index]
+                for entry in block[1]:
+                    kind = entry[1][tree_index][0]
+                    if kind not in absent:
+                        yield entry
+                    if kind == 'd':
+                        next_pending_dirs.append('/'.join(entry[0][0:2]))
+    
     def _iter_entries(self):
         """Iterate over all the entries in the dirstate.
 

=== modified file 'bzrlib/mutabletree.py'
--- a/bzrlib/mutabletree.py	2007-10-12 08:00:07 +0000
+++ b/bzrlib/mutabletree.py	2007-10-23 22:14:32 +0000
@@ -437,6 +437,13 @@
         inventory for the parent new_revid, and all other parent trees are
         discarded.
 
+        All the changes in the delta should be changes synchronising the basis
+        tree with some or all of the working tree, with a change to a directory
+        requiring that its contents have been recursively included. That is,
+        this is not a general purpose tree modification routine, but a helper
+        for commit which is not required to handle situations that do not arise
+        outside of commit.
+
         :param new_revid: The new revision id for the trees parent.
         :param delta: An inventory delta (see apply_inventory_delta) describing
             the changes from the current left most parent revision to new_revid.

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-10-10 00:07:04 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-10-23 22:14:32 +0000
@@ -1394,6 +1394,107 @@
             state.unlock()
 
 
+class TestIterChildEntries(TestCaseWithDirState):
+
+    def create_dirstate_with_two_trees(self):
+        """This dirstate contains multiple files and directories.
+
+         /        a-root-value
+         a/       a-dir
+         b/       b-dir
+         c        c-file
+         d        d-file
+         a/e/     e-dir
+         a/f      f-file
+         b/g      g-file
+         b/h\xc3\xa5  h-\xc3\xa5-file  #This is u'\xe5' encoded into utf-8
+
+        Notice that a/e is an empty directory.
+
+        There is one parent tree, which has the same shape with the following variations:
+        b/g in the parent is gone.
+        b/h in the parent has a different id
+        b/i is new in the parent 
+        c is renamed to b/j in the parent
+
+        :return: The dirstate, still write-locked.
+        """
+        packed_stat = 'AAAAREUHaIpFB2iKAAADAQAtkqUAAIGk'
+        null_sha = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+        NULL_PARENT_DETAILS = dirstate.DirState.NULL_PARENT_DETAILS
+        root_entry = ('', '', 'a-root-value'), [
+            ('d', '', 0, False, packed_stat),
+            ('d', '', 0, False, 'parent-revid'),
+            ]
+        a_entry = ('', 'a', 'a-dir'), [
+            ('d', '', 0, False, packed_stat),
+            ('d', '', 0, False, 'parent-revid'),
+            ]
+        b_entry = ('', 'b', 'b-dir'), [
+            ('d', '', 0, False, packed_stat),
+            ('d', '', 0, False, 'parent-revid'),
+            ]
+        c_entry = ('', 'c', 'c-file'), [
+            ('f', null_sha, 10, False, packed_stat),
+            ('r', 'b/j', 0, False, ''),
+            ]
+        d_entry = ('', 'd', 'd-file'), [
+            ('f', null_sha, 20, False, packed_stat),
+            ('f', 'd', 20, False, 'parent-revid'),
+            ]
+        e_entry = ('a', 'e', 'e-dir'), [
+            ('d', '', 0, False, packed_stat),
+            ('d', '', 0, False, 'parent-revid'),
+            ]
+        f_entry = ('a', 'f', 'f-file'), [
+            ('f', null_sha, 30, False, packed_stat),
+            ('f', 'f', 20, False, 'parent-revid'),
+            ]
+        g_entry = ('b', 'g', 'g-file'), [
+            ('f', null_sha, 30, False, packed_stat),
+            NULL_PARENT_DETAILS,
+            ]
+        h_entry1 = ('b', 'h\xc3\xa5', 'h-\xc3\xa5-file1'), [
+            ('f', null_sha, 40, False, packed_stat),
+            NULL_PARENT_DETAILS,
+            ]
+        h_entry2 = ('b', 'h\xc3\xa5', 'h-\xc3\xa5-file2'), [
+            NULL_PARENT_DETAILS,
+            ('f', 'h', 20, False, 'parent-revid'),
+            ]
+        i_entry = ('b', 'i', 'i-file'), [
+            NULL_PARENT_DETAILS,
+            ('f', 'h', 20, False, 'parent-revid'),
+            ]
+        j_entry = ('b', 'j', 'c-file'), [
+            ('r', 'c', 0, False, ''),
+            ('f', 'j', 20, False, 'parent-revid'),
+            ]
+        dirblocks = []
+        dirblocks.append(('', [root_entry]))
+        dirblocks.append(('', [a_entry, b_entry, c_entry, d_entry]))
+        dirblocks.append(('a', [e_entry, f_entry]))
+        dirblocks.append(('b', [g_entry, h_entry1, h_entry2, i_entry, j_entry]))
+        state = dirstate.DirState.initialize('dirstate')
+        state._validate()
+        try:
+            state._set_data(['parent'], dirblocks)
+        except:
+            state.unlock()
+            raise
+        return state, dirblocks
+
+    def test_iter_children_b(self):
+        state, dirblocks = self.create_dirstate_with_two_trees()
+        self.addCleanup(state.unlock)
+        expected_result = []
+        expected_result.append(dirblocks[3][1][2]) # h2
+        expected_result.append(dirblocks[3][1][3]) # i
+        expected_result.append(dirblocks[3][1][4]) # j
+        self.assertEqual(expected_result,
+            list(state._iter_child_entries(1, 'b')))
+
+
 class TestDirstateSortOrder(TestCaseWithTransport):
     """Test that DirState adds entries in the right order."""
 

=== modified file 'bzrlib/tests/workingtree_implementations/test_parents.py'
--- a/bzrlib/tests/workingtree_implementations/test_parents.py	2007-10-12 08:00:07 +0000
+++ b/bzrlib/tests/workingtree_implementations/test_parents.py	2007-10-23 22:14:32 +0000
@@ -260,7 +260,11 @@
 
     def assertDeltaApplicationResultsInExpectedBasis(self, tree, revid, delta,
         expected_inventory):
-        tree.update_basis_by_delta(revid, delta)
+        tree.lock_write()
+        try:
+            tree.update_basis_by_delta(revid, delta)
+        finally:
+            tree.unlock()
         # check the last revision was adjusted to rev_id
         self.assertEqual(revid, tree.last_revision())
         # check the parents are what we expect
@@ -354,6 +358,8 @@
                 parents.append(extra_parent)
             tree.set_parent_ids(parents)
         self.fake_up_revision(tree, new_revid, new_shape)
+        # give tree an inventory of new_shape
+        tree._write_inventory(new_shape)
         self.assertDeltaApplicationResultsInExpectedBasis(tree, new_revid,
             delta, new_shape)
         osutils.rmtree('tree')
@@ -575,3 +581,21 @@
         self.add_link(new_shape, new_revid, 'link-id-B', 'root-id', 'B', 'C')
         self.assertTransitionFromBasisToShape(basis_shape, old_revid,
             new_shape, new_revid)
+
+    def test_move_moves_children_recursively(self):
+        old_revid = 'old-parent'
+        basis_shape = Inventory(root_id=None)
+        self.add_dir(basis_shape, old_revid, 'root-id', None, '')
+        self.add_dir(basis_shape, old_revid, 'dir-id-A', 'root-id', 'A')
+        self.add_dir(basis_shape, old_revid, 'dir-id-B', 'dir-id-A', 'B')
+        self.add_link(basis_shape, old_revid, 'link-id-C', 'dir-id-B', 'C', 'D')
+        new_revid = 'new-parent'
+        new_shape = Inventory(root_id=None)
+        self.add_new_root(new_shape, old_revid, new_revid)
+        # the moved path:
+        self.add_dir(new_shape, new_revid, 'dir-id-A', 'root-id', 'B')
+        # unmoved children.
+        self.add_dir(new_shape, old_revid, 'dir-id-B', 'dir-id-A', 'B')
+        self.add_link(new_shape, old_revid, 'link-id-C', 'dir-id-B', 'C', 'D')
+        self.assertTransitionFromBasisToShape(basis_shape, old_revid,
+            new_shape, new_revid)

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2007-10-17 17:03:06 +0000
+++ b/bzrlib/workingtree_4.py	2007-10-23 22:14:32 +0000
@@ -1220,6 +1220,11 @@
             for file_id in file_ids:
                 self._inventory.remove_recursive_id(file_id)
 
+    def update_basis_by_delta(self, new_revid, delta):
+        """See MutableTree.update_basis_by_delta."""
+        assert self.last_revision() != new_revid
+        self.current_dirstate().update_basis_by_delta(delta, new_revid)
+
     @needs_read_lock
     def _validate(self):
         self._dirstate._validate()
@@ -1227,7 +1232,8 @@
     @needs_tree_write_lock
     def _write_inventory(self, inv):
         """Write inventory as the current inventory."""
-        assert not self._dirty, "attempting to write an inventory when the dirstate is dirty will cause data loss"
+        assert not self._dirty, ("attempting to write an inventory when the "
+            "dirstate is dirty will cause data loss")
         self.current_dirstate().set_state_from_inventory(inv)
         self._make_dirty(reset_inventory=False)
         if self._inventory is not None:



More information about the bazaar-commits mailing list