Rev 5848: (jameinel) Make set_parent_trees() ~5% faster by improving the sort key in file:///home/pqm/archives/thelove/bzr/%2Btrunk/

Canonical.com Patch Queue Manager pqm at pqm.ubuntu.com
Wed May 11 02:48:34 UTC 2011


At file:///home/pqm/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 5848 [merge]
revision-id: pqm at pqm.ubuntu.com-20110511024831-tm38ubce8znnq351
parent: pqm at pqm.ubuntu.com-20110510102823-vf4qlngmjhgg6538
parent: john at arbash-meinel.com-20110511015847-qi4i70gertbq6lj2
committer: Canonical.com Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Wed 2011-05-11 02:48:31 +0000
message:
  (jameinel) Make set_parent_trees() ~5% faster by improving the sort key
   function. (John A Meinel)
modified:
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2011-05-06 15:15:44 +0000
+++ b/bzrlib/dirstate.py	2011-05-11 01:58:47 +0000
@@ -2141,7 +2141,8 @@
             executable = False
         else:
             raise Exception("can't pack %s" % inv_entry)
-        return (minikind, fingerprint, size, executable, tree_data)
+        return static_tuple.StaticTuple(minikind, fingerprint, size,
+                                        executable, tree_data)
 
     def _iter_child_entries(self, tree_index, path_utf8):
         """Iterate over all the entries that are children of path_utf.
@@ -2525,6 +2526,7 @@
         parent_trees = [tree for rev_id, tree in trees if rev_id not in ghosts]
         # how many trees do we end up with
         parent_count = len(parent_trees)
+        st = static_tuple.StaticTuple
 
         # one: the current tree
         for entry in self._iter_entries():
@@ -2547,6 +2549,7 @@
             # the suffix is from tree_index+1:parent_count+1.
             new_location_suffix = [DirState.NULL_PARENT_DETAILS] * (parent_count - tree_index)
             # now stitch in all the entries from this tree
+            last_dirname = None
             for path, entry in tree.iter_entries_by_dir():
                 # here we process each trees details for each item in the tree.
                 # we first update any existing entries for the id at other paths,
@@ -2560,10 +2563,16 @@
                 file_id = entry.file_id
                 path_utf8 = path.encode('utf8')
                 dirname, basename = osutils.split(path_utf8)
-                new_entry_key = (dirname, basename, file_id)
+                if dirname == last_dirname:
+                    # Try to re-use objects as much as possible
+                    dirname = last_dirname
+                else:
+                    last_dirname = dirname
+                new_entry_key = st(dirname, basename, file_id)
                 # tree index consistency: All other paths for this id in this tree
                 # index must point to the correct path.
-                for entry_key in id_index.get(file_id, ()):
+                entry_keys = id_index.get(file_id, ())
+                for entry_key in entry_keys:
                     # TODO:PROFILING: It might be faster to just update
                     # rather than checking if we need to, and then overwrite
                     # the one we are located at.
@@ -2572,12 +2581,14 @@
                         # other trees, so put absent pointers there
                         # This is the vertical axis in the matrix, all pointing
                         # to the real path.
-                        by_path[entry_key][tree_index] = ('r', path_utf8, 0, False, '')
-                # by path consistency: Insert into an existing path record (trivial), or
-                # add a new one with relocation pointers for the other tree indexes.
-                entry_keys = id_index.get(file_id, ())
+                        by_path[entry_key][tree_index] = st('r', path_utf8, 0,
+                                                            False, '')
+                # by path consistency: Insert into an existing path record
+                # (trivial), or add a new one with relocation pointers for the
+                # other tree indexes.
                 if new_entry_key in entry_keys:
-                    # there is already an entry where this data belongs, just insert it.
+                    # there is already an entry where this data belongs, just
+                    # insert it.
                     by_path[new_entry_key][tree_index] = \
                         self._inv_entry_to_details(entry)
                 else:
@@ -2593,17 +2604,16 @@
                             new_details.append(DirState.NULL_PARENT_DETAILS)
                         else:
                             # grab any one entry, use it to find the right path.
-                            # TODO: optimise this to reduce memory use in highly
-                            # fragmented situations by reusing the relocation
-                            # records.
                             a_key = iter(entry_keys).next()
                             if by_path[a_key][lookup_index][0] in ('r', 'a'):
-                                # its a pointer or missing statement, use it as is.
+                                # its a pointer or missing statement, use it as
+                                # is.
                                 new_details.append(by_path[a_key][lookup_index])
                             else:
                                 # we have the right key, make a pointer to it.
                                 real_path = ('/'.join(a_key[0:2])).strip('/')
-                                new_details.append(('r', real_path, 0, False, ''))
+                                new_details.append(st('r', real_path, 0, False,
+                                                      ''))
                     new_details.append(self._inv_entry_to_details(entry))
                     new_details.extend(new_location_suffix)
                     by_path[new_entry_key] = new_details
@@ -2625,9 +2635,20 @@
         try to keep everything in sorted blocks all the time, but sometimes
         it's easier to sort after the fact.
         """
-        def _key(entry):
+        # When sorting, we usually have 10x more entries than directories. (69k
+        # total entries, 4k directories). So cache the results of splitting.
+        # Saving time and objects. Also, use StaticTuple to avoid putting all
+        # of these object into python's garbage collector.
+        split_dirs = {}
+        def _key(entry, _split_dirs=split_dirs, _st=static_tuple.StaticTuple):
             # sort by: directory parts, file name, file id
-            return entry[0][0].split('/'), entry[0][1], entry[0][2]
+            dirpath, fname, file_id = entry[0]
+            try:
+                split = _split_dirs[dirpath]
+            except KeyError:
+                split = _st.from_sequence(dirpath.split('/'))
+                _split_dirs[dirpath] = split
+            return _st(split, fname, file_id)
         return sorted(entry_list, key=_key)
 
     def set_state_from_inventory(self, new_inv):




More information about the bazaar-commits mailing list