Rev 2490: Update WorkingTree4 so that it doesn't use a HashCache, in http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate-nohc

John Arbash Meinel john at arbash-meinel.com
Thu Mar 1 19:58:48 GMT 2007


At http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate-nohc

------------------------------------------------------------
revno: 2490
revision-id: john at arbash-meinel.com-20070301195838-7p053os20qwr6qf7
parent: john at arbash-meinel.com-20070301164850-80ih12xza1edee6i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate-nohc
timestamp: Thu 2007-03-01 13:58:38 -0600
message:
  Update WorkingTree4 so that it doesn't use a HashCache,
  instead caching the sha values and stat fingerprint in the 'current'
  section.
modified:
  bzrlib/dirstate.py             dirstate.py-20060728012006-d6mvoihjb3je9peu-1
  bzrlib/tests/test_dirstate.py  test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
  bzrlib/tests/test_workingtree_4.py test_workingtree_4.p-20070223025758-531n3tznl3zacv2o-1
  bzrlib/tests/workingtree_implementations/test_readonly.py test_readonly.py-20061219164256-7imbl63m4j15n0es-1
  bzrlib/workingtree_4.py        workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py	2007-03-01 12:04:50 +0000
+++ b/bzrlib/dirstate.py	2007-03-01 19:58:38 +0000
@@ -191,12 +191,13 @@
 """
 
 
-import base64
 import bisect
+import codecs
 import cStringIO
 import os
 import sha
 import struct
+import time
 import zlib
 
 from bzrlib import (
@@ -297,7 +298,7 @@
             within the root.
         :param file_id: The file id of the path being added.
         :param kind: The kind of the path.
-        :param stat: The output of os.lstate for the path.
+        :param stat: The output of os.lstat for the path.
         :param link_or_sha1: The sha value of the file, or the target of a
             symlink. '' for directories.
         """
@@ -900,7 +901,7 @@
 
     def _entry_to_line(self, entry):
         """Serialize entry to a NULL delimited line ready for _get_output_lines.
-        
+
         :param entry: An entry_tuple as defined in the module docstring.
         """
         entire_entry = list(entry[0])
@@ -1014,6 +1015,68 @@
             raise
         return result
 
+    def get_sha1_for_entry(self, entry, abspath, stat_value=None):
+        """Return the sha1 sum for a given file.
+
+        :param entry: This is the dirblock entry for the file in question.
+        :param abspath: The path on disk for this file.
+        :param stat_value: (optional) if we already have done a stat on the
+            file, re-use it.
+        :return: The sha1 hexdigest of the file (40 bytes)
+        """
+        # This code assumes that the entry passed in is directly held in one of
+        # the internal _dirblocks. So the dirblock state must have already been
+        # read.
+        assert self._dirblock_state != DirState.NOT_IN_MEMORY
+        assert entry[1][0][0] == 'f', \
+            'can only get sha1 for a file not: %s %s' % (
+            DirState._minikind_to_kind[entry[1][0][0]], entry[0])
+        if stat_value is None:
+            stat_value = os.lstat(abspath)
+        packed_stat = pack_stat(stat_value)
+        saved_sha1_digest = entry[1][0][1]
+        saved_file_size = entry[1][0][2]
+        saved_packed_stat = entry[1][0][4]
+        if (packed_stat == saved_packed_stat
+            and saved_sha1_digest != ''
+            # size should also be in packed_stat
+            and saved_file_size == stat_value.st_size):
+            # The stat hasn't changed since we saved, so we can potentially
+            # re-use the saved sha hash.
+            cutoff = self._sha_cutoff_time()
+            if (stat_value.st_mtime < cutoff
+                and stat_value.st_ctime < cutoff):
+                # Return the existing fingerprint
+                return saved_sha1_digest
+        # If we have gotten this far, that means that we need to actually
+        # process the file.
+        new_sha1_digest = self._sha1_file(abspath)
+        # TODO: jam 20070301 Is it worth checking to see if the new sha is
+        #       actually different? I'm guessing not, because we wouldn't have
+        #       gotten this far otherwise.
+        entry[1][0] = ('f', new_sha1_digest, stat_value.st_size,
+                       entry[1][0][3], # Executable?
+                       packed_stat
+                      )
+        self._dirblock_state = DirState.IN_MEMORY_MODIFIED
+        return new_sha1_digest
+
+    def _sha_cutoff_time(self):
+        """Return cutoff time.
+
+        Files modified more recently than this time are at risk of being
+        undetectably modified and so can't be cached.
+        """
+        return int(time.time()) - 3
+
+    def _sha1_file(self, abspath):
+        """Calculate the SHA1 of a file by reading the full text"""
+        f = file(abspath, 'rb', buffering=65000)
+        try:
+            return osutils.sha_file(f)
+        finally:
+            f.close()
+
     def get_ghosts(self):
         """Return a list of the parent tree revision ids that are ghosts."""
         self._read_header_if_needed()
@@ -1226,8 +1289,8 @@
                         path_utf8=real_path)
             return None, None
 
-    @staticmethod
-    def initialize(path):
+    @classmethod
+    def initialize(cls, path):
         """Create a new dirstate on path.
 
         The new dirstate will be an empty tree - that is it has no parents,
@@ -1245,7 +1308,7 @@
         # stock empty dirstate information - a root with ROOT_ID, no children,
         # and no parents. Finally it calls save() to ensure that this data will
         # persist.
-        result = DirState(path)
+        result = cls(path)
         # root dir and root dir contents with no children.
         empty_tree_dirblocks = [('', []), ('', [])]
         # a new root directory, with a NULLSTAT.
@@ -2048,7 +2111,10 @@
     return lo
 
 
-def pack_stat(st, _encode=base64.encodestring, _pack=struct.pack):
+_base64_encoder = codecs.getencoder('base64')
+
+
+def pack_stat(st, _encode=_base64_encoder, _pack=struct.pack):
     """Convert stat values into a packed representation."""
     # jam 20060614 it isn't really worth removing more entries if we
     # are going to leave it in packed form.
@@ -2058,5 +2124,5 @@
 
     # base64.encode always adds a final newline, so strip it off
     return _encode(_pack('>llllll'
-        , st.st_size, st.st_mtime, st.st_ctime
-        , st.st_dev, st.st_ino, st.st_mode))[:-1]
+        , st.st_size, int(st.st_mtime), int(st.st_ctime)
+        , st.st_dev, st.st_ino, st.st_mode))[0][:-1]

=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py	2007-03-01 12:04:50 +0000
+++ b/bzrlib/tests/test_dirstate.py	2007-03-01 19:58:38 +0000
@@ -1005,6 +1005,163 @@
         self.assertEqual(expected, dirblock_names)
 
 
+class Sha1InstrumentedDirState(dirstate.DirState):
+    """A DirState with instrumented sha1 functionality."""
+
+    def __init__(self, path):
+        super(Sha1InstrumentedDirState, self).__init__(path)
+        self._time_offset = 0
+        self._sha1_log = []
+
+    def _sha_cutoff_time(self):
+        timestamp = super(Sha1InstrumentedDirState, self)._sha_cutoff_time()
+        return timestamp + self._time_offset
+
+    def _sha1_file(self, abspath):
+        self._sha1_log.append(abspath)
+        return super(Sha1InstrumentedDirState, self)._sha1_file(abspath)
+
+    def adjust_time(self, secs):
+        """Move the clock forward or back.
+
+        :param secs: The amount to adjust the clock by. Positive values make it
+        seem as if we are in the future, negative values make it seem like we
+        are in the past.
+        """
+        self._time_offset += secs
+
+
+class _FakeStat(object):
+    """A class with the same attributes as a real stat result."""
+
+    def __init__(self, size, mtime, ctime, dev, ino, mode):
+        self.st_size = size
+        self.st_mtime = mtime
+        self.st_ctime = ctime
+        self.st_dev = dev
+        self.st_ino = ino
+        self.st_mode = mode
+
+
+class TestDirStateGetSha1(TestCaseWithDirState):
+    """Test the DirState.get_sha1 functions"""
+
+    def test_get_sha1_from_entry(self):
+        state = Sha1InstrumentedDirState.initialize('dirstate')
+        self.addCleanup(state.unlock)
+        self.build_tree(['a'])
+        # Add one where we don't provide the stat or sha already
+        state.add('a', 'a-id', 'file', None, '')
+        entry = state._get_entry(0, path_utf8='a')
+        self.assertEqual(('', 'a', 'a-id'), entry[0])
+        self.assertEqual([('f', '', 0, False, dirstate.DirState.NULLSTAT)],
+                         entry[1])
+        # Flush the buffers to disk
+        state.save()
+        self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
+                         state._dirblock_state)
+
+        stat_value = os.lstat('a')
+        digest = state.get_sha1_for_entry(entry, abspath='a',
+                                          stat_value=stat_value)
+        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+        packed_stat = dirstate.pack_stat(stat_value)
+
+        # The dirblock entry should be updated with the new info
+        self.assertEqual([('f', digest, 14, False, packed_stat)], entry[1])
+        self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
+                         state._dirblock_state)
+        self.assertEqual(['a'], state._sha1_log)
+
+        state.save()
+        self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
+                         state._dirblock_state)
+
+        # If we do it again right away, we don't know if the file has changed
+        # so we will re-read the file. Roll the clock back so the file is
+        # guaranteed to look too new.
+        state.adjust_time(-10)
+
+        digest = state.get_sha1_for_entry(entry, abspath='a',
+                                          stat_value=stat_value)
+        self.assertEqual(['a', 'a'], state._sha1_log)
+        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+        self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
+                         state._dirblock_state)
+        state.save()
+
+        # However, if we move the clock forward so the file is considered
+        # "stable", it should just return the cached value.
+        state.adjust_time(20)
+        digest = state.get_sha1_for_entry(entry, abspath='a',
+                                          stat_value=stat_value)
+        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+        self.assertEqual(['a', 'a'], state._sha1_log)
+
+    def test_get_sha1_from_entry_no_stat_value(self):
+        """Passing the stat_value is optional."""
+        state = Sha1InstrumentedDirState.initialize('dirstate')
+        self.addCleanup(state.unlock)
+        self.build_tree(['a'])
+        # Add one where we don't provide the stat or sha already
+        state.add('a', 'a-id', 'file', None, '')
+        entry = state._get_entry(0, path_utf8='a')
+        digest = state.get_sha1_for_entry(entry, abspath='a')
+        self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+
+
+class TestPackStat(TestCaseWithTransport):
+
+    def assertPackStat(self, expected, stat_value):
+        """Check the packed and serialized form of a stat value."""
+        self.assertEqual(expected, dirstate.pack_stat(stat_value))
+
+    def test_pack_stat_int(self):
+        st = _FakeStat(6859L, 1172758614, 1172758617, 777L, 6499538L, 0100644)
+        # Make sure that all parameters have an impact on the packed stat.
+        self.assertPackStat('AAAay0Xm4FZF5uBZAAADCQBjLNIAAIGk', st)
+        st.st_size = 7000L
+        #                ay0 => bWE
+        self.assertPackStat('AAAbWEXm4FZF5uBZAAADCQBjLNIAAIGk', st)
+        st.st_mtime = 1172758620
+        #                     4FZ => 4Fx
+        self.assertPackStat('AAAbWEXm4FxF5uBZAAADCQBjLNIAAIGk', st)
+        st.st_ctime = 1172758630
+        #                          uBZ => uBm
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+        st.st_dev = 888L
+        #                                DCQ => DeA
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADeABjLNIAAIGk', st)
+        st.st_ino = 6499540L
+        #                                     LNI => LNQ
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADeABjLNQAAIGk', st)
+        st.st_mode = 0100744
+        #                                          IGk => IHk
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADeABjLNQAAIHk', st)
+
+    def test_pack_stat_float(self):
+        """On some platforms mtime and ctime are floats.
+
+        Make sure we don't get warnings or errors, and that we ignore changes <
+        1s
+        """
+        st = _FakeStat(7000L, 1172758614.0, 1172758617.0,
+                       777L, 6499538L, 0100644)
+        # These should all be the same as the integer counterparts
+        self.assertPackStat('AAAbWEXm4FZF5uBZAAADCQBjLNIAAIGk', st)
+        st.st_mtime = 1172758620.0
+        #                     FZF5 => FxF5
+        self.assertPackStat('AAAbWEXm4FxF5uBZAAADCQBjLNIAAIGk', st)
+        st.st_ctime = 1172758630.0
+        #                          uBZ => uBm
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+        # fractional seconds are discarded, so no change from above
+        st.st_mtime = 1172758620.453
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+        st.st_ctime = 1172758630.228
+        self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+
+
 class TestBisect(TestCaseWithTransport):
     """Test the ability to bisect into the disk format."""
 

=== modified file 'bzrlib/tests/test_workingtree_4.py'
--- a/bzrlib/tests/test_workingtree_4.py	2007-02-26 00:45:31 +0000
+++ b/bzrlib/tests/test_workingtree_4.py	2007-03-01 19:58:38 +0000
@@ -42,8 +42,6 @@
         t = control.get_workingtree_transport(None)
         self.assertEqualDiff('Bazaar Working Tree format 4\n',
                              t.get('format').read())
-        self.assertEqualDiff('### bzr hashcache v5\n',
-                             t.get('stat-cache').read())
         self.assertFalse(t.has('inventory.basis'))
         # no last-revision file means 'None' or 'NULLREVISION'
         self.assertFalse(t.has('last-revision'))
@@ -59,7 +57,7 @@
 
     def test_uses_lockdir(self):
         """WorkingTreeFormat4 uses its own LockDir:
-            
+
             - lock is a directory
             - when the WorkingTree is locked, LockDir can see that
         """

=== modified file 'bzrlib/tests/workingtree_implementations/test_readonly.py'
--- a/bzrlib/tests/workingtree_implementations/test_readonly.py	2006-12-19 23:33:17 +0000
+++ b/bzrlib/tests/workingtree_implementations/test_readonly.py	2007-03-01 19:58:38 +0000
@@ -89,9 +89,15 @@
         # test so that they pass. For now, we just assert that we have the
         # right type of objects available.
         the_hashcache = getattr(tree, '_hashcache', None)
-        self.assertNotEqual(None, the_hashcache)
-        self.assertIsInstance(the_hashcache, hashcache.HashCache)
-        the_hashcache._cutoff_time = self._custom_cutoff_time
+        if the_hashcache is not None:
+            self.assertIsInstance(the_hashcache, hashcache.HashCache)
+            the_hashcache._cutoff_time = self._custom_cutoff_time
+            hack_dirstate = False
+        else:
+            # DirState trees don't have a HashCache, but they do have the same
+            # function as part of the DirState. However, until the tree is
+            # locked, we don't have a DirState to modify
+            hack_dirstate = True
 
         # Make it a little dirty
         self.build_tree_contents([('tree/a', 'new contents of a\n')])
@@ -101,6 +107,8 @@
 
         tree.lock_read()
         try:
+            if hack_dirstate:
+                tree._dirstate._sha_cutoff_time = self._custom_cutoff_time
             # Make sure we check all the files
             for file_id in tree:
                 size = tree.get_file_size(file_id)

=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py	2007-03-01 16:48:50 +0000
+++ b/bzrlib/workingtree_4.py	2007-03-01 19:58:38 +0000
@@ -125,7 +125,6 @@
         """
         self._format = _format
         self.bzrdir = _bzrdir
-        from bzrlib.hashcache import HashCache
         from bzrlib.trace import note, mutter
         assert isinstance(basedir, basestring), \
             "base directory %r is not a string" % basedir
@@ -140,22 +139,6 @@
         assert isinstance(_control_files, LockableFiles), \
             "_control_files must be a LockableFiles, not %r" % _control_files
         self._control_files = _control_files
-        # update the whole cache up front and write to disk if anything changed;
-        # in the future we might want to do this more selectively
-        # two possible ways offer themselves : in self._unlock, write the cache
-        # if needed, or, when the cache sees a change, append it to the hash
-        # cache file, and have the parser take the most recent entry for a
-        # given path only.
-        cache_filename = self.bzrdir.get_workingtree_transport(None).local_abspath('stat-cache')
-        hc = self._hashcache = HashCache(basedir, cache_filename, self._control_files._file_mode)
-        hc.read()
-        # is this scan needed ? it makes things kinda slow.
-        #hc.scan()
-
-        if hc.needs_write:
-            mutter("write hc")
-            hc.write()
-
         self._dirty = None
         #-------------
         # during a read or write lock these objects are set, and are
@@ -369,13 +352,17 @@
 
     def get_file_sha1(self, file_id, path=None, stat_value=None):
         # check file id is valid unconditionally.
-        key, details = self._get_entry(file_id=file_id, path=path)
-        assert key is not None, 'what error should this raise'
+        entry = self._get_entry(file_id=file_id, path=path)
+        assert entry[0] is not None, 'what error should this raise'
         # TODO:
         # if row stat is valid, use cached sha1, else, get a new sha1.
         if path is None:
-            path = pathjoin(key[0], key[1]).decode('utf8')
-        return self._hashcache.get_sha1(path, stat_value)
+            path = pathjoin(entry[0][0], entry[0][1]).decode('utf8')
+
+        file_abspath = self.abspath(path)
+        state = self.current_dirstate()
+        return state.get_sha1_for_entry(entry, file_abspath,
+                                        stat_value=stat_value)
 
     def _get_inventory(self):
         """Get the inventory for the tree. This is only valid within a lock."""
@@ -941,7 +928,6 @@
     def unlock(self):
         """Unlock in format 4 trees needs to write the entire dirstate."""
         if self._control_files._lock_count == 1:
-            self._write_hashcache_if_dirty()
             # eventually we should do signature checking during read locks for
             # dirstate updates.
             if self._control_files._lock_mode == 'w':
@@ -1262,11 +1248,11 @@
         return self._repository.get_revision(last_changed_revision).timestamp
 
     def get_file_sha1(self, file_id, path=None, stat_value=None):
-        # TODO: if path is present, fast-path on that, as inventory
-        # might not be present
-        ie = self.inventory[file_id]
-        if ie.kind == "file":
-            return ie.text_sha1
+        entry = self._get_entry(file_id=file_id, path=path)
+        parent_index = self._get_parent_index()
+        parent_details = entry[1][parent_index]
+        if parent_details[0] == 'f':
+            return parent_details[1]
         return None
 
     def get_file(self, file_id):
@@ -1639,8 +1625,9 @@
                                 content_change = True
                             else:
                                 # maybe the same. Get the hash
-                                new_hash = self.target._hashcache.get_sha1(
-                                                            path, path_info[3])
+                                new_hash = state.get_sha1_for_entry(entry,
+                                                abspath=path_info[4],
+                                                stat_value=path_info[3])
                                 content_change = (new_hash != source_details[1])
                         target_exec = bool(
                             stat.S_ISREG(path_info[3].st_mode)



More information about the bazaar-commits mailing list