Rev 2490: Update WorkingTree4 so that it doesn't use a HashCache, in http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate-nohc
John Arbash Meinel
john at arbash-meinel.com
Thu Mar 1 19:58:48 GMT 2007
At http://bzr.arbash-meinel.com/branches/bzr/experimental/dirstate-nohc
------------------------------------------------------------
revno: 2490
revision-id: john at arbash-meinel.com-20070301195838-7p053os20qwr6qf7
parent: john at arbash-meinel.com-20070301164850-80ih12xza1edee6i
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: dirstate-nohc
timestamp: Thu 2007-03-01 13:58:38 -0600
message:
Update WorkingTree4 so that it doesn't use a HashCache,
instead caching the sha values and stat fingerprint in the 'current'
section.
modified:
bzrlib/dirstate.py dirstate.py-20060728012006-d6mvoihjb3je9peu-1
bzrlib/tests/test_dirstate.py test_dirstate.py-20060728012006-d6mvoihjb3je9peu-2
bzrlib/tests/test_workingtree_4.py test_workingtree_4.p-20070223025758-531n3tznl3zacv2o-1
bzrlib/tests/workingtree_implementations/test_readonly.py test_readonly.py-20061219164256-7imbl63m4j15n0es-1
bzrlib/workingtree_4.py workingtree_4.py-20070208044105-5fgpc5j3ljlh5q6c-1
-------------- next part --------------
=== modified file 'bzrlib/dirstate.py'
--- a/bzrlib/dirstate.py 2007-03-01 12:04:50 +0000
+++ b/bzrlib/dirstate.py 2007-03-01 19:58:38 +0000
@@ -191,12 +191,13 @@
"""
-import base64
import bisect
+import codecs
import cStringIO
import os
import sha
import struct
+import time
import zlib
from bzrlib import (
@@ -297,7 +298,7 @@
within the root.
:param file_id: The file id of the path being added.
:param kind: The kind of the path.
- :param stat: The output of os.lstate for the path.
+ :param stat: The output of os.lstat for the path.
:param link_or_sha1: The sha value of the file, or the target of a
symlink. '' for directories.
"""
@@ -900,7 +901,7 @@
def _entry_to_line(self, entry):
"""Serialize entry to a NULL delimited line ready for _get_output_lines.
-
+
:param entry: An entry_tuple as defined in the module docstring.
"""
entire_entry = list(entry[0])
@@ -1014,6 +1015,68 @@
raise
return result
+ def get_sha1_for_entry(self, entry, abspath, stat_value=None):
+ """Return the sha1 sum for a given file.
+
+ :param entry: This is the dirblock entry for the file in question.
+ :param abspath: The path on disk for this file.
+ :param stat_value: (optional) if we already have done a stat on the
+ file, re-use it.
+ :return: The sha1 hexdigest of the file (40 bytes)
+ """
+ # This code assumes that the entry passed in is directly held in one of
+ # the internal _dirblocks. So the dirblock state must have already been
+ # read.
+ assert self._dirblock_state != DirState.NOT_IN_MEMORY
+ assert entry[1][0][0] == 'f', \
+ 'can only get sha1 for a file not: %s %s' % (
+ DirState._minikind_to_kind[entry[1][0][0]], entry[0])
+ if stat_value is None:
+ stat_value = os.lstat(abspath)
+ packed_stat = pack_stat(stat_value)
+ saved_sha1_digest = entry[1][0][1]
+ saved_file_size = entry[1][0][2]
+ saved_packed_stat = entry[1][0][4]
+ if (packed_stat == saved_packed_stat
+ and saved_sha1_digest != ''
+ # size should also be in packed_stat
+ and saved_file_size == stat_value.st_size):
+ # The stat hasn't changed since we saved, so we can potentially
+ # re-use the saved sha hash.
+ cutoff = self._sha_cutoff_time()
+ if (stat_value.st_mtime < cutoff
+ and stat_value.st_ctime < cutoff):
+ # Return the existing fingerprint
+ return saved_sha1_digest
+ # If we have gotten this far, that means that we need to actually
+ # process the file.
+ new_sha1_digest = self._sha1_file(abspath)
+ # TODO: jam 20070301 Is it worth checking to see if the new sha is
+ # actually different? I'm guessing not, because we wouldn't have
+ # gotten this far otherwise.
+ entry[1][0] = ('f', new_sha1_digest, stat_value.st_size,
+ entry[1][0][3], # Executable?
+ packed_stat
+ )
+ self._dirblock_state = DirState.IN_MEMORY_MODIFIED
+ return new_sha1_digest
+
+ def _sha_cutoff_time(self):
+ """Return cutoff time.
+
+ Files modified more recently than this time are at risk of being
+ undetectably modified and so can't be cached.
+ """
+ return int(time.time()) - 3
+
+ def _sha1_file(self, abspath):
+ """Calculate the SHA1 of a file by reading the full text"""
+ f = file(abspath, 'rb', buffering=65000)
+ try:
+ return osutils.sha_file(f)
+ finally:
+ f.close()
+
def get_ghosts(self):
"""Return a list of the parent tree revision ids that are ghosts."""
self._read_header_if_needed()
@@ -1226,8 +1289,8 @@
path_utf8=real_path)
return None, None
- @staticmethod
- def initialize(path):
+ @classmethod
+ def initialize(cls, path):
"""Create a new dirstate on path.
The new dirstate will be an empty tree - that is it has no parents,
@@ -1245,7 +1308,7 @@
# stock empty dirstate information - a root with ROOT_ID, no children,
# and no parents. Finally it calls save() to ensure that this data will
# persist.
- result = DirState(path)
+ result = cls(path)
# root dir and root dir contents with no children.
empty_tree_dirblocks = [('', []), ('', [])]
# a new root directory, with a NULLSTAT.
@@ -2048,7 +2111,10 @@
return lo
-def pack_stat(st, _encode=base64.encodestring, _pack=struct.pack):
+_base64_encoder = codecs.getencoder('base64')
+
+
+def pack_stat(st, _encode=_base64_encoder, _pack=struct.pack):
"""Convert stat values into a packed representation."""
# jam 20060614 it isn't really worth removing more entries if we
# are going to leave it in packed form.
@@ -2058,5 +2124,5 @@
# base64.encode always adds a final newline, so strip it off
return _encode(_pack('>llllll'
- , st.st_size, st.st_mtime, st.st_ctime
- , st.st_dev, st.st_ino, st.st_mode))[:-1]
+ , st.st_size, int(st.st_mtime), int(st.st_ctime)
+ , st.st_dev, st.st_ino, st.st_mode))[0][:-1]
=== modified file 'bzrlib/tests/test_dirstate.py'
--- a/bzrlib/tests/test_dirstate.py 2007-03-01 12:04:50 +0000
+++ b/bzrlib/tests/test_dirstate.py 2007-03-01 19:58:38 +0000
@@ -1005,6 +1005,163 @@
self.assertEqual(expected, dirblock_names)
+class Sha1InstrumentedDirState(dirstate.DirState):
+ """An DirState with instrumented sha1 functionality."""
+
+ def __init__(self, path):
+ super(Sha1InstrumentedDirState, self).__init__(path)
+ self._time_offset = 0
+ self._sha1_log = []
+
+ def _sha_cutoff_time(self):
+ timestamp = super(Sha1InstrumentedDirState, self)._sha_cutoff_time()
+ return timestamp + self._time_offset
+
+ def _sha1_file(self, abspath):
+ self._sha1_log.append(abspath)
+ return super(Sha1InstrumentedDirState, self)._sha1_file(abspath)
+
+ def adjust_time(self, secs):
+ """Move the clock forward or back.
+
+ :param secs: The amount to adjust the clock by. Positive values make it
+ seem as if we are in the future, negative values make it seem like we
+ are in the past.
+ """
+ self._time_offset += secs
+
+
+class _FakeStat(object):
+ """A class with the same attributes as a real stat result."""
+
+ def __init__(self, size, mtime, ctime, dev, ino, mode):
+ self.st_size = size
+ self.st_mtime = mtime
+ self.st_ctime = ctime
+ self.st_dev = dev
+ self.st_ino = ino
+ self.st_mode = mode
+
+
+class TestDirStateGetSha1(TestCaseWithDirState):
+ """Test the DirState.get_sha1 functions"""
+
+ def test_get_sha1_from_entry(self):
+ state = Sha1InstrumentedDirState.initialize('dirstate')
+ self.addCleanup(state.unlock)
+ self.build_tree(['a'])
+ # Add one where we don't provide the stat or sha already
+ state.add('a', 'a-id', 'file', None, '')
+ entry = state._get_entry(0, path_utf8='a')
+ self.assertEqual(('', 'a', 'a-id'), entry[0])
+ self.assertEqual([('f', '', 0, False, dirstate.DirState.NULLSTAT)],
+ entry[1])
+ # Flush the buffers to disk
+ state.save()
+ self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
+ state._dirblock_state)
+
+ stat_value = os.lstat('a')
+ digest = state.get_sha1_for_entry(entry, abspath='a',
+ stat_value=stat_value)
+ self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+ packed_stat = dirstate.pack_stat(stat_value)
+
+ # The dirblock entry should be updated with the new info
+ self.assertEqual([('f', digest, 14, False, packed_stat)], entry[1])
+ self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
+ state._dirblock_state)
+ self.assertEqual(['a'], state._sha1_log)
+
+ state.save()
+ self.assertEqual(dirstate.DirState.IN_MEMORY_UNMODIFIED,
+ state._dirblock_state)
+
+ # If we do it again right away, we don't know if the file has changed
+ # so we will re-read the file. Roll the clock back so the file is
+ # guaranteed to look too new.
+ state.adjust_time(-10)
+
+ digest = state.get_sha1_for_entry(entry, abspath='a',
+ stat_value=stat_value)
+ self.assertEqual(['a', 'a'], state._sha1_log)
+ self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+ self.assertEqual(dirstate.DirState.IN_MEMORY_MODIFIED,
+ state._dirblock_state)
+ state.save()
+
+ # However, if we move the clock forward so the file is considered
+ # "stable", it should just returned the cached value.
+ state.adjust_time(20)
+ digest = state.get_sha1_for_entry(entry, abspath='a',
+ stat_value=stat_value)
+ self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+ self.assertEqual(['a', 'a'], state._sha1_log)
+
+ def test_get_sha1_from_entry_no_stat_value(self):
+ """Passing the stat_value is optional."""
+ state = Sha1InstrumentedDirState.initialize('dirstate')
+ self.addCleanup(state.unlock)
+ self.build_tree(['a'])
+ # Add one where we don't provide the stat or sha already
+ state.add('a', 'a-id', 'file', None, '')
+ entry = state._get_entry(0, path_utf8='a')
+ digest = state.get_sha1_for_entry(entry, abspath='a')
+ self.assertEqual('b50e5406bb5e153ebbeb20268fcf37c87e1ecfb6', digest)
+
+
+class TestPackStat(TestCaseWithTransport):
+
+ def assertPackStat(self, expected, stat_value):
+ """Check the packed and serialized form of a stat value."""
+ self.assertEqual(expected, dirstate.pack_stat(stat_value))
+
+ def test_pack_stat_int(self):
+ st = _FakeStat(6859L, 1172758614, 1172758617, 777L, 6499538L, 0100644)
+ # Make sure that all parameters have an impact on the packed stat.
+ self.assertPackStat('AAAay0Xm4FZF5uBZAAADCQBjLNIAAIGk', st)
+ st.st_size = 7000L
+ # ay0 => bWE
+ self.assertPackStat('AAAbWEXm4FZF5uBZAAADCQBjLNIAAIGk', st)
+ st.st_mtime = 1172758620
+ # 4FZ => 4Fx
+ self.assertPackStat('AAAbWEXm4FxF5uBZAAADCQBjLNIAAIGk', st)
+ st.st_ctime = 1172758630
+ # uBZ => uBm
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+ st.st_dev = 888L
+ # DCQ => DeA
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADeABjLNIAAIGk', st)
+ st.st_ino = 6499540L
+ # LNI => LNQ
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADeABjLNQAAIGk', st)
+ st.st_mode = 0100744
+ # IGk => IHk
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADeABjLNQAAIHk', st)
+
+ def test_pack_stat_float(self):
+ """On some platforms mtime and ctime are floats.
+
+ Make sure we don't get warnings or errors, and that we ignore changes <
+ 1s
+ """
+ st = _FakeStat(7000L, 1172758614.0, 1172758617.0,
+ 777L, 6499538L, 0100644)
+ # These should all be the same as the integer counterparts
+ self.assertPackStat('AAAbWEXm4FZF5uBZAAADCQBjLNIAAIGk', st)
+ st.st_mtime = 1172758620.0
+ # FZF5 => FxF5
+ self.assertPackStat('AAAbWEXm4FxF5uBZAAADCQBjLNIAAIGk', st)
+ st.st_ctime = 1172758630.0
+ # uBZ => uBm
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+ # fractional seconds are discarded, so no change from above
+ st.st_mtime = 1172758620.453
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+ st.st_ctime = 1172758630.228
+ self.assertPackStat('AAAbWEXm4FxF5uBmAAADCQBjLNIAAIGk', st)
+
+
class TestBisect(TestCaseWithTransport):
"""Test the ability to bisect into the disk format."""
=== modified file 'bzrlib/tests/test_workingtree_4.py'
--- a/bzrlib/tests/test_workingtree_4.py 2007-02-26 00:45:31 +0000
+++ b/bzrlib/tests/test_workingtree_4.py 2007-03-01 19:58:38 +0000
@@ -42,8 +42,6 @@
t = control.get_workingtree_transport(None)
self.assertEqualDiff('Bazaar Working Tree format 4\n',
t.get('format').read())
- self.assertEqualDiff('### bzr hashcache v5\n',
- t.get('stat-cache').read())
self.assertFalse(t.has('inventory.basis'))
# no last-revision file means 'None' or 'NULLREVISION'
self.assertFalse(t.has('last-revision'))
@@ -59,7 +57,7 @@
def test_uses_lockdir(self):
"""WorkingTreeFormat4 uses its own LockDir:
-
+
- lock is a directory
- when the WorkingTree is locked, LockDir can see that
"""
=== modified file 'bzrlib/tests/workingtree_implementations/test_readonly.py'
--- a/bzrlib/tests/workingtree_implementations/test_readonly.py 2006-12-19 23:33:17 +0000
+++ b/bzrlib/tests/workingtree_implementations/test_readonly.py 2007-03-01 19:58:38 +0000
@@ -89,9 +89,15 @@
# test so that they pass. For now, we just assert that we have the
# right type of objects available.
the_hashcache = getattr(tree, '_hashcache', None)
- self.assertNotEqual(None, the_hashcache)
- self.assertIsInstance(the_hashcache, hashcache.HashCache)
- the_hashcache._cutoff_time = self._custom_cutoff_time
+ if the_hashcache is not None:
+ self.assertIsInstance(the_hashcache, hashcache.HashCache)
+ the_hashcache._cutoff_time = self._custom_cutoff_time
+ hack_dirstate = False
+ else:
+ # DirState trees don't have a HashCache, but they do have the same
+ # function as part of the DirState. However, until the tree is
+ # locked, we don't have a DirState to modify
+ hack_dirstate = True
# Make it a little dirty
self.build_tree_contents([('tree/a', 'new contents of a\n')])
@@ -101,6 +107,8 @@
tree.lock_read()
try:
+ if hack_dirstate:
+ tree._dirstate._sha_cutoff_time = self._custom_cutoff_time
# Make sure we check all the files
for file_id in tree:
size = tree.get_file_size(file_id)
=== modified file 'bzrlib/workingtree_4.py'
--- a/bzrlib/workingtree_4.py 2007-03-01 16:48:50 +0000
+++ b/bzrlib/workingtree_4.py 2007-03-01 19:58:38 +0000
@@ -125,7 +125,6 @@
"""
self._format = _format
self.bzrdir = _bzrdir
- from bzrlib.hashcache import HashCache
from bzrlib.trace import note, mutter
assert isinstance(basedir, basestring), \
"base directory %r is not a string" % basedir
@@ -140,22 +139,6 @@
assert isinstance(_control_files, LockableFiles), \
"_control_files must be a LockableFiles, not %r" % _control_files
self._control_files = _control_files
- # update the whole cache up front and write to disk if anything changed;
- # in the future we might want to do this more selectively
- # two possible ways offer themselves : in self._unlock, write the cache
- # if needed, or, when the cache sees a change, append it to the hash
- # cache file, and have the parser take the most recent entry for a
- # given path only.
- cache_filename = self.bzrdir.get_workingtree_transport(None).local_abspath('stat-cache')
- hc = self._hashcache = HashCache(basedir, cache_filename, self._control_files._file_mode)
- hc.read()
- # is this scan needed ? it makes things kinda slow.
- #hc.scan()
-
- if hc.needs_write:
- mutter("write hc")
- hc.write()
-
self._dirty = None
#-------------
# during a read or write lock these objects are set, and are
@@ -369,13 +352,17 @@
def get_file_sha1(self, file_id, path=None, stat_value=None):
# check file id is valid unconditionally.
- key, details = self._get_entry(file_id=file_id, path=path)
- assert key is not None, 'what error should this raise'
+ entry = self._get_entry(file_id=file_id, path=path)
+ assert entry[0] is not None, 'what error should this raise'
# TODO:
# if row stat is valid, use cached sha1, else, get a new sha1.
if path is None:
- path = pathjoin(key[0], key[1]).decode('utf8')
- return self._hashcache.get_sha1(path, stat_value)
+ path = pathjoin(entry[0][0], entry[0][1]).decode('utf8')
+
+ file_abspath = self.abspath(path)
+ state = self.current_dirstate()
+ return state.get_sha1_for_entry(entry, file_abspath,
+ stat_value=stat_value)
def _get_inventory(self):
"""Get the inventory for the tree. This is only valid within a lock."""
@@ -941,7 +928,6 @@
def unlock(self):
"""Unlock in format 4 trees needs to write the entire dirstate."""
if self._control_files._lock_count == 1:
- self._write_hashcache_if_dirty()
# eventually we should do signature checking during read locks for
# dirstate updates.
if self._control_files._lock_mode == 'w':
@@ -1262,11 +1248,11 @@
return self._repository.get_revision(last_changed_revision).timestamp
def get_file_sha1(self, file_id, path=None, stat_value=None):
- # TODO: if path is present, fast-path on that, as inventory
- # might not be present
- ie = self.inventory[file_id]
- if ie.kind == "file":
- return ie.text_sha1
+ entry = self._get_entry(file_id=file_id, path=path)
+ parent_index = self._get_parent_index()
+ parent_details = entry[1][parent_index]
+ if parent_details[0] == 'f':
+ return parent_details[1]
return None
def get_file(self, file_id):
@@ -1639,8 +1625,9 @@
content_change = True
else:
# maybe the same. Get the hash
- new_hash = self.target._hashcache.get_sha1(
- path, path_info[3])
+ new_hash = state.get_sha1_for_entry(entry,
+ abspath=path_info[4],
+ stat_value=path_info[3])
content_change = (new_hash != source_details[1])
target_exec = bool(
stat.S_ISREG(path_info[3].st_mode)
More information about the bazaar-commits
mailing list