Rev 4181: LRUCache is now implemented with a dict pointing into a linked list, in lp:///~jameinel/bzr/lru_cache_linked_lst
John Arbash Meinel
john at arbash-meinel.com
Sun Mar 22 19:05:11 GMT 2009
At lp:///~jameinel/bzr/lru_cache_linked_lst
------------------------------------------------------------
revno: 4181
revision-id: john at arbash-meinel.com-20090322190503-wabx7fjrgarxx9c5
parent: john at arbash-meinel.com-20090322170709-r389wogofbe8n6d9
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: lru_cache_linked_lst
timestamp: Sun 2009-03-22 14:05:03 -0500
message:
LRUCache is now implemented with a dict pointing into a linked list,
rather than multiple dicts.
This turns out to be rather good for memory savings, and even a little
bit faster. Proper data structures FTW.
Still seeing ~500ms overhead versus a FIFO or dict, but some of that
is just the overhead you expect from having to maintain the LRU info.
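To make the shape of the change easier to follow, here is a standalone
sketch of the structure (illustrative only, not the bzrlib code; the
names SimpleLRU and _Node are made up for the example). The dict gives
O(1) lookup, and the doubly-linked list gives O(1) move-to-front and
eviction, so there is no access queue to compact any more:

class _Node(object):
    """One entry in the doubly-linked list, newest at the head."""

    __slots__ = ('prev', 'next', 'key', 'value')

    def __init__(self, key, value):
        self.prev = None
        self.next = None
        self.key = key
        self.value = value


class SimpleLRU(object):
    """Minimal dict + linked-list LRU, for illustration."""

    def __init__(self, max_cache=100):
        self._cache = {}       # key -> _Node
        self._head = None      # most recently used
        self._tail = None      # least recently used
        self._max_cache = max_cache

    def __getitem__(self, key):
        node = self._cache[key]      # O(1) dict lookup
        self._move_to_front(node)    # O(1) LRU bookkeeping
        return node.value

    def __setitem__(self, key, value):
        node = self._cache.get(key)
        if node is None:
            node = _Node(key, value)
            self._cache[key] = node
        else:
            node.value = value
        self._move_to_front(node)
        while len(self._cache) > self._max_cache:
            self._remove_node(self._tail)

    def _move_to_front(self, node):
        if node is self._head:
            return
        # Unlink the node from wherever it currently sits
        if node.prev is not None:
            node.prev.next = node.next
        if node.next is not None:
            node.next.prev = node.prev
        if node is self._tail:
            self._tail = node.prev
        # Relink it at the head of the list
        node.prev = None
        node.next = self._head
        if self._head is not None:
            self._head.prev = node
        self._head = node
        if self._tail is None:
            self._tail = node

    def _remove_node(self, node):
        # Splice the node out and drop it from the dict
        if node.prev is not None:
            node.prev.next = node.next
        if node.next is not None:
            node.next.prev = node.prev
        if node is self._head:
            self._head = node.next
        if node is self._tail:
            self._tail = node.prev
        del self._cache[node.key]

The real classes below carry cleanup callbacks and (for LRUSizeCache) a
per-node size on top of this skeleton, but the pointer juggling is the
same.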
-------------- next part --------------
=== modified file 'bzrlib/lru_cache.py'
--- a/bzrlib/lru_cache.py 2009-03-22 17:00:12 +0000
+++ b/bzrlib/lru_cache.py 2009-03-22 19:05:03 +0000
@@ -21,6 +21,42 @@
from bzrlib import symbol_versioning
+class _LRUNode(object):
+ """This maintains the linked-list which is the lru internals."""
+
+ __slots__ = ('prev', 'next', 'key', 'value', 'cleanup', 'size')
+
+ def __init__(self, key, value, cleanup=None):
+ self.prev = None
+ self.next = None
+ self.key = key
+ self.value = value
+ self.cleanup = cleanup
+ # TODO: We could compute this 'on-the-fly' like we used to, and remove
+ # one pointer from this object, we just need to decide if it
+ # actually costs us much of anything in normal usage
+ self.size = None
+
+ def __repr__(self):
+ if self.next is None:
+ next = None
+ else:
+ next = self.next.key
+ if self.prev is None:
+ prev = None
+ else:
+ prev = self.prev.key
+ return '%s(%r n:%r p:%r)' % (self.__class__.__name__, self.key,
+ next, prev)
+
+ def run_cleanup(self):
+ if self.cleanup is not None:
+ self.cleanup(self.key, self.value)
+ self.cleanup = None
+ # Just make sure to break any refcycles, etc
+ self.value = None
+
+
class LRUCache(object):
"""A class which manages a cache of entries, removing unused ones."""
@@ -33,22 +69,42 @@
DeprecationWarning)
after_cleanup_count = after_cleanup_size
self._cache = {}
- self._cleanup = {}
- self._queue = deque() # Track when things are accessed
- self._refcount = {} # number of entries in self._queue for each key
+ # The "HEAD" of the lru linked list
+ self._most_recently_used = None
+ # The "TAIL" of the lru linked list
+ self._last_recently_used = None
self._update_max_cache(max_cache, after_cleanup_count)
def __contains__(self, key):
return key in self._cache
def __getitem__(self, key):
- val = self._cache[key]
- self._record_access(key)
- return val
+ node = self._cache[key]
+ # TODO: We may want to inline _record_access, to decrease the cost of
+ # __getitem__ calls
+ self._record_access(node)
+ return node.value
def __len__(self):
return len(self._cache)
+ def _walk_lru(self):
+ """Walk the LRU list."""
+ node = self._most_recently_used
+ if node is not None:
+ assert node.prev is None
+ while node is not None:
+ if node.next is None:
+ assert node is self._last_recently_used
+ else:
+ assert node.next.prev is node
+ if node.prev is None:
+ assert node is self._most_recently_used
+ else:
+ assert node.prev.next is node
+ yield node
+ node = node.next
+
def add(self, key, value, cleanup=None):
"""Add a new value to the cache.
@@ -61,11 +117,14 @@
'value' should be cleaned up.
"""
if key in self._cache:
- self._remove(key)
- self._cache[key] = value
- if cleanup is not None:
- self._cleanup[key] = cleanup
- self._record_access(key)
+ node = self._cache[key]
+ node.run_cleanup()
+ node.value = value
+ node.cleanup = cleanup
+ else:
+ node = _LRUNode(key, value, cleanup=cleanup)
+ self._cache[key] = node
+ self._record_access(node)
if len(self._cache) > self._max_cache:
# Trigger the cleanup
@@ -76,9 +135,10 @@
return self._max_cache
def get(self, key, default=None):
- if key in self._cache:
- return self[key]
- return default
+ node = self._cache.get(key, None)
+ if node is None:
+ return default
+ return node.value
def keys(self):
"""Get the list of keys currently cached.
@@ -91,6 +151,10 @@
"""
return self._cache.keys()
+ def items(self):
+ """Get the key:value pairs as a dict."""
+ return dict((k, n.value) for k, n in self._cache.iteritems())
+
def cleanup(self):
"""Clear the cache until it shrinks to the requested size.
@@ -107,38 +171,54 @@
"""Add a value to the cache, there will be no cleanup function."""
self.add(key, value, cleanup=None)
- def _record_access(self, key):
+ def _record_access(self, node):
"""Record that key was accessed."""
- self._queue.append(key)
- # Can't use setdefault because you can't += 1 the result
- self._refcount[key] = self._refcount.get(key, 0) + 1
-
- # If our access queue is too large, clean it up too
- if len(self._queue) > self._compact_queue_length:
- self._compact_queue()
-
- def _compact_queue(self):
- """Compact the queue, leaving things in sorted last appended order."""
- new_queue = deque()
- for item in self._queue:
- if self._refcount[item] == 1:
- new_queue.append(item)
- else:
- self._refcount[item] -= 1
- self._queue = new_queue
- # All entries should be of the same size. There should be one entry in
- # queue for each entry in cache, and all refcounts should == 1
- if not (len(self._queue) == len(self._cache) ==
- len(self._refcount) == sum(self._refcount.itervalues())):
- raise AssertionError()
-
- def _remove(self, key):
- """Remove an entry, making sure to maintain the invariants."""
- cleanup = self._cleanup.pop(key, None)
- val = self._cache.pop(key)
- if cleanup is not None:
- cleanup(key, val)
- return val
+ # Move 'node' to the front of the queue
+ if self._most_recently_used is None:
+ assert len(self._cache) == 1
+ self._most_recently_used = node
+ self._last_recently_used = node
+ assert node.next is None
+ assert node.prev is None
+ return
+ elif node is self._most_recently_used:
+ # Nothing to do, this node is already at the head of the queue
+ return
+ elif node is self._last_recently_used:
+ assert self._last_recently_used is not None
+ self._last_recently_used = node.prev
+ assert self._last_recently_used is not None
+ # We've taken care of the tail pointer, remove the node, and insert it
+ # at the front
+ # REMOVE
+ if node.prev is not None:
+ node.prev.next = node.next
+ if node.next is not None:
+ node.next.prev = node.prev
+ # INSERT
+ node.next = self._most_recently_used
+ self._most_recently_used = node
+ node.next.prev = node
+ node.prev = None
+
+ def _remove_node(self, node):
+ assert node is not None
+ if node is self._last_recently_used:
+ self._last_recently_used = node.prev
+ node2 = self._cache.pop(node.key)
+ assert node2 is node
+ del node2
+ # If we have removed all entries, remove the head pointer as well
+ if self._last_recently_used is None:
+ assert len(self._cache) == 0
+ assert self._most_recently_used is node
+ self._most_recently_used = None
+ node.run_cleanup()
+ # Shouldn't be necessary
+ # node.next = None
+ # node.prev = None
def _remove_lru(self):
"""Remove one entry from the lru, and handle consequences.
@@ -146,11 +226,7 @@
If there are no more references to the lru, then this entry should be
removed from the cache.
"""
- key = self._queue.popleft()
- self._refcount[key] -= 1
- if not self._refcount[key]:
- del self._refcount[key]
- self._remove(key)
+ self._remove_node(self._last_recently_used)
def clear(self):
"""Clear out all of the cache."""
@@ -168,11 +244,8 @@
if after_cleanup_count is None:
self._after_cleanup_count = self._max_cache * 8 / 10
else:
- self._after_cleanup_count = min(after_cleanup_count, self._max_cache)
-
- self._compact_queue_length = 4*self._max_cache
- if len(self._queue) > self._compact_queue_length:
- self._compact_queue()
+ self._after_cleanup_count = min(after_cleanup_count,
+ self._max_cache)
self.cleanup()
@@ -204,9 +277,6 @@
self._compute_size = compute_size
if compute_size is None:
self._compute_size = len
- # This approximates that texts are > 0.5k in size. It only really
- # effects when we clean up the queue, so we don't want it to be too
- # large.
self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)
LRUCache.__init__(self, max_cache=max(int(max_size/512), 1))
@@ -221,16 +291,22 @@
:param cleanup: None or a function taking (key, value) to indicate
'value' should be cleaned up.
"""
- if key in self._cache:
- self._remove(key)
+ node = self._cache.get(key, None)
value_len = self._compute_size(value)
if value_len >= self._after_cleanup_size:
+ if node is not None:
+ # The new value is too large, we won't be replacing the value,
+ # just removing the node completely
+ self._remove_node(node)
return
+ if node is None:
+ node = _LRUNode(key, value, cleanup=cleanup)
+ self._cache[key] = node
+ else:
+ # Replacing an existing value: clean up the old one first
+ node.run_cleanup()
+ node.value = value
+ node.cleanup = cleanup
+ self._value_size -= node.size
+ node.size = value_len
self._value_size += value_len
- self._cache[key] = value
- if cleanup is not None:
- self._cleanup[key] = cleanup
- self._record_access(key)
+ self._record_access(node)
if self._value_size > self._max_size:
# Time to cleanup
@@ -246,10 +322,9 @@
while self._value_size > self._after_cleanup_size:
self._remove_lru()
- def _remove(self, key):
- """Remove an entry, making sure to maintain the invariants."""
- val = LRUCache._remove(self, key)
- self._value_size -= self._compute_size(val)
+ def _remove_node(self, node):
+ self._value_size -= node.size
+ LRUCache._remove_node(self, node)
def resize(self, max_size, after_cleanup_size=None):
"""Change the number of bytes that will be cached."""
=== modified file 'bzrlib/tests/test_lru_cache.py'
--- a/bzrlib/tests/test_lru_cache.py 2009-03-22 17:07:09 +0000
+++ b/bzrlib/tests/test_lru_cache.py 2009-03-22 19:05:03 +0000
@@ -73,18 +73,6 @@
self.failIf('foo' in cache)
- def test_queue_stays_bounded(self):
- """Lots of accesses does not cause the queue to grow without bound."""
- cache = lru_cache.LRUCache(max_cache=10)
-
- cache['baz'] = 'biz'
- cache['foo'] = 'bar'
-
- for i in xrange(1000):
- cache['baz']
-
- self.failUnless(len(cache._queue) < 40)
-
def test_cleanup(self):
"""Test that we can use a cleanup function."""
cleanup_called = []
@@ -199,20 +187,14 @@
cache.add(4, 30)
cache.add(5, 35)
- self.assertEqual([1, 2, 3, 4, 5], list(cache._queue))
+ self.assertEqual([5, 4, 3, 2, 1], [n.key for n in cache._walk_lru()])
# Now access some randomly
cache[2]
cache[5]
cache[3]
cache[2]
- self.assertEqual([1, 2, 3, 4, 5, 2, 5, 3, 2], list(cache._queue))
- self.assertEqual({1:1, 2:3, 3:2, 4:1, 5:2}, cache._refcount)
-
- # Compacting should save the last position
- cache._compact_queue()
- self.assertEqual([1, 4, 5, 3, 2], list(cache._queue))
- self.assertEqual({1:1, 2:1, 3:1, 4:1, 5:1}, cache._refcount)
+ self.assertEqual([2, 3, 5, 4, 1], [n.key for n in cache._walk_lru()])
def test_get(self):
cache = lru_cache.LRUCache(max_cache=5)
@@ -288,7 +270,6 @@
def test_basic_init(self):
cache = lru_cache.LRUSizeCache()
self.assertEqual(2048, cache._max_cache)
- self.assertEqual(4*2048, cache._compact_queue_length)
self.assertEqual(int(cache._max_size*0.8), cache._after_cleanup_size)
self.assertEqual(0, cache._value_size)
@@ -303,29 +284,30 @@
self.assertEqual(0, cache._value_size)
cache.add('my key', 'my value text')
self.assertEqual(13, cache._value_size)
- cache._remove('my key')
+ node = cache._cache['my key']
+ cache._remove_node(node)
self.assertEqual(0, cache._value_size)
def test_no_add_over_size(self):
"""Adding a large value may not be cached at all."""
cache = lru_cache.LRUSizeCache(max_size=10, after_cleanup_size=5)
self.assertEqual(0, cache._value_size)
- self.assertEqual({}, cache._cache)
+ self.assertEqual({}, cache.items())
cache.add('test', 'key')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test': 'key'}, cache.items())
cache.add('test2', 'key that is too big')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test':'key'}, cache.items())
# If we would add a key, only to cleanup and remove all cached entries,
# then obviously that value should not be stored
cache.add('test3', 'bigkey')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test':'key'}, cache.items())
cache.add('test4', 'bikey')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test':'key'}, cache.items())
def test_adding_clears_cache_based_on_size(self):
"""The cache is cleared in LRU order until small enough"""
@@ -339,7 +321,7 @@
# We have to remove 2 keys to get back under limit
self.assertEqual(6+8, cache._value_size)
self.assertEqual({'key2':'value2', 'key4':'value234'},
- cache._cache)
+ cache.items())
def test_adding_clears_to_after_cleanup_size(self):
cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
@@ -351,7 +333,7 @@
cache.add('key4', 'value234') # 8 chars, over limit
# We have to remove 3 keys to get back under limit
self.assertEqual(8, cache._value_size)
- self.assertEqual({'key4':'value234'}, cache._cache)
+ self.assertEqual({'key4':'value234'}, cache.items())
def test_custom_sizes(self):
def size_of_list(lst):
@@ -367,7 +349,7 @@
cache.add('key4', ['value', '234']) # 8 chars, over limit
# We have to remove 3 keys to get back under limit
self.assertEqual(8, cache._value_size)
- self.assertEqual({'key4':['value', '234']}, cache._cache)
+ self.assertEqual({'key4':['value', '234']}, cache.items())
def test_cleanup(self):
cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
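
And the same kind of check for LRUSizeCache, mirroring the updated tests
above:

    from bzrlib import lru_cache

    cache = lru_cache.LRUSizeCache(max_size=10, after_cleanup_size=5)
    cache.add('test', 'key')      # len('key') == 3, so it is cached
    assert cache._value_size == 3
    # A value of after_cleanup_size bytes or more is never cached at all
    cache.add('test2', 'key that is too big')
    assert cache.items() == {'test': 'key'}
    # Sizes now live on the node, and eviction hands the size back
    node = cache._cache['test']
    assert node.size == 3
    cache._remove_node(node)
    assert cache._value_size == 0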