Rev 4181: LRUCache is now implemented with a dict pointing into a linked list, in lp:///~jameinel/bzr/lru_cache_linked_lst
John Arbash Meinel
john at arbash-meinel.com
Sun Mar 22 19:05:11 GMT 2009
At lp:///~jameinel/bzr/lru_cache_linked_lst
------------------------------------------------------------
revno: 4181
revision-id: john at arbash-meinel.com-20090322190503-wabx7fjrgarxx9c5
parent: john at arbash-meinel.com-20090322170709-r389wogofbe8n6d9
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: lru_cache_linked_lst
timestamp: Sun 2009-03-22 14:05:03 -0500
message:
LRUCache is now implemented with a dict pointing into a linked list,
rather than multiple dicts.
This turns out to be rather good for memory savings, and even a little
bit faster. Proper data structures FTW.
Still seeing ~500ms overhead versus a FIFO or dict, but some of that
is just the overhead you expect from having to maintain the LRU info.
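To make the shape of the change easier to follow, here is a standalone
sketch of the structure (illustrative only, not the bzrlib code; the
names SimpleLRU and _Node are made up for the example). The dict gives
O(1) lookup, and the doubly-linked list gives O(1) move-to-front and
eviction, so there is no access queue to compact any more:

class _Node(object):
    """One entry in the doubly-linked list, newest at the head."""

    __slots__ = ('prev', 'next', 'key', 'value')

    def __init__(self, key, value):
        self.prev = None
        self.next = None
        self.key = key
        self.value = value


class SimpleLRU(object):
    """Minimal dict + linked-list LRU, for illustration."""

    def __init__(self, max_cache=100):
        self._cache = {}       # key -> _Node
        self._head = None      # most recently used
        self._tail = None      # least recently used
        self._max_cache = max_cache

    def __getitem__(self, key):
        node = self._cache[key]      # O(1) dict lookup
        self._move_to_front(node)    # O(1) LRU bookkeeping
        return node.value

    def __setitem__(self, key, value):
        node = self._cache.get(key)
        if node is None:
            node = _Node(key, value)
            self._cache[key] = node
        else:
            node.value = value
        self._move_to_front(node)
        while len(self._cache) > self._max_cache:
            self._remove_node(self._tail)

    def _move_to_front(self, node):
        if node is self._head:
            return
        # Unlink the node from wherever it currently sits
        if node.prev is not None:
            node.prev.next = node.next
        if node.next is not None:
            node.next.prev = node.prev
        if node is self._tail:
            self._tail = node.prev
        # Relink it at the head of the list
        node.prev = None
        node.next = self._head
        if self._head is not None:
            self._head.prev = node
        self._head = node
        if self._tail is None:
            self._tail = node

    def _remove_node(self, node):
        # Splice the node out and drop it from the dict
        if node.prev is not None:
            node.prev.next = node.next
        if node.next is not None:
            node.next.prev = node.prev
        if node is self._head:
            self._head = node.next
        if node is self._tail:
            self._tail = node.prev
        del self._cache[node.key]

The real classes below carry cleanup callbacks and (for LRUSizeCache) a
per-node size on top of this skeleton, but the pointer juggling is the
same.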
-------------- next part --------------
=== modified file 'bzrlib/lru_cache.py'
--- a/bzrlib/lru_cache.py 2009-03-22 17:00:12 +0000
+++ b/bzrlib/lru_cache.py 2009-03-22 19:05:03 +0000
@@ -21,6 +21,42 @@
from bzrlib import symbol_versioning
+class _LRUNode(object):
+ """This maintains the linked-list which is the lru internals."""
+
+ __slots__ = ('prev', 'next', 'key', 'value', 'cleanup', 'size')
+
+ def __init__(self, key, value, cleanup=None):
+ self.prev = None
+ self.next = None
+ self.key = key
+ self.value = value
+ self.cleanup = cleanup
+ # TODO: We could compute this 'on-the-fly' like we used to, and remove
+ # one pointer from this object, we just need to decide if it
+ # actually costs us much of anything in normal usage
+ self.size = None
+
+ def __repr__(self):
+ if self.next is None:
+ next = None
+ else:
+ next = self.next.key
+ if self.prev is None:
+ prev = None
+ else:
+ prev = self.prev.key
+ return '%s(%r n:%r p:%r)' % (self.__class__.__name__, self.key,
+ next, prev)
+
+ def run_cleanup(self):
+ if self.cleanup is not None:
+ self.cleanup(self.key, self.value)
+ self.cleanup = None
+ # Just make sure to break any refcycles, etc
+ self.value = None
+
+
class LRUCache(object):
"""A class which manages a cache of entries, removing unused ones."""
@@ -33,22 +69,42 @@
DeprecationWarning)
after_cleanup_count = after_cleanup_size
self._cache = {}
- self._cleanup = {}
- self._queue = deque() # Track when things are accessed
- self._refcount = {} # number of entries in self._queue for each key
+ # The "HEAD" of the lru linked list
+ self._most_recently_used = None
+ # The "TAIL" of the lru linked list
+ self._last_recently_used = None
self._update_max_cache(max_cache, after_cleanup_count)
def __contains__(self, key):
return key in self._cache
def __getitem__(self, key):
- val = self._cache[key]
- self._record_access(key)
- return val
+ node = self._cache[key]
+ # TODO: We may want to inline _record_access, to decrease the cost of
+ # __getitem__ calls
+ self._record_access(node)
+ return node.value
def __len__(self):
return len(self._cache)
+ def _walk_lru(self):
+ """Walk the LRU list."""
+ node = self._most_recently_used
+ if node is not None:
+ assert node.prev is None
+ while node is not None:
+ if node.next is None:
+ assert node is self._last_recently_used
+ else:
+ assert node.next.prev is node
+ if node.prev is None:
+ assert node is self._most_recently_used
+ else:
+ assert node.prev.next is node
+ yield node
+ node = node.next
+
def add(self, key, value, cleanup=None):
"""Add a new value to the cache.
@@ -61,11 +117,14 @@
'value' should be cleaned up.
"""
if key in self._cache:
- self._remove(key)
- self._cache[key] = value
- if cleanup is not None:
- self._cleanup[key] = cleanup
- self._record_access(key)
+ node = self._cache[key]
+ node.run_cleanup()
+ node.value = value
+ node.cleanup = cleanup
+ else:
+ node = _LRUNode(key, value, cleanup=cleanup)
+ self._cache[key] = node
+ self._record_access(node)
if len(self._cache) > self._max_cache:
# Trigger the cleanup
@@ -76,9 +135,10 @@
return self._max_cache
def get(self, key, default=None):
- if key in self._cache:
- return self[key]
- return default
+ node = self._cache.get(key, None)
+ if node is None:
+ return default
+ return node.value
def keys(self):
"""Get the list of keys currently cached.
@@ -91,6 +151,10 @@
"""
return self._cache.keys()
+ def items(self):
+ """Get the key:value pairs as a dict."""
+ return dict((k, n.value) for k, n in self._cache.iteritems())
+
def cleanup(self):
"""Clear the cache until it shrinks to the requested size.
@@ -107,38 +171,54 @@
"""Add a value to the cache, there will be no cleanup function."""
self.add(key, value, cleanup=None)
- def _record_access(self, key):
+ def _record_access(self, node):
"""Record that key was accessed."""
- self._queue.append(key)
- # Can't use setdefault because you can't += 1 the result
- self._refcount[key] = self._refcount.get(key, 0) + 1
-
- # If our access queue is too large, clean it up too
- if len(self._queue) > self._compact_queue_length:
- self._compact_queue()
-
- def _compact_queue(self):
- """Compact the queue, leaving things in sorted last appended order."""
- new_queue = deque()
- for item in self._queue:
- if self._refcount[item] == 1:
- new_queue.append(item)
- else:
- self._refcount[item] -= 1
- self._queue = new_queue
- # All entries should be of the same size. There should be one entry in
- # queue for each entry in cache, and all refcounts should == 1
- if not (len(self._queue) == len(self._cache) ==
- len(self._refcount) == sum(self._refcount.itervalues())):
- raise AssertionError()
-
- def _remove(self, key):
- """Remove an entry, making sure to maintain the invariants."""
- cleanup = self._cleanup.pop(key, None)
- val = self._cache.pop(key)
- if cleanup is not None:
- cleanup(key, val)
- return val
+ # Move 'node' to the front of the queue
+ if self._most_recently_used is None:
+ assert len(self._cache) == 1
+ self._most_recently_used = node
+ self._last_recently_used = node
+ assert node.next is None
+ assert node.prev is None
+ return
+ elif node is self._most_recently_used:
+ # Nothing to do, this node is already at the head of the queue
+ return
+ elif node is self._last_recently_used:
+ assert self._last_recently_used is not None
+ self._last_recently_used = node.prev
+ assert self._last_recently_used is not None
+ # We've taken care of the tail pointer, remove the node, and insert it
+ # at the front
+ # REMOVE
+ if node.prev is not None:
+ node.prev.next = node.next
+ if node.next is not None:
+ node.next.prev = node.prev
+ # INSERT
+ node.next = self._most_recently_used
+ self._most_recently_used = node
+ node.next.prev = node
+ node.prev = None
+
+ def _remove_node(self, node):
+ assert node is not None
+ if node is self._last_recently_used:
+ self._last_recently_used = node.prev
+ node2 = self._cache.pop(node.key)
+ assert node2 is node
+ del node2
+ # If we have removed all entries, remove the head pointer as well
+ if self._last_recently_used is None:
+ assert len(self._cache) == 0
+ assert self._most_recently_used is node
+ self._most_recently_used = None
+ node.run_cleanup()
+ # Shouldn't be necessary
+ # node.next = None
+ # node.prev = None
def _remove_lru(self):
"""Remove one entry from the lru, and handle consequences.
@@ -146,11 +226,7 @@
If there are no more references to the lru, then this entry should be
removed from the cache.
"""
- key = self._queue.popleft()
- self._refcount[key] -= 1
- if not self._refcount[key]:
- del self._refcount[key]
- self._remove(key)
+ self._remove_node(self._last_recently_used)
def clear(self):
"""Clear out all of the cache."""
@@ -168,11 +244,8 @@
if after_cleanup_count is None:
self._after_cleanup_count = self._max_cache * 8 / 10
else:
- self._after_cleanup_count = min(after_cleanup_count, self._max_cache)
-
- self._compact_queue_length = 4*self._max_cache
- if len(self._queue) > self._compact_queue_length:
- self._compact_queue()
+ self._after_cleanup_count = min(after_cleanup_count,
+ self._max_cache)
self.cleanup()
@@ -204,9 +277,6 @@
self._compute_size = compute_size
if compute_size is None:
self._compute_size = len
- # This approximates that texts are > 0.5k in size. It only really
- # effects when we clean up the queue, so we don't want it to be too
- # large.
self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)
LRUCache.__init__(self, max_cache=max(int(max_size/512), 1))
@@ -221,16 +291,22 @@
:param cleanup: None or a function taking (key, value) to indicate
'value' should be cleaned up.
"""
- if key in self._cache:
- self._remove(key)
+ node = self._cache.get(key, None)
value_len = self._compute_size(value)
if value_len >= self._after_cleanup_size:
+ if node is not None:
+ # The new value is too large, we won't be replacing the value,
+ # just removing the node completely
+ self._remove_node(node)
return
+ if node is None:
+ node = _LRUNode(key, value, cleanup=cleanup)
+ self._cache[key] = node
+ else:
+ # Replacing an existing value: clean up the old one first
+ node.run_cleanup()
+ node.value = value
+ node.cleanup = cleanup
+ self._value_size -= node.size
+ node.size = value_len
self._value_size += value_len
- self._cache[key] = value
- if cleanup is not None:
- self._cleanup[key] = cleanup
- self._record_access(key)
+ self._record_access(node)
if self._value_size > self._max_size:
# Time to cleanup
@@ -246,10 +322,9 @@
while self._value_size > self._after_cleanup_size:
self._remove_lru()
- def _remove(self, key):
- """Remove an entry, making sure to maintain the invariants."""
- val = LRUCache._remove(self, key)
- self._value_size -= self._compute_size(val)
+ def _remove_node(self, node):
+ self._value_size -= node.size
+ LRUCache._remove_node(self, node)
def resize(self, max_size, after_cleanup_size=None):
"""Change the number of bytes that will be cached."""
=== modified file 'bzrlib/tests/test_lru_cache.py'
--- a/bzrlib/tests/test_lru_cache.py 2009-03-22 17:07:09 +0000
+++ b/bzrlib/tests/test_lru_cache.py 2009-03-22 19:05:03 +0000
@@ -73,18 +73,6 @@
self.failIf('foo' in cache)
- def test_queue_stays_bounded(self):
- """Lots of accesses does not cause the queue to grow without bound."""
- cache = lru_cache.LRUCache(max_cache=10)
-
- cache['baz'] = 'biz'
- cache['foo'] = 'bar'
-
- for i in xrange(1000):
- cache['baz']
-
- self.failUnless(len(cache._queue) < 40)
-
def test_cleanup(self):
"""Test that we can use a cleanup function."""
cleanup_called = []
@@ -199,20 +187,14 @@
cache.add(4, 30)
cache.add(5, 35)
- self.assertEqual([1, 2, 3, 4, 5], list(cache._queue))
+ self.assertEqual([5, 4, 3, 2, 1], [n.key for n in cache._walk_lru()])
# Now access some randomly
cache[2]
cache[5]
cache[3]
cache[2]
- self.assertEqual([1, 2, 3, 4, 5, 2, 5, 3, 2], list(cache._queue))
- self.assertEqual({1:1, 2:3, 3:2, 4:1, 5:2}, cache._refcount)
-
- # Compacting should save the last position
- cache._compact_queue()
- self.assertEqual([1, 4, 5, 3, 2], list(cache._queue))
- self.assertEqual({1:1, 2:1, 3:1, 4:1, 5:1}, cache._refcount)
+ self.assertEqual([2, 3, 5, 4, 1], [n.key for n in cache._walk_lru()])
def test_get(self):
cache = lru_cache.LRUCache(max_cache=5)
@@ -288,7 +270,6 @@
def test_basic_init(self):
cache = lru_cache.LRUSizeCache()
self.assertEqual(2048, cache._max_cache)
- self.assertEqual(4*2048, cache._compact_queue_length)
self.assertEqual(int(cache._max_size*0.8), cache._after_cleanup_size)
self.assertEqual(0, cache._value_size)
@@ -303,29 +284,30 @@
self.assertEqual(0, cache._value_size)
cache.add('my key', 'my value text')
self.assertEqual(13, cache._value_size)
- cache._remove('my key')
+ node = cache._cache['my key']
+ cache._remove_node(node)
self.assertEqual(0, cache._value_size)
def test_no_add_over_size(self):
"""Adding a large value may not be cached at all."""
cache = lru_cache.LRUSizeCache(max_size=10, after_cleanup_size=5)
self.assertEqual(0, cache._value_size)
- self.assertEqual({}, cache._cache)
+ self.assertEqual({}, cache.items())
cache.add('test', 'key')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test': 'key'}, cache.items())
cache.add('test2', 'key that is too big')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test':'key'}, cache.items())
# If we would add a key, only to cleanup and remove all cached entries,
# then obviously that value should not be stored
cache.add('test3', 'bigkey')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test':'key'}, cache.items())
cache.add('test4', 'bikey')
self.assertEqual(3, cache._value_size)
- self.assertEqual({'test':'key'}, cache._cache)
+ self.assertEqual({'test':'key'}, cache.items())
def test_adding_clears_cache_based_on_size(self):
"""The cache is cleared in LRU order until small enough"""
@@ -339,7 +321,7 @@
# We have to remove 2 keys to get back under limit
self.assertEqual(6+8, cache._value_size)
self.assertEqual({'key2':'value2', 'key4':'value234'},
- cache._cache)
+ cache.items())
def test_adding_clears_to_after_cleanup_size(self):
cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
@@ -351,7 +333,7 @@
cache.add('key4', 'value234') # 8 chars, over limit
# We have to remove 3 keys to get back under limit
self.assertEqual(8, cache._value_size)
- self.assertEqual({'key4':'value234'}, cache._cache)
+ self.assertEqual({'key4':'value234'}, cache.items())
def test_custom_sizes(self):
def size_of_list(lst):
@@ -367,7 +349,7 @@
cache.add('key4', ['value', '234']) # 8 chars, over limit
# We have to remove 3 keys to get back under limit
self.assertEqual(8, cache._value_size)
- self.assertEqual({'key4':['value', '234']}, cache._cache)
+ self.assertEqual({'key4':['value', '234']}, cache.items())
def test_cleanup(self):
cache = lru_cache.LRUSizeCache(max_size=20, after_cleanup_size=10)
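
And the same kind of check for LRUSizeCache, mirroring the updated tests
above:

    from bzrlib import lru_cache

    cache = lru_cache.LRUSizeCache(max_size=10, after_cleanup_size=5)
    cache.add('test', 'key')      # len('key') == 3, so it is cached
    assert cache._value_size == 3
    # A value of after_cleanup_size bytes or more is never cached at all
    cache.add('test2', 'key that is too big')
    assert cache.items() == {'test': 'key'}
    # Sizes now live on the node, and eviction hands the size back
    node = cache._cache['test']
    assert node.size == 3
    cache._remove_node(node)
    assert cache._value_size == 0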