Rev 3886: Merge the refcycle changes. in http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

John Arbash Meinel john at arbash-meinel.com
Mon Mar 23 03:43:57 GMT 2009


At http://bzr.arbash-meinel.com/branches/bzr/brisbane/hack3

------------------------------------------------------------
revno: 3886
revision-id: john at arbash-meinel.com-20090323034347-fubmlsnyfvx5vvup
parent: john at arbash-meinel.com-20090323033446-wc9ispgsj118rgl6
parent: john at arbash-meinel.com-20090323032950-lmbrocu79l90dqn5
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: hack3
timestamp: Sun 2009-03-22 22:43:47 -0500
message:
  Merge the refcycle changes.
modified:
  bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
  bzrlib/lru_cache.py            lru_cache.py-20070119165515-tlw203kuwh0id5gv-1
  bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
    ------------------------------------------------------------
    revno: 3869.9.2
    revision-id: john at arbash-meinel.com-20090323032950-lmbrocu79l90dqn5
    parent: john at arbash-meinel.com-20090320150205-kcmh70biyo76p0kn
    parent: john at arbash-meinel.com-20090321032222-n2wbqe0ozhhizwxm
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: refcycles
    timestamp: Sun 2009-03-22 22:29:50 -0500
    message:
      Merge brisbane-core tip, resolve differences.
      Finish making various get_record_stream() calls clean up refcycles
      and memory consumption after yielding the record.
    modified:
      bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
      bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
      bzrlib/tests/test_groupcompress.py test_groupcompress.p-20080705181503-ccbxd6xuy1bdnrpu-13
    ------------------------------------------------------------
    revno: 3869.9.1
    revision-id: john at arbash-meinel.com-20090320150205-kcmh70biyo76p0kn
    parent: john at arbash-meinel.com-20090320032107-bm9wg421rtcacy5i
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: refcycles
    timestamp: Fri 2009-03-20 10:02:05 -0500
    message:
      Some testing to see if we can decrease the peak memory consumption a bit.
  It looks like we can; it just needs some more performance testing, etc.
    modified:
      bzrlib/groupcompress.py        groupcompress.py-20080705181503-ccbxd6xuy1bdnrpu-8
      bzrlib/lru_cache.py            lru_cache.py-20070119165515-tlw203kuwh0id5gv-1
      bzrlib/repofmt/groupcompress_repo.py repofmt.py-20080715094215-wp1qfvoo7093c8qr-1
-------------- next part --------------
=== modified file 'bzrlib/groupcompress.py'
--- a/bzrlib/groupcompress.py	2009-03-23 03:34:46 +0000
+++ b/bzrlib/groupcompress.py	2009-03-23 03:43:47 +0000
@@ -339,8 +339,6 @@
         :param sha1: TODO (should we validate only when sha1 is supplied?)
         :return: The bytes for the content
         """
-        # Handle the 'Empty Content' record, even if we don't always write it
-        # yet.
         if start == end == 0:
             return ''
         self._ensure_content(end)
@@ -477,6 +475,7 @@
         #       get_bytes_as call? After Manager.get_record_stream() returns
         #       the object?
         self._manager = manager
+        self._bytes = None
         self.storage_kind = 'groupcompress-block'
         if not first:
             self.storage_kind = 'groupcompress-block-ref'
@@ -495,14 +494,18 @@
                 return self._manager._wire_bytes()
             else:
                 return ''
+            self._manager = None # safe?
         if storage_kind in ('fulltext', 'chunked'):
-            self._manager._prepare_for_extract()
-            block = self._manager._block
-            bytes = block.extract(self.key, self._start, self._end)
+            if self._bytes is None:
+                # Grab the raw bytes for this entry, and break the ref-cycle
+                self._manager._prepare_for_extract()
+                block = self._manager._block
+                self._bytes = block.extract(self.key, self._start, self._end)
+                self._manager = None
             if storage_kind == 'fulltext':
-                return bytes
+                return self._bytes
             else:
-                return [bytes]
+                return [self._bytes]
         raise errors.UnavailableRepresentation(self.key, storage_kind,
             self.storage_kind)
 
@@ -531,6 +534,8 @@
         """Get a record for all keys added so far."""
         for factory in self._factories:
             yield factory
+            factory._bytes = None
+            factory._manager = None
         # TODO: Consider setting self._factories = None after the above loop,
         #       as it will break the reference cycle
 
@@ -1310,6 +1315,8 @@
         for key in missing:
             yield AbsentContentFactory(key)
         manager = None
+        last_block = None
+        last_memo = None
         # TODO: This works fairly well at batching up existing groups into a
         #       streamable format, and possibly allowing for taking one big
         #       group and splitting it when it isn't fully utilized.
@@ -1327,13 +1334,24 @@
                             # Yield everything buffered so far
                             for factory in manager.get_record_stream():
                                 yield factory
+                                # Disable this record, breaks the refcycle, and
+                                # saves memory. But this means clients really
+                                # *cannot* hang on to objects.
+                                factory._bytes = None
+                                factory._manager = None
                             manager = None
                         bytes, sha1 = self._compressor.extract(key)
                         parents = self._unadded_refs[key]
                         yield FulltextContentFactory(key, parents, sha1, bytes)
                     else:
                         index_memo, _, parents, (method, _) = locations[key]
-                        block = self._get_block(index_memo)
+                        read_memo = index_memo[0:3]
+                        if last_memo == read_memo:
+                            block = last_block
+                        else:
+                            block = self._get_block(index_memo)
+                            last_block = block
+                            last_memo = read_memo
                         start, end = index_memo[3:5]
                         if manager is None:
                             manager = _LazyGroupContentManager(block)

=== modified file 'bzrlib/lru_cache.py'
--- a/bzrlib/lru_cache.py	2008-12-09 22:31:56 +0000
+++ b/bzrlib/lru_cache.py	2009-03-20 15:02:05 +0000
@@ -151,8 +151,12 @@
     def clear(self):
         """Clear out all of the cache."""
         # Clean up in LRU order
-        while self._cache:
-            self._remove_lru()
+        for key in self._cache.keys():
+            self._remove(key)
+        assert not self._cache
+        assert not self._cleanup
+        self._queue = deque()
+        self._refcount = {}
 
     def resize(self, max_cache, after_cleanup_count=None):
         """Change the number of entries that will be cached."""
@@ -247,6 +251,10 @@
         val = LRUCache._remove(self, key)
         self._value_size -= self._compute_size(val)
 
+    def clear(self):
+        LRUCache.clear(self)
+        self._value_size = 0
+
     def resize(self, max_size, after_cleanup_size=None):
         """Change the number of bytes that will be cached."""
         self._update_max_size(max_size, after_cleanup_size=after_cleanup_size)

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- a/bzrlib/repofmt/groupcompress_repo.py	2009-03-21 02:51:34 +0000
+++ b/bzrlib/repofmt/groupcompress_repo.py	2009-03-23 03:43:47 +0000
@@ -190,6 +190,8 @@
                 if pb is not None:
                     pb.update(message, idx + 1, len(keys))
                 yield record
+                record._manager = None
+                record._bytes = None
         return pb_stream()
 
     def _get_filtered_inv_stream(self, source_vf, keys, message, pb=None):
@@ -216,6 +218,8 @@
                     p_id_roots_set.add(key)
                     self._chk_p_id_roots.append(key)
                 yield record
+                record._manager = None
+                record._bytes = None
             # We have finished processing all of the inventory records, we
             # don't need these sets anymore
             id_roots_set.clear()
@@ -291,6 +295,8 @@
                         if pb is not None:
                             pb.update('chk node', counter[0], total_keys)
                         yield record
+                        record._manager = None
+                        record._bytes = None
                 yield next_stream()
                 # Double check that we won't be emitting any keys twice
                 # If we get rid of the pre-calculation of all keys, we could
@@ -302,7 +308,7 @@
                 # next_keys = next_keys.intersection(remaining_keys)
                 cur_keys = []
                 for prefix in sorted(keys_by_search_prefix):
-                    cur_keys.extend(keys_by_search_prefix[prefix])
+                    cur_keys.extend(keys_by_search_prefix.pop(prefix))
         for stream in _get_referenced_stream(self._chk_id_roots,
                                              self._gather_text_refs):
             yield stream
@@ -390,11 +396,17 @@
             self.revision_keys = source_vf.keys()
         self._copy_stream(source_vf, target_vf, self.revision_keys,
                           'revisions', self._get_progress_stream, 1)
+        for index in source_vf._index._graph_index._indices:
+            index._leaf_node_cache.clear()
+        # target_vf._index._graph_index._spill_mem_keys_to_disk()
 
     def _copy_inventory_texts(self):
         source_vf, target_vf = self._build_vfs('inventory', True, True)
         self._copy_stream(source_vf, target_vf, self.revision_keys,
                           'inventories', self._get_filtered_inv_stream, 2)
+        for index in source_vf._index._graph_index._indices:
+            index._leaf_node_cache.clear()
+        # target_vf._index._graph_index._spill_mem_keys_to_disk()
 
     def _copy_chk_texts(self):
         source_vf, target_vf = self._build_vfs('chk', False, False)
@@ -416,6 +428,9 @@
                     pass
         finally:
             child_pb.finished()
+        for index in source_vf._index._graph_index._indices:
+            index._leaf_node_cache.clear()
+        # target_vf._index._graph_index._spill_mem_keys_to_disk()
 
     def _copy_text_texts(self):
         source_vf, target_vf = self._build_vfs('text', True, True)
@@ -427,6 +442,9 @@
         text_keys = source_vf.keys()
         self._copy_stream(source_vf, target_vf, text_keys,
                           'text', self._get_progress_stream, 4)
+        for index in source_vf._index._graph_index._indices:
+            index._leaf_node_cache.clear()
+        # target_vf._index._graph_index._spill_mem_keys_to_disk()
 
     def _copy_signature_texts(self):
         source_vf, target_vf = self._build_vfs('signature', False, False)
@@ -434,6 +452,9 @@
         signature_keys.intersection(self.revision_keys)
         self._copy_stream(source_vf, target_vf, signature_keys,
                           'signatures', self._get_progress_stream, 5)
+        for index in source_vf._index._graph_index._indices:
+            index._leaf_node_cache.clear()
+        # target_vf._index._graph_index._spill_mem_keys_to_disk()
 
     def _create_pack_from_packs(self):
         self.pb.update('repacking', 0, 7)
@@ -446,6 +467,7 @@
         self._copy_text_texts()
         self._copy_signature_texts()
         self.new_pack._check_references()
+        trace.debug_memory('after fetch')
         if not self._use_pack(self.new_pack):
             self.new_pack.abort()
             return None



More information about the bazaar-commits mailing list