Rev 9: Merge in my latest code. in http://bazaar.launchpad.net/%7Ebzr/bzr-repodetails/trunk

John Arbash Meinel john at arbash-meinel.com
Tue Dec 2 01:41:46 GMT 2008


At http://bazaar.launchpad.net/%7Ebzr/bzr-repodetails/trunk

------------------------------------------------------------
revno: 9
revision-id: john at arbash-meinel.com-20081202014124-qgvuoydx8e2gzpch
parent: john at arbash-meinel.com-20081118201839-eg0um6e8moig2e6e
parent: john at arbash-meinel.com-20081119205529-eeysovzgyx4qk4bu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Mon 2008-12-01 19:41:24 -0600
message:
  Merge in my latest code.
modified:
  __init__.py                    __init__.py-20081118194634-dp5enu9mdxvbmyxy-1
  gather_stats.py                __init__.py-20081017014933-iriuw53viune2txe-2
  tests/test_repositorydetails.py test_repositorydetai-20081017014933-iriuw53viune2txe-6
    ------------------------------------------------------------
    revno: 7.1.8
    revision-id: john at arbash-meinel.com-20081119205529-eeysovzgyx4qk4bu
    parent: john at arbash-meinel.com-20081118211304-4iqzbn28u7k17pzx
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: jam
    timestamp: Wed 2008-11-19 14:55:29 -0600
    message:
      Update the gather code to understand the parent_id_basename map.
    modified:
      __init__.py                    __init__.py-20081118194634-dp5enu9mdxvbmyxy-1
      gather_stats.py                __init__.py-20081017014933-iriuw53viune2txe-2
      tests/test_repositorydetails.py test_repositorydetai-20081017014933-iriuw53viune2txe-6
    ------------------------------------------------------------
    revno: 7.1.7
    revision-id: john at arbash-meinel.com-20081118211304-4iqzbn28u7k17pzx
    parent: john at arbash-meinel.com-20081118203119-mgx4he32nfotqwmb
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: jam
    timestamp: Tue 2008-11-18 15:13:04 -0600
    message:
      Add the ability to report specifics about chk internals.
    modified:
      __init__.py                    __init__.py-20081118194634-dp5enu9mdxvbmyxy-1
      gather_stats.py                __init__.py-20081017014933-iriuw53viune2txe-2
      tests/test_repositorydetails.py test_repositorydetai-20081017014933-iriuw53viune2txe-6
    ------------------------------------------------------------
    revno: 7.1.6
    revision-id: john at arbash-meinel.com-20081118203119-mgx4he32nfotqwmb
    parent: john at arbash-meinel.com-20081118201711-hpxn5cmaoca7mljr
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: jam
    timestamp: Tue 2008-11-18 14:31:19 -0600
    message:
      Actually avoid counting the same nodes more than once.
    modified:
      gather_stats.py                __init__.py-20081017014933-iriuw53viune2txe-2
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py	2008-11-18 19:47:10 +0000
+++ b/__init__.py	2008-11-19 20:55:29 +0000
@@ -60,6 +60,16 @@
         self.outf.write("Total:       %s\n" %
             self._format_object(stats.total, stats.total))
 
+        if stats.inventories.extra_counters:
+            self.outf.write('\nExtra Info:           ' # Padding
+                            'count    total  avg stddev  min  max\n')
+        for counter_name in sorted(stats.inventories.extra_counters):
+            (count, total, avg, stddev, min_val,
+             max_val) = stats.inventories.compute_extra_stats(counter_name)
+            self.outf.write("%-20s %6d %8d %4.0f %6.1f %4d %4d\n"
+                            % (counter_name, count, total, avg, stddev,
+                               min_val, max_val))
+
 
 commands.register_command(cmd_repository_details)
 

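Aside: the hunk above only adds the report layout. The snippet below is a standalone illustration (not plugin code) that reuses the same format string with invented counter values, just to show how the new "Extra Info" block lines up.

import sys

# Invented counter data, in the shape compute_extra_stats() returns:
# (count, total, avg, stddev, min, max).
extra_stats = {
    'leaf node items': (1, 2, 2.0, 0.0, 2, 2),
    'internal node refs': (3, 45, 15.0, 4.1, 10, 20),
}

out = sys.stdout
out.write('\nExtra Info:           ' # Padding
          'count    total  avg stddev  min  max\n')
for counter_name in sorted(extra_stats):
    (count, total, avg, stddev, min_val, max_val) = extra_stats[counter_name]
    out.write("%-20s %6d %8d %4.0f %6.1f %4d %4d\n"
              % (counter_name, count, total, avg, stddev, min_val, max_val))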
=== modified file 'gather_stats.py'
--- a/gather_stats.py	2008-11-18 20:17:11 +0000
+++ b/gather_stats.py	2008-11-19 20:55:29 +0000
@@ -20,7 +20,7 @@
 The repository-details command is the only tool so far.
 """
 
-version_info = (1, 9, 0, 'dev', 0)
+import math
 
 from bzrlib import chk_map, repository
 from bzrlib.inventory import CHKInventory
@@ -105,7 +105,50 @@
     def __init__(self):
         self.objects = 0
         self.raw_size = 0
+        self.raw_size_squared = 0
         self.compressed_size = 0
+        self.compressed_size_squared = 0
+        self.extra_counters = {}
+
+    def compute_stats(self, total, sum_of_squared, count):
+        """Compute average and standard deviation."""
+        if count == 0:
+            return 0.0, 0.0
+        avg = total / count
+        # See http://en.wikipedia.org/wiki/Standard_deviation
+        # Arguably, we should be using "count-1" rather than count, depending
+        # on whether you consider this a full population, or just a sample
+        exp_x2 = sum_of_squared / count
+        stddev = math.sqrt(exp_x2 - avg*avg)
+        return avg, stddev
+
+    def compute_extra_stats(self, counter_name):
+        (count, total, sum_of_squared, min_val,
+         max_val) = self.extra_counters[counter_name]
+        avg, stddev = self.compute_stats(total, sum_of_squared, count)
+        return count, total, avg, stddev, min_val, max_val
+
+    def add_compressed_size(self, size):
+        self.compressed_size += size
+        self.compressed_size_squared += (size*size)
+
+    def add_raw_size(self, size):
+        self.raw_size += size
+        self.raw_size_squared += (size*size)
+
+    def add_extra_counter(self, counter_name, value):
+        ptr = self.extra_counters.setdefault(counter_name, [0, 0, 0, None, None])
+        ptr[0] += 1
+        ptr[1] += value
+        ptr[2] += value*value
+        if ptr[3] is None:
+            ptr[3] = value
+        else:
+            ptr[3] = min(ptr[3], value)
+        if ptr[4] is None:
+            ptr[4] = value
+        else:
+            ptr[4] = max(ptr[4], value)
 
 
 class RepoStats(object):
@@ -137,32 +180,53 @@
     # XXX: Doesn't consider duplicate-in-separate-packs overhead.
     details = vf._index.get_build_details(keys)
     for detail in details.itervalues():
-        objectstats.compressed_size += detail[0][2]
+        objectstats.add_compressed_size(detail[0][2])
     keys = sorted(keys)
     batch_size = 200
     for offset in xrange(0, len(keys), batch_size):
         batch = keys[offset:offset + batch_size]
         for entry in vf.get_record_stream(batch, 'unordered', True):
             bytes = entry.get_bytes_as('fulltext')
-            objectstats.raw_size += len(bytes)
+            objectstats.add_raw_size(len(bytes))
             yield bytes, entry.key
 
 
-def _gather_chk_inv(objectstats, repo):
-    # first pass: the inventory objects yield chk dicts:
-    pending = set()
+def _gather_chk_map(objectstats, chk_bytes, pending, internal_counter,
+                    leaf_counter):
     done = set()
-    for bytes, key in _gather_and_iter_object_vf_texts(objectstats, repo.inventories):
-        inv = CHKInventory.deserialise(repo.chk_bytes, bytes, key)
-        pending.add(inv.id_to_entry._root_node)
     while pending:
         # Don't visit nodes twice
         done.update(pending)
         next = pending
         pending = set()
-        for bytes, key in _gather_and_iter_object_vf_texts(objectstats, repo.chk_bytes, next):
+        for bytes, key in _gather_and_iter_object_vf_texts(objectstats, chk_bytes, next):
             node = chk_map._deserialise(bytes, key)
-            pending.update(node.refs())
+            refs = node.refs()
+            pending.update(refs)
+            if isinstance(node, chk_map.InternalNode):
+                objectstats.add_extra_counter(internal_counter, len(refs))
+            else:
+                items = list(node.iteritems(chk_bytes))
+                objectstats.add_extra_counter(leaf_counter, len(items))
+        # Different routes can take a different number of steps to get to the
+        # same nodes.
+        pending.difference_update(done)
+
+
+def _gather_chk_inv(objectstats, repo):
+    # first pass: the inventory objects yield chk dicts:
+    pending = set()
+    pending_parent_id = set()
+    for bytes, key in _gather_and_iter_object_vf_texts(objectstats, repo.inventories):
+        inv = CHKInventory.deserialise(repo.chk_bytes, bytes, key)
+        pending.add(inv.id_to_entry._root_node)
+        if inv.parent_id_basename_to_file_id is not None:
+            pending_parent_id.add(inv.parent_id_basename_to_file_id._root_node)
+    _gather_chk_map(objectstats, repo.chk_bytes, pending, 'internal node refs',
+                    'leaf node items')
+    _gather_chk_map(objectstats, repo.chk_bytes, pending_parent_id,
+                    'internal parent_id refs',
+                    'leaf parent_id items')
 
 
 def _gather_stats_locked(repo):

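Aside: _gather_chk_map() above walks the chk pages breadth-first and relies on the 'done' set (plus the final difference_update) so that a page reachable through several routes is only fetched and counted once. A minimal standalone sketch of that pattern, using a made-up dict as a stand-in for the chk store:

def walk_once(graph, roots):
    """Visit every node reachable from roots exactly once, breadth-first."""
    done = set()
    pending = set(roots)
    visited = []
    while pending:
        # Don't visit nodes twice
        done.update(pending)
        next_keys = pending
        pending = set()
        for key in next_keys:
            visited.append(key)
            pending.update(graph.get(key, ()))
        # Different routes can take a different number of steps to reach the
        # same node, so drop anything already processed before the next pass.
        pending.difference_update(done)
    return visited

# Diamond-shaped graph: 'd' is reachable from both 'b' and 'c', but is
# visited only once.
graph = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}
print(walk_once(graph, ['a']))  # e.g. ['a', 'b', 'c', 'd']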
=== modified file 'tests/test_repositorydetails.py'
--- a/tests/test_repositorydetails.py	2008-11-18 20:02:08 +0000
+++ b/tests/test_repositorydetails.py	2008-11-19 20:55:29 +0000
@@ -42,6 +42,28 @@
             , out)
         self.assertEqual("", err)
 
+    def test_smoke_chk(self):
+        tree = self.make_branch_and_tree('.', format="development4")
+        self.build_tree(["foo"])
+        tree.add(["foo"], ["foo-id"])
+        tree.commit("first post", rev_id="foo", committer="foo at bar",
+                    timestamp=100000, timezone=0)
+        out, err = self.run_bzr(["repository-details"])
+        self.assertEqualDiff(
+            "Commits: 1\n"
+            "                      Raw    %    Compressed    %  Objects\n"
+            "Revisions:          0 KiB  40%         0 KiB  29%        1\n"
+            "Inventories:        0 KiB  56%         0 KiB  60%        3\n"
+            "Texts:              0 KiB   2%         0 KiB  11%        1\n"
+            "Signatures:         0 KiB   0%         0 KiB   0%        0\n"
+            "Total:              0 KiB 100%         0 KiB 100%        5\n"
+            "\n"
+            "Extra Info:           count    total  avg stddev  min  max\n"
+            "leaf node items           1        2    2    0.0    2    2\n"
+            "leaf parent_id items      1        2    2    0.0    2    2\n"
+            , out)
+        self.assertEqual("", err)
+
     def test_gather_stats_pack92(self):
         tree = self.make_branch_and_tree('.', format="pack-0.92")
         self.build_tree(["foo"])
@@ -67,7 +89,7 @@
         self.assertEqual(537, stats.total.compressed_size)
 
     def test_gather_stats_chk(self):
-        tree = self.make_branch_and_tree('.', format="development3")
+        tree = self.make_branch_and_tree('.', format="development4")
         self.build_tree(["foo"])
         tree.add(["foo"], ["foo-id"])
         tree.commit("first post", rev_id="foo", committer="foo at bar",
@@ -76,20 +98,23 @@
         self.assertEqual(1, stats.revision_count)
         self.assertEqual(1, stats.revisions.objects)
         self.assertEqual(267, stats.revisions.raw_size)
-        self.assertEqual(250, stats.revisions.compressed_size)
+        self.assertEqual(249, stats.revisions.compressed_size)
         # inv, root, foo-id-lead_node.
-        self.assertEqual(2, stats.inventories.objects)
-        self.assertEqual(242, stats.inventories.raw_size)
-        self.assertEqual(336, stats.inventories.compressed_size)
+        self.assertEqual(3, stats.inventories.objects)
+        self.assertEqual(370, stats.inventories.raw_size)
+        self.assertEqual(518, stats.inventories.compressed_size)
+        self.assertEqual({'leaf node items': [1, 2, 4, 2, 2],
+                          'leaf parent_id items': [1, 2, 4, 2, 2]},
+                         stats.inventories.extra_counters)
         self.assertEqual(1, stats.texts.objects)
         self.assertEqual(16, stats.texts.raw_size)
         self.assertEqual(94, stats.texts.compressed_size)
         self.assertEqual(0, stats.signatures.objects)
         self.assertEqual(0, stats.signatures.raw_size)
         self.assertEqual(0, stats.signatures.compressed_size)
-        self.assertEqual(4, stats.total.objects)
-        self.assertEqual(525, stats.total.raw_size)
-        self.assertEqual(680, stats.total.compressed_size)
+        self.assertEqual(5, stats.total.objects)
+        self.assertEqual(653, stats.total.raw_size)
+        self.assertEqual(861, stats.total.compressed_size)
 
     def test_gather_stats_empty(self):
         tree = self.make_branch_and_tree('.')

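Aside: the raw extra_counters list asserted above is [count, total, sum_of_squares, min, max], so [1, 2, 4, 2, 2] means a single leaf node holding two items. Running it through the same arithmetic as compute_stats() (population standard deviation, E[X^2] - E[X]^2) gives the row shown in test_smoke_chk. A quick standalone check:

import math

count, total, sum_sq, min_val, max_val = 1, 2, 4, 2, 2
avg = float(total) / count                             # 2.0
stddev = math.sqrt(sum_sq / float(count) - avg * avg)  # sqrt(4 - 4) == 0.0
print(count, total, avg, stddev, min_val, max_val)
# -> matches the "leaf node items  1  2  2  0.0  2  2" row in the smoke test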

