Rev 9: Merge in my latest code. in http://bazaar.launchpad.net/%7Ebzr/bzr-repodetails/trunk
John Arbash Meinel
john at arbash-meinel.com
Tue Dec 2 01:41:46 GMT 2008
At http://bazaar.launchpad.net/%7Ebzr/bzr-repodetails/trunk
------------------------------------------------------------
revno: 9
revision-id: john at arbash-meinel.com-20081202014124-qgvuoydx8e2gzpch
parent: john at arbash-meinel.com-20081118201839-eg0um6e8moig2e6e
parent: john at arbash-meinel.com-20081119205529-eeysovzgyx4qk4bu
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: trunk
timestamp: Mon 2008-12-01 19:41:24 -0600
message:
Merge in my latest code.
modified:
__init__.py __init__.py-20081118194634-dp5enu9mdxvbmyxy-1
gather_stats.py __init__.py-20081017014933-iriuw53viune2txe-2
tests/test_repositorydetails.py test_repositorydetai-20081017014933-iriuw53viune2txe-6
------------------------------------------------------------
revno: 7.1.8
revision-id: john at arbash-meinel.com-20081119205529-eeysovzgyx4qk4bu
parent: john at arbash-meinel.com-20081118211304-4iqzbn28u7k17pzx
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: jam
timestamp: Wed 2008-11-19 14:55:29 -0600
message:
Update the gather code to understand the parent_id_basename map.
modified:
__init__.py __init__.py-20081118194634-dp5enu9mdxvbmyxy-1
gather_stats.py __init__.py-20081017014933-iriuw53viune2txe-2
tests/test_repositorydetails.py test_repositorydetai-20081017014933-iriuw53viune2txe-6
------------------------------------------------------------
revno: 7.1.7
revision-id: john at arbash-meinel.com-20081118211304-4iqzbn28u7k17pzx
parent: john at arbash-meinel.com-20081118203119-mgx4he32nfotqwmb
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: jam
timestamp: Tue 2008-11-18 15:13:04 -0600
message:
Add the ability to report specifics about chk internals.
modified:
__init__.py __init__.py-20081118194634-dp5enu9mdxvbmyxy-1
gather_stats.py __init__.py-20081017014933-iriuw53viune2txe-2
tests/test_repositorydetails.py test_repositorydetai-20081017014933-iriuw53viune2txe-6
------------------------------------------------------------
revno: 7.1.6
revision-id: john at arbash-meinel.com-20081118203119-mgx4he32nfotqwmb
parent: john at arbash-meinel.com-20081118201711-hpxn5cmaoca7mljr
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: jam
timestamp: Tue 2008-11-18 14:31:19 -0600
message:
Actually avoid counting the same nodes more than once.
modified:
gather_stats.py __init__.py-20081017014933-iriuw53viune2txe-2
-------------- next part --------------
=== modified file '__init__.py'
--- a/__init__.py 2008-11-18 19:47:10 +0000
+++ b/__init__.py 2008-11-19 20:55:29 +0000
@@ -60,6 +60,16 @@
self.outf.write("Total: %s\n" %
self._format_object(stats.total, stats.total))
+ if stats.inventories.extra_counters:
+ self.outf.write('\nExtra Info: ' # Padding
+ 'count total avg stddev min max\n')
+ for counter_name in sorted(stats.inventories.extra_counters):
+ (count, total, avg, stddev, min_val,
+ max_val) = stats.inventories.compute_extra_stats(counter_name)
+ self.outf.write("%-20s %6d %8d %4.0f %6.1f %4d %4d\n"
+ % (counter_name, count, total, avg, stddev,
+ min_val, max_val))
+
commands.register_command(cmd_repository_details)
=== modified file 'gather_stats.py'
--- a/gather_stats.py 2008-11-18 20:17:11 +0000
+++ b/gather_stats.py 2008-11-19 20:55:29 +0000
@@ -20,7 +20,7 @@
The repository-details command is the only tool so far.
"""
-version_info = (1, 9, 0, 'dev', 0)
+import math
from bzrlib import chk_map, repository
from bzrlib.inventory import CHKInventory
@@ -105,7 +105,50 @@
def __init__(self):
self.objects = 0
self.raw_size = 0
+ self.raw_size_squared = 0
self.compressed_size = 0
+ self.compressed_size_squared = 0
+ self.extra_counters = {}
+
+ def compute_stats(self, total, sum_of_squared, count):
+ """Compute average and standard deviation."""
+ if count == 0:
+ return 0.0, 0.0
+ avg = total / count
+ # See http://en.wikipedia.org/wiki/Standard_deviation
+ # Arguably, we should be using "count-1" rather than count, depending
+ # on whether you consider this a full population, or just a sample
+ exp_x2 = sum_of_squared / count
+ stddev = math.sqrt(exp_x2 - avg*avg)
+ return avg, stddev
+
+ def compute_extra_stats(self, counter_name):
+ (count, total, sum_of_squared, min_val,
+ max_val) = self.extra_counters[counter_name]
+ avg, stddev = self.compute_stats(total, sum_of_squared, count)
+ return count, total, avg, stddev, min_val, max_val
+
+ def add_compressed_size(self, size):
+ self.compressed_size += size
+ self.compressed_size_squared += (size*size)
+
+ def add_raw_size(self, size):
+ self.raw_size += size
+ self.raw_size_squared += (size*size)
+
+ def add_extra_counter(self, counter_name, value):
+ ptr = self.extra_counters.setdefault(counter_name, [0, 0, 0, None, None])
+ ptr[0] += 1
+ ptr[1] += value
+ ptr[2] += value*value
+ if ptr[3] is None:
+ ptr[3] = value
+ else:
+ ptr[3] = min(ptr[3], value)
+ if ptr[4] is None:
+ ptr[4] = value
+ else:
+ ptr[4] = max(ptr[4], value)
class RepoStats(object):
@@ -137,32 +180,53 @@
# XXX: Doesn't consider duplicate-in-separate-packs overhead.
details = vf._index.get_build_details(keys)
for detail in details.itervalues():
- objectstats.compressed_size += detail[0][2]
+ objectstats.add_compressed_size(detail[0][2])
keys = sorted(keys)
batch_size = 200
for offset in xrange(0, len(keys), batch_size):
batch = keys[offset:offset + batch_size]
for entry in vf.get_record_stream(batch, 'unordered', True):
bytes = entry.get_bytes_as('fulltext')
- objectstats.raw_size += len(bytes)
+ objectstats.add_raw_size(len(bytes))
yield bytes, entry.key
-def _gather_chk_inv(objectstats, repo):
- # first pass: the inventory objects yield chk dicts:
- pending = set()
+def _gather_chk_map(objectstats, chk_bytes, pending, internal_counter,
+ leaf_counter):
done = set()
- for bytes, key in _gather_and_iter_object_vf_texts(objectstats, repo.inventories):
- inv = CHKInventory.deserialise(repo.chk_bytes, bytes, key)
- pending.add(inv.id_to_entry._root_node)
while pending:
# Don't visit nodes twice
done.update(pending)
next = pending
pending = set()
- for bytes, key in _gather_and_iter_object_vf_texts(objectstats, repo.chk_bytes, next):
+ for bytes, key in _gather_and_iter_object_vf_texts(objectstats, chk_bytes, next):
node = chk_map._deserialise(bytes, key)
- pending.update(node.refs())
+ refs = node.refs()
+ pending.update(refs)
+ if isinstance(node, chk_map.InternalNode):
+ objectstats.add_extra_counter(internal_counter, len(refs))
+ else:
+ items = list(node.iteritems(chk_bytes))
+ objectstats.add_extra_counter(leaf_counter, len(items))
+ # Different routes can take a different number of steps to get to the
+ # same nodes.
+ pending.difference_update(done)
+
+
+def _gather_chk_inv(objectstats, repo):
+ # first pass: the inventory objects yield chk dicts:
+ pending = set()
+ pending_parent_id = set()
+ for bytes, key in _gather_and_iter_object_vf_texts(objectstats, repo.inventories):
+ inv = CHKInventory.deserialise(repo.chk_bytes, bytes, key)
+ pending.add(inv.id_to_entry._root_node)
+ if inv.parent_id_basename_to_file_id is not None:
+ pending_parent_id.add(inv.parent_id_basename_to_file_id._root_node)
+ _gather_chk_map(objectstats, repo.chk_bytes, pending, 'internal node refs',
+ 'leaf node items')
+ _gather_chk_map(objectstats, repo.chk_bytes, pending_parent_id,
+ 'internal parent_id refs',
+ 'leaf parent_id items')
def _gather_stats_locked(repo):
=== modified file 'tests/test_repositorydetails.py'
--- a/tests/test_repositorydetails.py 2008-11-18 20:02:08 +0000
+++ b/tests/test_repositorydetails.py 2008-11-19 20:55:29 +0000
@@ -42,6 +42,28 @@
, out)
self.assertEqual("", err)
+ def test_smoke_chk(self):
+ tree = self.make_branch_and_tree('.', format="development4")
+ self.build_tree(["foo"])
+ tree.add(["foo"], ["foo-id"])
+ tree.commit("first post", rev_id="foo", committer="foo at bar",
+ timestamp=100000, timezone=0)
+ out, err = self.run_bzr(["repository-details"])
+ self.assertEqualDiff(
+ "Commits: 1\n"
+ " Raw % Compressed % Objects\n"
+ "Revisions: 0 KiB 40% 0 KiB 29% 1\n"
+ "Inventories: 0 KiB 56% 0 KiB 60% 3\n"
+ "Texts: 0 KiB 2% 0 KiB 11% 1\n"
+ "Signatures: 0 KiB 0% 0 KiB 0% 0\n"
+ "Total: 0 KiB 100% 0 KiB 100% 5\n"
+ "\n"
+ "Extra Info: count total avg stddev min max\n"
+ "leaf node items 1 2 2 0.0 2 2\n"
+ "leaf parent_id items 1 2 2 0.0 2 2\n"
+ , out)
+ self.assertEqual("", err)
+
def test_gather_stats_pack92(self):
tree = self.make_branch_and_tree('.', format="pack-0.92")
self.build_tree(["foo"])
@@ -67,7 +89,7 @@
self.assertEqual(537, stats.total.compressed_size)
def test_gather_stats_chk(self):
- tree = self.make_branch_and_tree('.', format="development3")
+ tree = self.make_branch_and_tree('.', format="development4")
self.build_tree(["foo"])
tree.add(["foo"], ["foo-id"])
tree.commit("first post", rev_id="foo", committer="foo at bar",
@@ -76,20 +98,23 @@
self.assertEqual(1, stats.revision_count)
self.assertEqual(1, stats.revisions.objects)
self.assertEqual(267, stats.revisions.raw_size)
- self.assertEqual(250, stats.revisions.compressed_size)
+ self.assertEqual(249, stats.revisions.compressed_size)
# inv, root, foo-id-leaf_node.
- self.assertEqual(2, stats.inventories.objects)
- self.assertEqual(242, stats.inventories.raw_size)
- self.assertEqual(336, stats.inventories.compressed_size)
+ self.assertEqual(3, stats.inventories.objects)
+ self.assertEqual(370, stats.inventories.raw_size)
+ self.assertEqual(518, stats.inventories.compressed_size)
+ self.assertEqual({'leaf node items': [1, 2, 4, 2, 2],
+ 'leaf parent_id items': [1, 2, 4, 2, 2]},
+ stats.inventories.extra_counters)
self.assertEqual(1, stats.texts.objects)
self.assertEqual(16, stats.texts.raw_size)
self.assertEqual(94, stats.texts.compressed_size)
self.assertEqual(0, stats.signatures.objects)
self.assertEqual(0, stats.signatures.raw_size)
self.assertEqual(0, stats.signatures.compressed_size)
- self.assertEqual(4, stats.total.objects)
- self.assertEqual(525, stats.total.raw_size)
- self.assertEqual(680, stats.total.compressed_size)
+ self.assertEqual(5, stats.total.objects)
+ self.assertEqual(653, stats.total.raw_size)
+ self.assertEqual(861, stats.total.compressed_size)
def test_gather_stats_empty(self):
tree = self.make_branch_and_tree('.')
More information about the bazaar-commits
mailing list