[RFC] __BIG__ speed improvement in clone/branch on http transport
Goffredo Baroncelli
kreijack at alice.it
Thu Dec 1 21:48:54 GMT 2005
Hi all,
the patch below introduces a new function in the branch class. The function
is named file_involved(rev1, rev2) and returns the file_id(s)
involved in the changes between revision rev1 and revision rev2. Moreover,
this function can be called with a set of revisions as its argument: in this case
file_involved() returns the file_id(s) involved in that set of revisions.
This function is __very__ useful during the clone/branch operation because
with it, it is very easy to know which weaves we need to update/download.
This function, due to the structure of the inventory.weave, is very simple: when
we change/rename/chmod+x a file, the weave gains a new revision; then the inventory
is updated with a new entry that tracks the file_id / revision-id.
Walking the inventory.weave, when a new entry (a line from a weave point of view,
a <file/directory....> tag from an inventory point of view) is encountered, we know
that the related file_id was updated.
In conclusion, if we filter the inventory.weave for certain revision_id(s), the
entries that turn out to have been added are all related to the file_id(s) involved
in the changes.
As an example of application, the patch (see below) to the file bzrlib/clone.py makes it
possible to clone the bazaar repository in less than 3 minutes via http: more or less the
same time as the other protocols which permit listing. In fact, without the listing
function (as in the http:// transport), the list of file_id(s) is obtained by
expanding the inventories of every revision involved and downloading
every file_id....
A side note: I would suggest changing the format of the inventory, replacing the (ugly) xml
format with a simpler one-entry-per-line format.
Goffredo
------
ghigo at therra:~/bazaar$ time bzr.dev/bzr branch ftp://ghigo:27b72@127.0.0.1//home/ghigo/bazaar/bzr.dev test/bzr.dev-ftp-orig
Branched 1447 revision(s).
real 2m23.087s
user 0m40.883s
sys 0m13.382
time bzr-weave/bzr branch ftp://ghigo:27b72@127.0.0.1//home/ghigo/bazaar/bzr.dev test/bzr.dev-ftp-mod
Branched 1447 revision(s).
real 1m38.606s
user 0m39.010s
sys 0m12.247s
ghigo at therra:~/bazaar$ time bzr-weave/bzr branch http://127.0.0.1:8077/bazaar/bazaar-ng_stable_branch test/bzr.dev-http-mod
Branched 1447 revision(s).
real 2m2.435s
user 0m58.163s
sys 0m14.752
----
=== modified file 'bzrlib/branch.py'
--- bzrlib/branch.py
+++ bzrlib/branch.py
@@ -1065,6 +1065,47 @@
self.revision_store.add(StringIO(gpg_strategy.sign(plaintext)),
revision_id, "sig")
+ def file_involved(self, arg1=None, arg2=None):
+ """ This function returns the file_id(s) involved in the
+ changese between two revisions, or in the changes
+
+ The revisions are expressed as revision_id
+
+ if two args are passed,the changes are searched between
+ 'rev-arg1'..'rev-arg2'
+ if one arg is passed, the changes are searched up to rev-arg1 or
+ if it is a set, inside this set
+ if no args is passed, all files_id are returned
+ """
+
+ w = self._get_inventory_weave( )
+ file_id = set( )
+
+ if arg2:
+ from_set = set(w.inclusions([w.lookup(arg1)]))
+ to_set = set(w.inclusions([w.lookup(arg2)]))
+ changed = to_set.difference(from_set)
+ elif arg1:
+ if isinstance(arg1, set):
+ changed = map(w.lookup, arg1 )
+ else:
+ changed = w.inclusions([w.lookup(arg1)])
+ else:
+ changed = set(w.inclusions([w.numversions( )-1]))
+
+ for lineno, insert, deleteset, line in w._walk():
+ if insert in changed:
+ start = line.find('file_id="')+9
+ if start < 9: continue
+ end = line.find('"',start)
+ assert end>= 0
+ fid = line[start:end]
+
+
+ file_id.add(fid)
+
+ return file_id
+
class ScratchBranch(BzrBranch):
"""Special test class: a branch that cleans up after itself.
=== modified file 'bzrlib/clone.py'
--- bzrlib/clone.py
+++ bzrlib/clone.py
@@ -78,10 +78,6 @@
note("basis_branch is not supported for fast weave copy yet.")
branch_from.lock_read()
try:
- if not (branch_from.weave_store.listable()
- and branch_from.revision_store.listable()):
- return copy_branch_slower(branch_from, to_location, revision,
- basis_branch)
history = _get_truncated_history(branch_from, revision)
if not bzrlib.osutils.lexists(to_location):
os.mkdir(to_location)
@@ -111,11 +107,17 @@
return history[:idx+1]
def _copy_text_weaves(branch_from, branch_to):
- copy_all(branch_from.weave_store, branch_to.weave_store)
+
+ from_set = set(branch_from._get_inventory_weave( )._names)
+ file_ids = branch_from.file_involved( from_set )
+ branch_to.weave_store.copy_multi(branch_from.weave_store, file_ids )
def _copy_revision_store(branch_from, branch_to):
- copy_all(branch_from.revision_store, branch_to.revision_store)
+
+ # copy all revision
+ from_set = set(branch_from._get_inventory_weave( )._names)
+ branch_to.revision_store.copy_multi(branch_from.revision_store, from_set )
def _copy_control_weaves(branch_from, branch_to):
@@ -123,39 +125,3 @@
from_control = branch_from.control_weaves
to_control.copy_multi(from_control, ['inventory'])
-
-def copy_branch_slower(branch_from, to_location, revision=None, basis_branch=None):
- """Copy branch_from into the existing directory to_location.
-
- revision
- If not None, only revisions up to this point will be copied.
- The head of the new branch will be that revision. Must be a
- revid or None.
-
- to_location -- The destination directory; must either exist and be
- empty, or not exist, in which case it is created.
-
- revno
- The revision to copy up to
-
- basis_branch
- A local branch to copy revisions from, related to branch_from.
- This is used when branching from a remote (slow) branch, and we have
- a local branch that might contain some relevant revisions.
- """
- assert isinstance(branch_from, Branch)
- assert isinstance(to_location, basestring)
- if not bzrlib.osutils.lexists(to_location):
- os.mkdir(to_location)
- br_to = Branch.initialize(to_location)
- mutter("copy branch from %s to %s", branch_from, br_to)
- if basis_branch is not None:
- basis_branch.push_stores(br_to)
- br_to.working_tree().set_root_id(branch_from.get_root_id())
- if revision is None:
- revision = branch_from.last_revision()
- br_to.update_revisions(branch_from, stop_revision=revision)
- build_working_dir(to_location)
- br_to.set_parent(branch_from.base)
- mutter("copied")
- return br_to
--
gpg key@ keyserver.linux.it: Goffredo Baroncelli (ghigo) <kreijack AT inwind.it>
Key fingerprint = CE3C 7E01 6782 30A3 5B87 87C0 BB86 505C 6B2A CFF9
More information about the bazaar
mailing list