Rev 6361: (jelmer) Avoid loading XML modules when importing CHKSerializer. (Jelmer in file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/
Patch Queue Manager
pqm at pqm.ubuntu.com
Mon Dec 12 14:47:04 UTC 2011
At file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/
------------------------------------------------------------
revno: 6361 [merge]
revision-id: pqm at pqm.ubuntu.com-20111212144703-suptg74yxhcpon4p
parent: pqm at pqm.ubuntu.com-20111212142156-5zjw49zf7l0wxg6h
parent: jelmer at samba.org-20111212134731-cfgz8aiuze0byuq4
committer: Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2011-12-12 14:47:03 +0000
message:
(jelmer) Avoid loading XML modules when importing CHKSerializer. (Jelmer
Vernooij)
modified:
bzrlib/chk_serializer.py chk_serializer.py-20081002064345-2tofdfj2eqq01h4b-1
bzrlib/tests/test_import_tariff.py test_import_tariff.p-20100207155145-ff9infp7goncs7zh-1
bzrlib/tests/test_xml.py test_xml.py-20050905091053-80b45588931a9b35
bzrlib/xml5.py xml5.py-20080328030717-t9guwinq8hom0ar3-1
bzrlib/xml8.py xml5.py-20050907032657-aac8f960815b66b1
bzrlib/xml_serializer.py xml.py-20050309040759-57d51586fdec365d
=== modified file 'bzrlib/chk_serializer.py'
--- a/bzrlib/chk_serializer.py 2011-02-19 22:39:03 +0000
+++ b/bzrlib/chk_serializer.py 2011-12-12 13:47:31 +0000
@@ -16,11 +16,21 @@
"""Serializer object for CHK based inventory storage."""
+from cStringIO import StringIO
+
+from bzrlib import lazy_import
+lazy_import.lazy_import(globals(),
+"""
+from bzrlib import (
+ xml_serializer,
+ )
+""")
from bzrlib import (
bencode,
cache_utf8,
+ errors,
revision as _mod_revision,
- xml8,
+ serializer,
)
@@ -129,17 +139,106 @@
return self.read_revision_from_string(f.read())
-class CHKSerializer(xml8.Serializer_v8):
+class CHKSerializer(serializer.Serializer):
"""A CHKInventory based serializer with 'plain' behaviour."""
format_num = '9'
revision_format_num = None
support_altered_by_hack = False
+ supported_kinds = set(['file', 'directory', 'symlink'])
def __init__(self, node_size, search_key_name):
self.maximum_size = node_size
self.search_key_name = search_key_name
+ def _unpack_inventory(self, elt, revision_id=None, entry_cache=None,
+ return_from_cache=False):
+ """Construct from XML Element"""
+ inv = xml_serializer.unpack_inventory_flat(elt, self.format_num,
+ xml_serializer.unpack_inventory_entry, entry_cache,
+ return_from_cache)
+ return inv
+
+ def read_inventory_from_string(self, xml_string, revision_id=None,
+ entry_cache=None, return_from_cache=False):
+ """Read xml_string into an inventory object.
+
+ :param xml_string: The xml to read.
+ :param revision_id: If not-None, the expected revision id of the
+ inventory.
+ :param entry_cache: An optional cache of InventoryEntry objects. If
+ supplied we will look up entries via (file_id, revision_id) which
+ should map to a valid InventoryEntry (File/Directory/etc) object.
+ :param return_from_cache: Return entries directly from the cache,
+ rather than copying them first. This is only safe if the caller
+ promises not to mutate the returned inventory entries, but it can
+ make some operations significantly faster.
+ """
+ try:
+ return self._unpack_inventory(
+ xml_serializer.fromstring(xml_string), revision_id,
+ entry_cache=entry_cache,
+ return_from_cache=return_from_cache)
+ except xml_serializer.ParseError, e:
+ raise errors.UnexpectedInventoryFormat(e)
+
+ def read_inventory(self, f, revision_id=None):
+ """Read an inventory from a file-like object."""
+ try:
+ try:
+ return self._unpack_inventory(self._read_element(f),
+ revision_id=None)
+ finally:
+ f.close()
+ except xml_serializer.ParseError, e:
+ raise errors.UnexpectedInventoryFormat(e)
+
+ def write_inventory_to_lines(self, inv):
+ """Return a list of lines with the encoded inventory."""
+ return self.write_inventory(inv, None)
+
+ def write_inventory_to_string(self, inv, working=False):
+ """Just call write_inventory with a StringIO and return the value.
+
+ :param working: If True skip history data - text_sha1, text_size,
+ reference_revision, symlink_target.
+ """
+ sio = StringIO()
+ self.write_inventory(inv, sio, working)
+ return sio.getvalue()
+
+ def write_inventory(self, inv, f, working=False):
+ """Write inventory to a file.
+
+ :param inv: the inventory to write.
+ :param f: the file to write. (May be None if the lines are the desired
+ output).
+ :param working: If True skip history data - text_sha1, text_size,
+ reference_revision, symlink_target.
+ :return: The inventory as a list of lines.
+ """
+ output = []
+ append = output.append
+ if inv.revision_id is not None:
+ revid1 = ' revision_id="'
+ revid2 = xml_serializer.encode_and_escape(inv.revision_id)
+ else:
+ revid1 = ""
+ revid2 = ""
+ append('<inventory format="%s"%s%s>\n' % (
+ self.format_num, revid1, revid2))
+ append('<directory file_id="%s name="%s revision="%s />\n' % (
+ xml_serializer.encode_and_escape(inv.root.file_id),
+ xml_serializer.encode_and_escape(inv.root.name),
+ xml_serializer.encode_and_escape(inv.root.revision)))
+ xml_serializer.serialize_inventory_flat(inv,
+ append,
+ root_id=None, supported_kinds=self.supported_kinds,
+ working=working)
+ if f is not None:
+ f.writelines(output)
+ return output
+
chk_serializer_255_bigpage = CHKSerializer(65536, 'hash-255-way')
=== modified file 'bzrlib/tests/test_import_tariff.py'
--- a/bzrlib/tests/test_import_tariff.py 2011-08-30 09:30:27 +0000
+++ b/bzrlib/tests/test_import_tariff.py 2011-12-12 11:23:28 +0000
@@ -186,6 +186,8 @@
'bzrlib.smart.server',
'bzrlib.transform',
'bzrlib.version_info_formats.format_rio',
+ 'bzrlib.xml_serializer',
+ 'bzrlib.xml8',
'getpass',
'kerberos',
'smtplib',
@@ -253,6 +255,8 @@
'bzrlib.transform',
'bzrlib.version_info_formats.format_rio',
'bzrlib.workingtree_4',
+ 'bzrlib.xml_serializer',
+ 'bzrlib.xml8',
'getpass',
'kerberos',
'smtplib',
=== modified file 'bzrlib/tests/test_xml.py'
--- a/bzrlib/tests/test_xml.py 2011-02-21 23:43:10 +0000
+++ b/bzrlib/tests/test_xml.py 2011-12-12 12:11:51 +0000
@@ -508,34 +508,33 @@
def setUp(self):
TestCase.setUp(self)
# Keep the cache clear before and after the test
- bzrlib.xml8._ensure_utf8_re()
- bzrlib.xml8._clear_cache()
- self.addCleanup(bzrlib.xml8._clear_cache)
+ bzrlib.xml_serializer._clear_cache()
+ self.addCleanup(bzrlib.xml_serializer._clear_cache)
def test_simple_ascii(self):
# _encode_and_escape always appends a final ", because these parameters
# are being used in xml attributes, and by returning it now, we have to
# do fewer string operations later.
- val = bzrlib.xml8._encode_and_escape('foo bar')
+ val = bzrlib.xml_serializer.encode_and_escape('foo bar')
self.assertEqual('foo bar"', val)
# The second time should be cached
- val2 = bzrlib.xml8._encode_and_escape('foo bar')
+ val2 = bzrlib.xml_serializer.encode_and_escape('foo bar')
self.assertIs(val2, val)
def test_ascii_with_xml(self):
self.assertEqual('&amp;&apos;&quot;&lt;&gt;"',
- bzrlib.xml8._encode_and_escape('&\'"<>'))
+ bzrlib.xml_serializer.encode_and_escape('&\'"<>'))
def test_utf8_with_xml(self):
# u'\xb5\xe5&\u062c'
utf8_str = '\xc2\xb5\xc3\xa5&\xd8\xac'
self.assertEqual('&#181;&#229;&amp;&#1580;"',
- bzrlib.xml8._encode_and_escape(utf8_str))
+ bzrlib.xml_serializer.encode_and_escape(utf8_str))
def test_unicode(self):
uni_str = u'\xb5\xe5&\u062c'
self.assertEqual('&#181;&#229;&amp;&#1580;"',
- bzrlib.xml8._encode_and_escape(uni_str))
+ bzrlib.xml_serializer.encode_and_escape(uni_str))
class TestMisc(TestCase):
=== modified file 'bzrlib/xml5.py'
--- a/bzrlib/xml5.py 2010-02-17 17:11:16 +0000
+++ b/bzrlib/xml5.py 2011-12-12 12:11:51 +0000
@@ -19,8 +19,13 @@
errors,
inventory,
xml6,
- xml8,
- )
+ )
+from bzrlib.xml_serializer import (
+ encode_and_escape,
+ get_utf8_or_ascii,
+ unpack_inventory_entry,
+ )
+
class Serializer_v5(xml6.Serializer_v6):
"""Version 5 serializer
@@ -35,7 +40,7 @@
"""Construct from XML Element
"""
root_id = elt.get('file_id') or inventory.ROOT_ID
- root_id = xml8._get_utf8_or_ascii(root_id)
+ root_id = get_utf8_or_ascii(root_id)
format = elt.get('format')
if format is not None:
@@ -52,10 +57,9 @@
# avoiding attributes 2.46s
# adding assertions 2.50s
# last_parent cache 2.52s (worse, removed)
- unpack_entry = self._unpack_entry
byid = inv._byid
for e in elt:
- ie = unpack_entry(e, entry_cache=entry_cache,
+ ie = unpack_inventory_entry(e, entry_cache=entry_cache,
return_from_cache=return_from_cache)
parent_id = ie.parent_id
if parent_id is None:
@@ -92,13 +96,13 @@
"""Append the inventory root to output."""
if inv.root.file_id not in (None, inventory.ROOT_ID):
fileid1 = ' file_id="'
- fileid2 = xml8._encode_and_escape(inv.root.file_id)
+ fileid2 = encode_and_escape(inv.root.file_id)
else:
fileid1 = ""
fileid2 = ""
if inv.revision_id is not None:
revid1 = ' revision_id="'
- revid2 = xml8._encode_and_escape(inv.revision_id)
+ revid2 = encode_and_escape(inv.revision_id)
else:
revid1 = ""
revid2 = ""
=== modified file 'bzrlib/xml8.py'
--- a/bzrlib/xml8.py 2011-02-21 15:09:19 +0000
+++ b/bzrlib/xml8.py 2011-12-12 13:47:31 +0000
@@ -15,12 +15,9 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
import cStringIO
-import re
from bzrlib import (
cache_utf8,
- errors,
- inventory,
lazy_regex,
revision as _mod_revision,
trace,
@@ -29,23 +26,17 @@
Element,
SubElement,
XMLSerializer,
+ encode_and_escape,
escape_invalid_chars,
+ get_utf8_or_ascii,
+ serialize_inventory_flat,
+ unpack_inventory_entry,
+ unpack_inventory_flat,
)
-from bzrlib.inventory import InventoryEntry
from bzrlib.revision import Revision
from bzrlib.errors import BzrError
-_utf8_re = None
-_unicode_re = None
-_xml_escape_map = {
- "&":'&amp;',
- "'":"&apos;", # FIXME: overkill
- "\"":"&quot;",
- "<":"&lt;",
- ">":"&gt;",
- }
-
_xml_unescape_map = {
'apos':"'",
'quot':'"',
@@ -65,115 +56,13 @@
return unichr(int(code[1:])).encode('utf8')
-_unescape_re = None
-
+_unescape_re = lazy_regex.lazy_compile('\&([^;]*);')
def _unescape_xml(data):
"""Unescape predefined XML entities in a string of data."""
- global _unescape_re
- if _unescape_re is None:
- _unescape_re = re.compile('\&([^;]*);')
return _unescape_re.sub(_unescaper, data)
-def _ensure_utf8_re():
- """Make sure the _utf8_re and _unicode_re regexes have been compiled."""
- global _utf8_re, _unicode_re
- if _utf8_re is None:
- _utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
- if _unicode_re is None:
- _unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
-
-
-def _unicode_escape_replace(match, _map=_xml_escape_map):
- """Replace a string of non-ascii, non XML safe characters with their escape
-
- This will escape both Standard XML escapes, like <>"', etc.
- As well as escaping non ascii characters, because ElementTree did.
- This helps us remain compatible to older versions of bzr. We may change
- our policy in the future, though.
- """
- # jam 20060816 Benchmarks show that try/KeyError is faster if you
- # expect the entity to rarely miss. There is about a 10% difference
- # in overall time. But if you miss frequently, then if None is much
- # faster. For our use case, we *rarely* have a revision id, file id
- # or path name that is unicode. So use try/KeyError.
- try:
- return _map[match.group()]
- except KeyError:
- return "&#%d;" % ord(match.group())
-
-
-def _utf8_escape_replace(match, _map=_xml_escape_map):
- """Escape utf8 characters into XML safe ones.
-
- This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
- or it is handling characters with the high-bit set. For ascii characters,
- we just lookup the replacement in the dictionary. For everything else, we
- decode back into Unicode, and then use the XML escape code.
- """
- try:
- return _map[match.group()]
- except KeyError:
- return ''.join('&#%d;' % ord(uni_chr)
- for uni_chr in match.group().decode('utf8'))
-
-
-_to_escaped_map = {}
-
-def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
- """Encode the string into utf8, and escape invalid XML characters"""
- # We frequently get entities we have not seen before, so it is better
- # to check if None, rather than try/KeyError
- text = _map.get(unicode_or_utf8_str)
- if text is None:
- if unicode_or_utf8_str.__class__ is unicode:
- # The alternative policy is to do a regular UTF8 encoding
- # and then escape only XML meta characters.
- # Performance is equivalent once you use cache_utf8. *However*
- # this makes the serialized texts incompatible with old versions
- # of bzr. So no net gain. (Perhaps the read code would handle utf8
- # better than entity escapes, but cElementTree seems to do just fine
- # either way)
- text = str(_unicode_re.sub(_unicode_escape_replace,
- unicode_or_utf8_str)) + '"'
- else:
- # Plain strings are considered to already be in utf-8 so we do a
- # slightly different method for escaping.
- text = _utf8_re.sub(_utf8_escape_replace,
- unicode_or_utf8_str) + '"'
- _map[unicode_or_utf8_str] = text
- return text
-
-
-def _get_utf8_or_ascii(a_str,
- _encode_utf8=cache_utf8.encode,
- _get_cached_ascii=cache_utf8.get_cached_ascii):
- """Return a cached version of the string.
-
- cElementTree will return a plain string if the XML is plain ascii. It only
- returns Unicode when it needs to. We want to work in utf-8 strings. So if
- cElementTree returns a plain string, we can just return the cached version.
- If it is Unicode, then we need to encode it.
-
- :param a_str: An 8-bit string or Unicode as returned by
- cElementTree.Element.get()
- :return: A utf-8 encoded 8-bit string.
- """
- # This is fairly optimized because we know what cElementTree does, this is
- # not meant as a generic function for all cases. Because it is possible for
- # an 8-bit string to not be ascii or valid utf8.
- if a_str.__class__ is unicode:
- return _encode_utf8(a_str)
- else:
- return intern(a_str)
-
-
-def _clear_cache():
- """Clean out the unicode => escaped map"""
- _to_escaped_map.clear()
-
-
class Serializer_v8(XMLSerializer):
"""This serialiser adds rich roots.
@@ -261,83 +150,11 @@
reference_revision, symlink_target.
:return: The inventory as a list of lines.
"""
- _ensure_utf8_re()
- self._check_revisions(inv)
output = []
append = output.append
self._append_inventory_root(append, inv)
- entries = inv.iter_entries()
- # Skip the root
- root_path, root_ie = entries.next()
- for path, ie in entries:
- if ie.parent_id != self.root_id:
- parent_str = ' parent_id="'
- parent_id = _encode_and_escape(ie.parent_id)
- else:
- parent_str = ''
- parent_id = ''
- if ie.kind == 'file':
- if ie.executable:
- executable = ' executable="yes"'
- else:
- executable = ''
- if not working:
- append('<file%s file_id="%s name="%s%s%s revision="%s '
- 'text_sha1="%s" text_size="%d" />\n' % (
- executable, _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name), parent_str, parent_id,
- _encode_and_escape(ie.revision), ie.text_sha1,
- ie.text_size))
- else:
- append('<file%s file_id="%s name="%s%s%s />\n' % (
- executable, _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name), parent_str, parent_id))
- elif ie.kind == 'directory':
- if not working:
- append('<directory file_id="%s name="%s%s%s revision="%s '
- '/>\n' % (
- _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name),
- parent_str, parent_id,
- _encode_and_escape(ie.revision)))
- else:
- append('<directory file_id="%s name="%s%s%s />\n' % (
- _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name),
- parent_str, parent_id))
- elif ie.kind == 'symlink':
- if not working:
- append('<symlink file_id="%s name="%s%s%s revision="%s '
- 'symlink_target="%s />\n' % (
- _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name),
- parent_str, parent_id,
- _encode_and_escape(ie.revision),
- _encode_and_escape(ie.symlink_target)))
- else:
- append('<symlink file_id="%s name="%s%s%s />\n' % (
- _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name),
- parent_str, parent_id))
- elif ie.kind == 'tree-reference':
- if ie.kind not in self.supported_kinds:
- raise errors.UnsupportedInventoryKind(ie.kind)
- if not working:
- append('<tree-reference file_id="%s name="%s%s%s '
- 'revision="%s reference_revision="%s />\n' % (
- _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name),
- parent_str, parent_id,
- _encode_and_escape(ie.revision),
- _encode_and_escape(ie.reference_revision)))
- else:
- append('<tree-reference file_id="%s name="%s%s%s />\n' % (
- _encode_and_escape(ie.file_id),
- _encode_and_escape(ie.name),
- parent_str, parent_id))
- else:
- raise errors.UnsupportedInventoryKind(ie.kind)
- append('</inventory>\n')
+ serialize_inventory_flat(inv, append,
+ self.root_id, self.supported_kinds, working)
if f is not None:
f.writelines(output)
# Just to keep the cache from growing without bounds
@@ -349,16 +166,16 @@
"""Append the inventory root to output."""
if inv.revision_id is not None:
revid1 = ' revision_id="'
- revid2 = _encode_and_escape(inv.revision_id)
+ revid2 = encode_and_escape(inv.revision_id)
else:
revid1 = ""
revid2 = ""
append('<inventory format="%s"%s%s>\n' % (
self.format_num, revid1, revid2))
append('<directory file_id="%s name="%s revision="%s />\n' % (
- _encode_and_escape(inv.root.file_id),
- _encode_and_escape(inv.root.name),
- _encode_and_escape(inv.root.revision)))
+ encode_and_escape(inv.root.file_id),
+ encode_and_escape(inv.root.name),
+ encode_and_escape(inv.root.revision)))
def _pack_revision(self, rev):
"""Revision object -> xml tree"""
@@ -408,119 +225,19 @@
prop_elt.tail = '\n'
top_elt.tail = '\n'
+ def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
+ # This is here because it's overridden by xml7
+ return unpack_inventory_entry(elt, entry_cache,
+ return_from_cache)
+
def _unpack_inventory(self, elt, revision_id=None, entry_cache=None,
return_from_cache=False):
"""Construct from XML Element"""
- if elt.tag != 'inventory':
- raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
- format = elt.get('format')
- if format != self.format_num:
- raise errors.UnexpectedInventoryFormat('Invalid format version %r'
- % format)
- revision_id = elt.get('revision_id')
- if revision_id is not None:
- revision_id = cache_utf8.encode(revision_id)
- inv = inventory.Inventory(root_id=None, revision_id=revision_id)
- for e in elt:
- ie = self._unpack_entry(e, entry_cache=entry_cache,
- return_from_cache=return_from_cache)
- inv.add(ie)
+ inv = unpack_inventory_flat(elt, self.format_num, self._unpack_entry,
+ entry_cache, return_from_cache)
self._check_cache_size(len(inv), entry_cache)
return inv
- def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
- elt_get = elt.get
- file_id = elt_get('file_id')
- revision = elt_get('revision')
- # Check and see if we have already unpacked this exact entry
- # Some timings for "repo.revision_trees(last_100_revs)"
- # bzr mysql
- # unmodified 4.1s 40.8s
- # using lru 3.5s
- # using fifo 2.83s 29.1s
- # lru._cache 2.8s
- # dict 2.75s 26.8s
- # inv.add 2.5s 26.0s
- # no_copy 2.00s 20.5s
- # no_c,dict 1.95s 18.0s
- # Note that a cache of 10k nodes is more than sufficient to hold all of
- # the inventory for the last 100 revs for bzr, but not for mysql (20k
- # is enough for mysql, which saves the same 2s as using a dict)
-
- # Breakdown of mysql using time.clock()
- # 4.1s 2 calls to element.get for file_id, revision_id
- # 4.5s cache_hit lookup
- # 7.1s InventoryFile.copy()
- # 2.4s InventoryDirectory.copy()
- # 0.4s decoding unique entries
- # 1.6s decoding entries after FIFO fills up
- # 0.8s Adding nodes to FIFO (including flushes)
- # 0.1s cache miss lookups
- # Using an LRU cache
- # 4.1s 2 calls to element.get for file_id, revision_id
- # 9.9s cache_hit lookup
- # 10.8s InventoryEntry.copy()
- # 0.3s cache miss lookus
- # 1.2s decoding entries
- # 1.0s adding nodes to LRU
- if entry_cache is not None and revision is not None:
- key = (file_id, revision)
- try:
- # We copy it, because some operations may mutate it
- cached_ie = entry_cache[key]
- except KeyError:
- pass
- else:
- # Only copying directory entries drops us 2.85s => 2.35s
- if return_from_cache:
- if cached_ie.kind == 'directory':
- return cached_ie.copy()
- return cached_ie
- return cached_ie.copy()
-
- kind = elt.tag
- if not InventoryEntry.versionable_kind(kind):
- raise AssertionError('unsupported entry kind %s' % kind)
-
- get_cached = _get_utf8_or_ascii
-
- file_id = get_cached(file_id)
- if revision is not None:
- revision = get_cached(revision)
- parent_id = elt_get('parent_id')
- if parent_id is not None:
- parent_id = get_cached(parent_id)
-
- if kind == 'directory':
- ie = inventory.InventoryDirectory(file_id,
- elt_get('name'),
- parent_id)
- elif kind == 'file':
- ie = inventory.InventoryFile(file_id,
- elt_get('name'),
- parent_id)
- ie.text_sha1 = elt_get('text_sha1')
- if elt_get('executable') == 'yes':
- ie.executable = True
- v = elt_get('text_size')
- ie.text_size = v and int(v)
- elif kind == 'symlink':
- ie = inventory.InventoryLink(file_id,
- elt_get('name'),
- parent_id)
- ie.symlink_target = elt_get('symlink_target')
- else:
- raise errors.UnsupportedInventoryKind(kind)
- ie.revision = revision
- if revision is not None and entry_cache is not None:
- # We cache a copy() because callers like to mutate objects, and
- # that would cause the item in cache to mutate as well.
- # This has a small effect on many-inventory performance, because
- # the majority fraction is spent in cache hits, not misses.
- entry_cache[key] = ie.copy()
-
- return ie
-
def _unpack_revision(self, elt):
"""XML Element -> Revision object"""
format = elt.get('format')
@@ -531,7 +248,7 @@
if format != format_num:
raise BzrError("invalid format version %r on revision"
% format)
- get_cached = _get_utf8_or_ascii
+ get_cached = get_utf8_or_ascii
rev = Revision(committer = elt.get('committer'),
timestamp = float(elt.get('timestamp')),
revision_id = get_cached(elt.get('revision_id')),
=== modified file 'bzrlib/xml_serializer.py'
--- a/bzrlib/xml_serializer.py 2010-09-17 04:35:23 +0000
+++ b/bzrlib/xml_serializer.py 2011-12-12 13:47:31 +0000
@@ -50,7 +50,12 @@
import util.elementtree as elementtree
from xml.parsers.expat import ExpatError as ParseError
-from bzrlib import errors
+from bzrlib import (
+ cache_utf8,
+ inventory,
+ lazy_regex,
+ errors,
+ )
class XMLSerializer(Serializer):
@@ -130,3 +135,304 @@
return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
lambda match: match.group(0).encode('unicode_escape'),
message)
+
+
+def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
+ """Return a cached version of the string.
+
+ cElementTree will return a plain string if the XML is plain ascii. It only
+ returns Unicode when it needs to. We want to work in utf-8 strings. So if
+ cElementTree returns a plain string, we can just return the cached version.
+ If it is Unicode, then we need to encode it.
+
+ :param a_str: An 8-bit string or Unicode as returned by
+ cElementTree.Element.get()
+ :return: A utf-8 encoded 8-bit string.
+ """
+ # This is fairly optimized because we know what cElementTree does, this is
+ # not meant as a generic function for all cases. Because it is possible for
+ # an 8-bit string to not be ascii or valid utf8.
+ if a_str.__class__ is unicode:
+ return _encode_utf8(a_str)
+ else:
+ return intern(a_str)
+
+
+_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')
+_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
+
+
+_xml_escape_map = {
+ "&":'&amp;',
+ "'":"&apos;", # FIXME: overkill
+ "\"":"&quot;",
+ "<":"&lt;",
+ ">":"&gt;",
+ }
+
+
+def _unicode_escape_replace(match, _map=_xml_escape_map):
+ """Replace a string of non-ascii, non XML safe characters with their escape
+
+ This will escape both Standard XML escapes, like <>"', etc.
+ As well as escaping non ascii characters, because ElementTree did.
+ This helps us remain compatible to older versions of bzr. We may change
+ our policy in the future, though.
+ """
+ # jam 20060816 Benchmarks show that try/KeyError is faster if you
+ # expect the entity to rarely miss. There is about a 10% difference
+ # in overall time. But if you miss frequently, then if None is much
+ # faster. For our use case, we *rarely* have a revision id, file id
+ # or path name that is unicode. So use try/KeyError.
+ try:
+ return _map[match.group()]
+ except KeyError:
+ return "&#%d;" % ord(match.group())
+
+
+def _utf8_escape_replace(match, _map=_xml_escape_map):
+ """Escape utf8 characters into XML safe ones.
+
+ This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
+ or it is handling characters with the high-bit set. For ascii characters,
+ we just lookup the replacement in the dictionary. For everything else, we
+ decode back into Unicode, and then use the XML escape code.
+ """
+ try:
+ return _map[match.group()]
+ except KeyError:
+ return ''.join('&#%d;' % ord(uni_chr)
+ for uni_chr in match.group().decode('utf8'))
+
+
+_to_escaped_map = {}
+
+def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
+ """Encode the string into utf8, and escape invalid XML characters"""
+ # We frequently get entities we have not seen before, so it is better
+ # to check if None, rather than try/KeyError
+ text = _map.get(unicode_or_utf8_str)
+ if text is None:
+ if unicode_or_utf8_str.__class__ is unicode:
+ # The alternative policy is to do a regular UTF8 encoding
+ # and then escape only XML meta characters.
+ # Performance is equivalent once you use cache_utf8. *However*
+ # this makes the serialized texts incompatible with old versions
+ # of bzr. So no net gain. (Perhaps the read code would handle utf8
+ # better than entity escapes, but cElementTree seems to do just fine
+ # either way)
+ text = str(_unicode_re.sub(_unicode_escape_replace,
+ unicode_or_utf8_str)) + '"'
+ else:
+ # Plain strings are considered to already be in utf-8 so we do a
+ # slightly different method for escaping.
+ text = _utf8_re.sub(_utf8_escape_replace,
+ unicode_or_utf8_str) + '"'
+ _map[unicode_or_utf8_str] = text
+ return text
+
+
+def _clear_cache():
+ """Clean out the unicode => escaped map"""
+ _to_escaped_map.clear()
+
+
+def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
+ elt_get = elt.get
+ file_id = elt_get('file_id')
+ revision = elt_get('revision')
+ # Check and see if we have already unpacked this exact entry
+ # Some timings for "repo.revision_trees(last_100_revs)"
+ # bzr mysql
+ # unmodified 4.1s 40.8s
+ # using lru 3.5s
+ # using fifo 2.83s 29.1s
+ # lru._cache 2.8s
+ # dict 2.75s 26.8s
+ # inv.add 2.5s 26.0s
+ # no_copy 2.00s 20.5s
+ # no_c,dict 1.95s 18.0s
+ # Note that a cache of 10k nodes is more than sufficient to hold all of
+ # the inventory for the last 100 revs for bzr, but not for mysql (20k
+ # is enough for mysql, which saves the same 2s as using a dict)
+
+ # Breakdown of mysql using time.clock()
+ # 4.1s 2 calls to element.get for file_id, revision_id
+ # 4.5s cache_hit lookup
+ # 7.1s InventoryFile.copy()
+ # 2.4s InventoryDirectory.copy()
+ # 0.4s decoding unique entries
+ # 1.6s decoding entries after FIFO fills up
+ # 0.8s Adding nodes to FIFO (including flushes)
+ # 0.1s cache miss lookups
+ # Using an LRU cache
+ # 4.1s 2 calls to element.get for file_id, revision_id
+ # 9.9s cache_hit lookup
+ # 10.8s InventoryEntry.copy()
+ # 0.3s cache miss lookus
+ # 1.2s decoding entries
+ # 1.0s adding nodes to LRU
+ if entry_cache is not None and revision is not None:
+ key = (file_id, revision)
+ try:
+ # We copy it, because some operations may mutate it
+ cached_ie = entry_cache[key]
+ except KeyError:
+ pass
+ else:
+ # Only copying directory entries drops us 2.85s => 2.35s
+ if return_from_cache:
+ if cached_ie.kind == 'directory':
+ return cached_ie.copy()
+ return cached_ie
+ return cached_ie.copy()
+
+ kind = elt.tag
+ if not inventory.InventoryEntry.versionable_kind(kind):
+ raise AssertionError('unsupported entry kind %s' % kind)
+
+ file_id = get_utf8_or_ascii(file_id)
+ if revision is not None:
+ revision = get_utf8_or_ascii(revision)
+ parent_id = elt_get('parent_id')
+ if parent_id is not None:
+ parent_id = get_utf8_or_ascii(parent_id)
+
+ if kind == 'directory':
+ ie = inventory.InventoryDirectory(file_id,
+ elt_get('name'),
+ parent_id)
+ elif kind == 'file':
+ ie = inventory.InventoryFile(file_id,
+ elt_get('name'),
+ parent_id)
+ ie.text_sha1 = elt_get('text_sha1')
+ if elt_get('executable') == 'yes':
+ ie.executable = True
+ v = elt_get('text_size')
+ ie.text_size = v and int(v)
+ elif kind == 'symlink':
+ ie = inventory.InventoryLink(file_id,
+ elt_get('name'),
+ parent_id)
+ ie.symlink_target = elt_get('symlink_target')
+ else:
+ raise errors.UnsupportedInventoryKind(kind)
+ ie.revision = revision
+ if revision is not None and entry_cache is not None:
+ # We cache a copy() because callers like to mutate objects, and
+ # that would cause the item in cache to mutate as well.
+ # This has a small effect on many-inventory performance, because
+ # the majority fraction is spent in cache hits, not misses.
+ entry_cache[key] = ie.copy()
+
+ return ie
+
+
+def unpack_inventory_flat(elt, format_num, unpack_entry,
+ entry_cache=None, return_from_cache=False):
+ """Unpack a flat XML inventory.
+
+ :param elt: XML element for the inventory
+ :param format_num: Expected format number
+ :param unpack_entry: Function for unpacking inventory entries
+ :return: An inventory
+ :raise UnexpectedInventoryFormat: When unexpected elements or data is
+ encountered
+ """
+ if elt.tag != 'inventory':
+ raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
+ format = elt.get('format')
+ if format != format_num:
+ raise errors.UnexpectedInventoryFormat('Invalid format version %r'
+ % format)
+ revision_id = elt.get('revision_id')
+ if revision_id is not None:
+ revision_id = cache_utf8.encode(revision_id)
+ inv = inventory.Inventory(root_id=None, revision_id=revision_id)
+ for e in elt:
+ ie = unpack_entry(e, entry_cache, return_from_cache)
+ inv.add(ie)
+ return inv
+
+
+def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
+ """Serialize an inventory to a flat XML file.
+
+ :param inv: Inventory to serialize
+ :param append: Function for writing a line of output
+ :param working: If True skip history data - text_sha1, text_size,
+ reference_revision, symlink_target. self._check_revisions(inv)
+ """
+ entries = inv.iter_entries()
+ # Skip the root
+ root_path, root_ie = entries.next()
+ for path, ie in entries:
+ if ie.parent_id != root_id:
+ parent_str = ' parent_id="'
+ parent_id = encode_and_escape(ie.parent_id)
+ else:
+ parent_str = ''
+ parent_id = ''
+ if ie.kind == 'file':
+ if ie.executable:
+ executable = ' executable="yes"'
+ else:
+ executable = ''
+ if not working:
+ append('<file%s file_id="%s name="%s%s%s revision="%s '
+ 'text_sha1="%s" text_size="%d" />\n' % (
+ executable, encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name), parent_str, parent_id,
+ encode_and_escape(ie.revision), ie.text_sha1,
+ ie.text_size))
+ else:
+ append('<file%s file_id="%s name="%s%s%s />\n' % (
+ executable, encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name), parent_str, parent_id))
+ elif ie.kind == 'directory':
+ if not working:
+ append('<directory file_id="%s name="%s%s%s revision="%s '
+ '/>\n' % (
+ encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name),
+ parent_str, parent_id,
+ encode_and_escape(ie.revision)))
+ else:
+ append('<directory file_id="%s name="%s%s%s />\n' % (
+ encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name),
+ parent_str, parent_id))
+ elif ie.kind == 'symlink':
+ if not working:
+ append('<symlink file_id="%s name="%s%s%s revision="%s '
+ 'symlink_target="%s />\n' % (
+ encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name),
+ parent_str, parent_id,
+ encode_and_escape(ie.revision),
+ encode_and_escape(ie.symlink_target)))
+ else:
+ append('<symlink file_id="%s name="%s%s%s />\n' % (
+ encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name),
+ parent_str, parent_id))
+ elif ie.kind == 'tree-reference':
+ if ie.kind not in supported_kinds:
+ raise errors.UnsupportedInventoryKind(ie.kind)
+ if not working:
+ append('<tree-reference file_id="%s name="%s%s%s '
+ 'revision="%s reference_revision="%s />\n' % (
+ encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name),
+ parent_str, parent_id,
+ encode_and_escape(ie.revision),
+ encode_and_escape(ie.reference_revision)))
+ else:
+ append('<tree-reference file_id="%s name="%s%s%s />\n' % (
+ encode_and_escape(ie.file_id),
+ encode_and_escape(ie.name),
+ parent_str, parent_id))
+ else:
+ raise errors.UnsupportedInventoryKind(ie.kind)
+ append('</inventory>\n')
More information about the bazaar-commits
mailing list