Rev 6361: (jelmer) Avoid loading XML modules when importing CHKSerializer. (Jelmer Vernooij) in file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/

Mon Dec 12 14:47:04 UTC 2011

At file:///srv/pqm.bazaar-vcs.org/archives/thelove/bzr/%2Btrunk/

------------------------------------------------------------
revno: 6361 [merge]
revision-id: pqm at pqm.ubuntu.com-20111212144703-suptg74yxhcpon4p
parent: pqm at pqm.ubuntu.com-20111212142156-5zjw49zf7l0wxg6h
parent: jelmer at samba.org-20111212134731-cfgz8aiuze0byuq4
committer: Patch Queue Manager <pqm at pqm.ubuntu.com>
branch nick: +trunk
timestamp: Mon 2011-12-12 14:47:03 +0000
message:
  (jelmer) Avoid loading XML modules when importing CHKSerializer. (Jelmer
   Vernooij)
modified:
  bzrlib/chk_serializer.py       chk_serializer.py-20081002064345-2tofdfj2eqq01h4b-1
  bzrlib/tests/test_import_tariff.py test_import_tariff.p-20100207155145-ff9infp7goncs7zh-1
  bzrlib/tests/test_xml.py       test_xml.py-20050905091053-80b45588931a9b35
  bzrlib/xml5.py                 xml5.py-20080328030717-t9guwinq8hom0ar3-1
  bzrlib/xml8.py                 xml5.py-20050907032657-aac8f960815b66b1
  bzrlib/xml_serializer.py       xml.py-20050309040759-57d51586fdec365d
=== modified file 'bzrlib/chk_serializer.py'

--- a/bzrlib/chk_serializer.py	2011-02-19 22:39:03 +0000
+++ b/bzrlib/chk_serializer.py	2011-12-12 13:47:31 +0000
@@ -16,11 +16,21 @@
 
 """Serializer object for CHK based inventory storage."""
 
+from cStringIO import StringIO
+
+from bzrlib import lazy_import
+lazy_import.lazy_import(globals(),
+"""
+from bzrlib import (
+    xml_serializer,
+    )
+""")
 from bzrlib import (
     bencode,
     cache_utf8,
+    errors,
     revision as _mod_revision,
-    xml8,
+    serializer,
     )
 
 
@@ -129,17 +139,106 @@
         return self.read_revision_from_string(f.read())
 
 
-class CHKSerializer(xml8.Serializer_v8):
+class CHKSerializer(serializer.Serializer):
     """A CHKInventory based serializer with 'plain' behaviour."""
 
     format_num = '9'
     revision_format_num = None
     support_altered_by_hack = False
+    supported_kinds = set(['file', 'directory', 'symlink'])
 
     def __init__(self, node_size, search_key_name):
         self.maximum_size = node_size
         self.search_key_name = search_key_name
 
+    def _unpack_inventory(self, elt, revision_id=None, entry_cache=None,
+                          return_from_cache=False):
+        """Construct from XML Element"""
+        inv = xml_serializer.unpack_inventory_flat(elt, self.format_num,
+            xml_serializer.unpack_inventory_entry, entry_cache,
+            return_from_cache)
+        return inv
+
+    def read_inventory_from_string(self, xml_string, revision_id=None,
+                                   entry_cache=None, return_from_cache=False):
+        """Read xml_string into an inventory object.
+
+        :param xml_string: The xml to read.
+        :param revision_id: If not-None, the expected revision id of the
+            inventory.
+        :param entry_cache: An optional cache of InventoryEntry objects. If
+            supplied we will look up entries via (file_id, revision_id) which
+            should map to a valid InventoryEntry (File/Directory/etc) object.
+        :param return_from_cache: Return entries directly from the cache,
+            rather than copying them first. This is only safe if the caller
+            promises not to mutate the returned inventory entries, but it can
+            make some operations significantly faster.
+        """
+        try:
+            return self._unpack_inventory(
+                xml_serializer.fromstring(xml_string), revision_id,
+                entry_cache=entry_cache,
+                return_from_cache=return_from_cache)
+        except xml_serializer.ParseError, e:
+            raise errors.UnexpectedInventoryFormat(e)
+
+    def read_inventory(self, f, revision_id=None):
+        """Read an inventory from a file-like object."""
+        try:
+            try:
+                return self._unpack_inventory(self._read_element(f),
+                    revision_id=None)
+            finally:
+                f.close()
+        except xml_serializer.ParseError, e:
+            raise errors.UnexpectedInventoryFormat(e)
+
+    def write_inventory_to_lines(self, inv):
+        """Return a list of lines with the encoded inventory."""
+        return self.write_inventory(inv, None)
+
+    def write_inventory_to_string(self, inv, working=False):
+        """Just call write_inventory with a StringIO and return the value.
+
+        :param working: If True skip history data - text_sha1, text_size,
+            reference_revision, symlink_target.
+        """
+        sio = StringIO()
+        self.write_inventory(inv, sio, working)
+        return sio.getvalue()
+
+    def write_inventory(self, inv, f, working=False):
+        """Write inventory to a file.
+
+        :param inv: the inventory to write.
+        :param f: the file to write. (May be None if the lines are the desired
+            output).
+        :param working: If True skip history data - text_sha1, text_size,
+            reference_revision, symlink_target.
+        :return: The inventory as a list of lines.
+        """
+        output = []
+        append = output.append
+        if inv.revision_id is not None:
+            revid1 = ' revision_id="'
+            revid2 = xml_serializer.encode_and_escape(inv.revision_id)
+        else:
+            revid1 = ""
+            revid2 = ""
+        append('<inventory format="%s"%s%s>\n' % (
+            self.format_num, revid1, revid2))
+        append('<directory file_id="%s name="%s revision="%s />\n' % (
+            xml_serializer.encode_and_escape(inv.root.file_id),
+            xml_serializer.encode_and_escape(inv.root.name),
+            xml_serializer.encode_and_escape(inv.root.revision)))
+        xml_serializer.serialize_inventory_flat(inv,
+            append,
+            root_id=None, supported_kinds=self.supported_kinds, 
+            working=working)
+        if f is not None:
+            f.writelines(output)
+        return output
+
 
 chk_serializer_255_bigpage = CHKSerializer(65536, 'hash-255-way')
 

=== modified file 'bzrlib/tests/test_import_tariff.py'
--- a/bzrlib/tests/test_import_tariff.py	2011-08-30 09:30:27 +0000
+++ b/bzrlib/tests/test_import_tariff.py	2011-12-12 11:23:28 +0000
@@ -186,6 +186,8 @@
             'bzrlib.smart.server',
             'bzrlib.transform',
             'bzrlib.version_info_formats.format_rio',
+            'bzrlib.xml_serializer',
+            'bzrlib.xml8',
             'getpass',
             'kerberos',
             'smtplib',
@@ -253,6 +255,8 @@
             'bzrlib.transform',
             'bzrlib.version_info_formats.format_rio',
             'bzrlib.workingtree_4',
+            'bzrlib.xml_serializer',
+            'bzrlib.xml8',
             'getpass',
             'kerberos',
             'smtplib',

=== modified file 'bzrlib/tests/test_xml.py'
--- a/bzrlib/tests/test_xml.py	2011-02-21 23:43:10 +0000
+++ b/bzrlib/tests/test_xml.py	2011-12-12 12:11:51 +0000
@@ -508,34 +508,33 @@
     def setUp(self):
         TestCase.setUp(self)
         # Keep the cache clear before and after the test
-        bzrlib.xml8._ensure_utf8_re()
-        bzrlib.xml8._clear_cache()
-        self.addCleanup(bzrlib.xml8._clear_cache)
+        bzrlib.xml_serializer._clear_cache()
+        self.addCleanup(bzrlib.xml_serializer._clear_cache)
 
     def test_simple_ascii(self):
         # _encode_and_escape always appends a final ", because these parameters
         # are being used in xml attributes, and by returning it now, we have to
         # do fewer string operations later.
-        val = bzrlib.xml8._encode_and_escape('foo bar')
+        val = bzrlib.xml_serializer.encode_and_escape('foo bar')
         self.assertEqual('foo bar"', val)
         # The second time should be cached
-        val2 = bzrlib.xml8._encode_and_escape('foo bar')
+        val2 = bzrlib.xml_serializer.encode_and_escape('foo bar')
         self.assertIs(val2, val)
 
     def test_ascii_with_xml(self):
         self.assertEqual('&amp;&apos;&quot;&lt;&gt;"',
-                         bzrlib.xml8._encode_and_escape('&\'"<>'))
+                         bzrlib.xml_serializer.encode_and_escape('&\'"<>'))
 
     def test_utf8_with_xml(self):
         # u'\xb5\xe5&\u062c'
         utf8_str = '\xc2\xb5\xc3\xa5&\xd8\xac'
         self.assertEqual('&#181;&#229;&amp;&#1580;"',
-                         bzrlib.xml8._encode_and_escape(utf8_str))
+                         bzrlib.xml_serializer.encode_and_escape(utf8_str))
 
     def test_unicode(self):
         uni_str = u'\xb5\xe5&\u062c'
         self.assertEqual('&#181;&#229;&amp;&#1580;"',
-                         bzrlib.xml8._encode_and_escape(uni_str))
+                         bzrlib.xml_serializer.encode_and_escape(uni_str))
 
 
 class TestMisc(TestCase):

=== modified file 'bzrlib/xml5.py'
--- a/bzrlib/xml5.py	2010-02-17 17:11:16 +0000
+++ b/bzrlib/xml5.py	2011-12-12 12:11:51 +0000
@@ -19,8 +19,13 @@
     errors,
     inventory,
     xml6,
-    xml8,
-    )
+    )
+from bzrlib.xml_serializer import (
+    encode_and_escape,
+    get_utf8_or_ascii,
+    unpack_inventory_entry,
+    )
+
 
 class Serializer_v5(xml6.Serializer_v6):
     """Version 5 serializer
@@ -35,7 +40,7 @@
         """Construct from XML Element
         """
         root_id = elt.get('file_id') or inventory.ROOT_ID
-        root_id = xml8._get_utf8_or_ascii(root_id)
+        root_id = get_utf8_or_ascii(root_id)
 
         format = elt.get('format')
         if format is not None:
@@ -52,10 +57,9 @@
         #   avoiding attributes     2.46s
         #   adding assertions       2.50s
         #   last_parent cache       2.52s (worse, removed)
-        unpack_entry = self._unpack_entry
         byid = inv._byid
         for e in elt:
-            ie = unpack_entry(e, entry_cache=entry_cache,
+            ie = unpack_inventory_entry(e, entry_cache=entry_cache,
                               return_from_cache=return_from_cache)
             parent_id = ie.parent_id
             if parent_id is None:
@@ -92,13 +96,13 @@
         """Append the inventory root to output."""
         if inv.root.file_id not in (None, inventory.ROOT_ID):
             fileid1 = ' file_id="'
-            fileid2 = xml8._encode_and_escape(inv.root.file_id)
+            fileid2 = encode_and_escape(inv.root.file_id)
         else:
             fileid1 = ""
             fileid2 = ""
         if inv.revision_id is not None:
             revid1 = ' revision_id="'
-            revid2 = xml8._encode_and_escape(inv.revision_id)
+            revid2 = encode_and_escape(inv.revision_id)
         else:
             revid1 = ""
             revid2 = ""

=== modified file 'bzrlib/xml8.py'
--- a/bzrlib/xml8.py	2011-02-21 15:09:19 +0000
+++ b/bzrlib/xml8.py	2011-12-12 13:47:31 +0000
@@ -15,12 +15,9 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 
 import cStringIO
-import re
 
 from bzrlib import (
     cache_utf8,
-    errors,
-    inventory,
     lazy_regex,
     revision as _mod_revision,
     trace,
@@ -29,23 +26,17 @@
     Element,
     SubElement,
     XMLSerializer,
+    encode_and_escape,
     escape_invalid_chars,
+    get_utf8_or_ascii,
+    serialize_inventory_flat,
+    unpack_inventory_entry,
+    unpack_inventory_flat,
     )
-from bzrlib.inventory import InventoryEntry
 from bzrlib.revision import Revision
 from bzrlib.errors import BzrError
 
 
-_utf8_re = None
-_unicode_re = None
-_xml_escape_map = {
-    "&":'&amp;',
-    "'":"&apos;", # FIXME: overkill
-    "\"":"&quot;",
-    "<":"&lt;",
-    ">":"&gt;",
-    }
-
 _xml_unescape_map = {
     'apos':"'",
     'quot':'"',
@@ -65,115 +56,13 @@
         return unichr(int(code[1:])).encode('utf8')
 
 
-_unescape_re = None
-
+_unescape_re = lazy_regex.lazy_compile('\&([^;]*);')
 
 def _unescape_xml(data):
     """Unescape predefined XML entities in a string of data."""
-    global _unescape_re
-    if _unescape_re is None:
-        _unescape_re = re.compile('\&([^;]*);')
     return _unescape_re.sub(_unescaper, data)
 
 
-def _ensure_utf8_re():
-    """Make sure the _utf8_re and _unicode_re regexes have been compiled."""
-    global _utf8_re, _unicode_re
-    if _utf8_re is None:
-        _utf8_re = re.compile('[&<>\'\"]|[\x80-\xff]+')
-    if _unicode_re is None:
-        _unicode_re = re.compile(u'[&<>\'\"\u0080-\uffff]')
-
-
-def _unicode_escape_replace(match, _map=_xml_escape_map):
-    """Replace a string of non-ascii, non XML safe characters with their escape
-
-    This will escape both Standard XML escapes, like <>"', etc.
-    As well as escaping non ascii characters, because ElementTree did.
-    This helps us remain compatible to older versions of bzr. We may change
-    our policy in the future, though.
-    """
-    # jam 20060816 Benchmarks show that try/KeyError is faster if you
-    # expect the entity to rarely miss. There is about a 10% difference
-    # in overall time. But if you miss frequently, then if None is much
-    # faster. For our use case, we *rarely* have a revision id, file id
-    # or path name that is unicode. So use try/KeyError.
-    try:
-        return _map[match.group()]
-    except KeyError:
-        return "&#%d;" % ord(match.group())
-
-
-def _utf8_escape_replace(match, _map=_xml_escape_map):
-    """Escape utf8 characters into XML safe ones.
-
-    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
-    or it is handling characters with the high-bit set. For ascii characters,
-    we just lookup the replacement in the dictionary. For everything else, we
-    decode back into Unicode, and then use the XML escape code.
-    """
-    try:
-        return _map[match.group()]
-    except KeyError:
-        return ''.join('&#%d;' % ord(uni_chr)
-                       for uni_chr in match.group().decode('utf8'))
-
-
-_to_escaped_map = {}
-
-def _encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
-    """Encode the string into utf8, and escape invalid XML characters"""
-    # We frequently get entities we have not seen before, so it is better
-    # to check if None, rather than try/KeyError
-    text = _map.get(unicode_or_utf8_str)
-    if text is None:
-        if unicode_or_utf8_str.__class__ is unicode:
-            # The alternative policy is to do a regular UTF8 encoding
-            # and then escape only XML meta characters.
-            # Performance is equivalent once you use cache_utf8. *However*
-            # this makes the serialized texts incompatible with old versions
-            # of bzr. So no net gain. (Perhaps the read code would handle utf8
-            # better than entity escapes, but cElementTree seems to do just fine
-            # either way)
-            text = str(_unicode_re.sub(_unicode_escape_replace,
-                                       unicode_or_utf8_str)) + '"'
-        else:
-            # Plain strings are considered to already be in utf-8 so we do a
-            # slightly different method for escaping.
-            text = _utf8_re.sub(_utf8_escape_replace,
-                                unicode_or_utf8_str) + '"'
-        _map[unicode_or_utf8_str] = text
-    return text
-
-
-def _get_utf8_or_ascii(a_str,
-                       _encode_utf8=cache_utf8.encode,
-                       _get_cached_ascii=cache_utf8.get_cached_ascii):
-    """Return a cached version of the string.
-
-    cElementTree will return a plain string if the XML is plain ascii. It only
-    returns Unicode when it needs to. We want to work in utf-8 strings. So if
-    cElementTree returns a plain string, we can just return the cached version.
-    If it is Unicode, then we need to encode it.
-
-    :param a_str: An 8-bit string or Unicode as returned by
-                  cElementTree.Element.get()
-    :return: A utf-8 encoded 8-bit string.
-    """
-    # This is fairly optimized because we know what cElementTree does, this is
-    # not meant as a generic function for all cases. Because it is possible for
-    # an 8-bit string to not be ascii or valid utf8.
-    if a_str.__class__ is unicode:
-        return _encode_utf8(a_str)
-    else:
-        return intern(a_str)
-
-
-def _clear_cache():
-    """Clean out the unicode => escaped map"""
-    _to_escaped_map.clear()
-
-
 class Serializer_v8(XMLSerializer):
     """This serialiser adds rich roots.
 
@@ -261,83 +150,11 @@
             reference_revision, symlink_target.
         :return: The inventory as a list of lines.
         """
-        _ensure_utf8_re()
-        self._check_revisions(inv)
         output = []
         append = output.append
         self._append_inventory_root(append, inv)
-        entries = inv.iter_entries()
-        # Skip the root
-        root_path, root_ie = entries.next()
-        for path, ie in entries:
-            if ie.parent_id != self.root_id:
-                parent_str = ' parent_id="'
-                parent_id  = _encode_and_escape(ie.parent_id)
-            else:
-                parent_str = ''
-                parent_id  = ''
-            if ie.kind == 'file':
-                if ie.executable:
-                    executable = ' executable="yes"'
-                else:
-                    executable = ''
-                if not working:
-                    append('<file%s file_id="%s name="%s%s%s revision="%s '
-                        'text_sha1="%s" text_size="%d" />\n' % (
-                        executable, _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name), parent_str, parent_id,
-                        _encode_and_escape(ie.revision), ie.text_sha1,
-                        ie.text_size))
-                else:
-                    append('<file%s file_id="%s name="%s%s%s />\n' % (
-                        executable, _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name), parent_str, parent_id))
-            elif ie.kind == 'directory':
-                if not working:
-                    append('<directory file_id="%s name="%s%s%s revision="%s '
-                        '/>\n' % (
-                        _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name),
-                        parent_str, parent_id,
-                        _encode_and_escape(ie.revision)))
-                else:
-                    append('<directory file_id="%s name="%s%s%s />\n' % (
-                        _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name),
-                        parent_str, parent_id))
-            elif ie.kind == 'symlink':
-                if not working:
-                    append('<symlink file_id="%s name="%s%s%s revision="%s '
-                        'symlink_target="%s />\n' % (
-                        _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name),
-                        parent_str, parent_id,
-                        _encode_and_escape(ie.revision),
-                        _encode_and_escape(ie.symlink_target)))
-                else:
-                    append('<symlink file_id="%s name="%s%s%s />\n' % (
-                        _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name),
-                        parent_str, parent_id))
-            elif ie.kind == 'tree-reference':
-                if ie.kind not in self.supported_kinds:
-                    raise errors.UnsupportedInventoryKind(ie.kind)
-                if not working:
-                    append('<tree-reference file_id="%s name="%s%s%s '
-                        'revision="%s reference_revision="%s />\n' % (
-                        _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name),
-                        parent_str, parent_id,
-                        _encode_and_escape(ie.revision),
-                        _encode_and_escape(ie.reference_revision)))
-                else:
-                    append('<tree-reference file_id="%s name="%s%s%s />\n' % (
-                        _encode_and_escape(ie.file_id),
-                        _encode_and_escape(ie.name),
-                        parent_str, parent_id))
-            else:
-                raise errors.UnsupportedInventoryKind(ie.kind)
-        append('</inventory>\n')
+        serialize_inventory_flat(inv, append,
+            self.root_id, self.supported_kinds, working)
         if f is not None:
             f.writelines(output)
         # Just to keep the cache from growing without bounds
@@ -349,16 +166,16 @@
         """Append the inventory root to output."""
         if inv.revision_id is not None:
             revid1 = ' revision_id="'
-            revid2 = _encode_and_escape(inv.revision_id)
+            revid2 = encode_and_escape(inv.revision_id)
         else:
             revid1 = ""
             revid2 = ""
         append('<inventory format="%s"%s%s>\n' % (
             self.format_num, revid1, revid2))
         append('<directory file_id="%s name="%s revision="%s />\n' % (
-            _encode_and_escape(inv.root.file_id),
-            _encode_and_escape(inv.root.name),
-            _encode_and_escape(inv.root.revision)))
+            encode_and_escape(inv.root.file_id),
+            encode_and_escape(inv.root.name),
+            encode_and_escape(inv.root.revision)))
 
     def _pack_revision(self, rev):
         """Revision object -> xml tree"""
@@ -408,119 +225,19 @@
             prop_elt.tail = '\n'
         top_elt.tail = '\n'
 
+    def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
+        # This is here because it's overridden by xml7
+        return unpack_inventory_entry(elt, entry_cache,
+                return_from_cache)
+
     def _unpack_inventory(self, elt, revision_id=None, entry_cache=None,
                           return_from_cache=False):
         """Construct from XML Element"""
-        if elt.tag != 'inventory':
-            raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
-        format = elt.get('format')
-        if format != self.format_num:
-            raise errors.UnexpectedInventoryFormat('Invalid format version %r'
-                                                   % format)
-        revision_id = elt.get('revision_id')
-        if revision_id is not None:
-            revision_id = cache_utf8.encode(revision_id)
-        inv = inventory.Inventory(root_id=None, revision_id=revision_id)
-        for e in elt:
-            ie = self._unpack_entry(e, entry_cache=entry_cache,
-                                    return_from_cache=return_from_cache)
-            inv.add(ie)
+        inv = unpack_inventory_flat(elt, self.format_num, self._unpack_entry,
+            entry_cache, return_from_cache)
         self._check_cache_size(len(inv), entry_cache)
         return inv
 
-    def _unpack_entry(self, elt, entry_cache=None, return_from_cache=False):
-        elt_get = elt.get
-        file_id = elt_get('file_id')
-        revision = elt_get('revision')
-        # Check and see if we have already unpacked this exact entry
-        # Some timings for "repo.revision_trees(last_100_revs)"
-        #               bzr     mysql
-        #   unmodified  4.1s    40.8s
-        #   using lru   3.5s
-        #   using fifo  2.83s   29.1s
-        #   lru._cache  2.8s
-        #   dict        2.75s   26.8s
-        #   inv.add     2.5s    26.0s
-        #   no_copy     2.00s   20.5s
-        #   no_c,dict   1.95s   18.0s
-        # Note that a cache of 10k nodes is more than sufficient to hold all of
-        # the inventory for the last 100 revs for bzr, but not for mysql (20k
-        # is enough for mysql, which saves the same 2s as using a dict)
-
-        # Breakdown of mysql using time.clock()
-        #   4.1s    2 calls to element.get for file_id, revision_id
-        #   4.5s    cache_hit lookup
-        #   7.1s    InventoryFile.copy()
-        #   2.4s    InventoryDirectory.copy()
-        #   0.4s    decoding unique entries
-        #   1.6s    decoding entries after FIFO fills up
-        #   0.8s    Adding nodes to FIFO (including flushes)
-        #   0.1s    cache miss lookups
-        # Using an LRU cache
-        #   4.1s    2 calls to element.get for file_id, revision_id
-        #   9.9s    cache_hit lookup
-        #   10.8s   InventoryEntry.copy()
-        #   0.3s    cache miss lookus
-        #   1.2s    decoding entries
-        #   1.0s    adding nodes to LRU
-        if entry_cache is not None and revision is not None:
-            key = (file_id, revision)
-            try:
-                # We copy it, because some operations may mutate it
-                cached_ie = entry_cache[key]
-            except KeyError:
-                pass
-            else:
-                # Only copying directory entries drops us 2.85s => 2.35s
-                if return_from_cache:
-                    if cached_ie.kind == 'directory':
-                        return cached_ie.copy()
-                    return cached_ie
-                return cached_ie.copy()
-
-        kind = elt.tag
-        if not InventoryEntry.versionable_kind(kind):
-            raise AssertionError('unsupported entry kind %s' % kind)
-
-        get_cached = _get_utf8_or_ascii
-
-        file_id = get_cached(file_id)
-        if revision is not None:
-            revision = get_cached(revision)
-        parent_id = elt_get('parent_id')
-        if parent_id is not None:
-            parent_id = get_cached(parent_id)
-
-        if kind == 'directory':
-            ie = inventory.InventoryDirectory(file_id,
-                                              elt_get('name'),
-                                              parent_id)
-        elif kind == 'file':
-            ie = inventory.InventoryFile(file_id,
-                                         elt_get('name'),
-                                         parent_id)
-            ie.text_sha1 = elt_get('text_sha1')
-            if elt_get('executable') == 'yes':
-                ie.executable = True
-            v = elt_get('text_size')
-            ie.text_size = v and int(v)
-        elif kind == 'symlink':
-            ie = inventory.InventoryLink(file_id,
-                                         elt_get('name'),
-                                         parent_id)
-            ie.symlink_target = elt_get('symlink_target')
-        else:
-            raise errors.UnsupportedInventoryKind(kind)
-        ie.revision = revision
-        if revision is not None and entry_cache is not None:
-            # We cache a copy() because callers like to mutate objects, and
-            # that would cause the item in cache to mutate as well.
-            # This has a small effect on many-inventory performance, because
-            # the majority fraction is spent in cache hits, not misses.
-            entry_cache[key] = ie.copy()
-
-        return ie
-
     def _unpack_revision(self, elt):
         """XML Element -> Revision object"""
         format = elt.get('format')
@@ -531,7 +248,7 @@
             if format != format_num:
                 raise BzrError("invalid format version %r on revision"
                                 % format)
-        get_cached = _get_utf8_or_ascii
+        get_cached = get_utf8_or_ascii
         rev = Revision(committer = elt.get('committer'),
                        timestamp = float(elt.get('timestamp')),
                        revision_id = get_cached(elt.get('revision_id')),

=== modified file 'bzrlib/xml_serializer.py'
--- a/bzrlib/xml_serializer.py	2010-09-17 04:35:23 +0000
+++ b/bzrlib/xml_serializer.py	2011-12-12 13:47:31 +0000
@@ -50,7 +50,12 @@
     import util.elementtree as elementtree
     from xml.parsers.expat import ExpatError as ParseError
 
-from bzrlib import errors
+from bzrlib import (
+    cache_utf8,
+    inventory,
+    lazy_regex,
+    errors,
+    )
 
 
 class XMLSerializer(Serializer):
@@ -130,3 +135,304 @@
     return re.subn(u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
             lambda match: match.group(0).encode('unicode_escape'),
             message)
+
+
+def get_utf8_or_ascii(a_str, _encode_utf8=cache_utf8.encode):
+    """Return a cached version of the string.
+
+    cElementTree will return a plain string if the XML is plain ascii. It only
+    returns Unicode when it needs to. We want to work in utf-8 strings. So if
+    cElementTree returns a plain string, we can just return the cached version.
+    If it is Unicode, then we need to encode it.
+
+    :param a_str: An 8-bit string or Unicode as returned by
+                  cElementTree.Element.get()
+    :return: A utf-8 encoded 8-bit string.
+    """
+    # This is fairly optimized because we know what cElementTree does, this is
+    # not meant as a generic function for all cases. Because it is possible for
+    # an 8-bit string to not be ascii or valid utf8.
+    if a_str.__class__ is unicode:
+        return _encode_utf8(a_str)
+    else:
+        return intern(a_str)
+
+
+_utf8_re = lazy_regex.lazy_compile('[&<>\'\"]|[\x80-\xff]+')
+_unicode_re = lazy_regex.lazy_compile(u'[&<>\'\"\u0080-\uffff]')
+
+
+_xml_escape_map = {
+    "&":'&amp;',
+    "'":"&apos;", # FIXME: overkill
+    "\"":"&quot;",
+    "<":"&lt;",
+    ">":"&gt;",
+    }
+
+
+def _unicode_escape_replace(match, _map=_xml_escape_map):
+    """Replace a string of non-ascii, non XML safe characters with their escape
+
+    This will escape both Standard XML escapes, like <>"', etc.
+    As well as escaping non ascii characters, because ElementTree did.
+    This helps us remain compatible to older versions of bzr. We may change
+    our policy in the future, though.
+    """
+    # jam 20060816 Benchmarks show that try/KeyError is faster if you
+    # expect the entity to rarely miss. There is about a 10% difference
+    # in overall time. But if you miss frequently, then if None is much
+    # faster. For our use case, we *rarely* have a revision id, file id
+    # or path name that is unicode. So use try/KeyError.
+    try:
+        return _map[match.group()]
+    except KeyError:
+        return "&#%d;" % ord(match.group())
+
+
+def _utf8_escape_replace(match, _map=_xml_escape_map):
+    """Escape utf8 characters into XML safe ones.
+
+    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
+    or it is handling characters with the high-bit set. For ascii characters,
+    we just lookup the replacement in the dictionary. For everything else, we
+    decode back into Unicode, and then use the XML escape code.
+    """
+    try:
+        return _map[match.group()]
+    except KeyError:
+        return ''.join('&#%d;' % ord(uni_chr)
+                       for uni_chr in match.group().decode('utf8'))
+
+
+_to_escaped_map = {}
+
+def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
+    """Encode the string into utf8, and escape invalid XML characters"""
+    # We frequently get entities we have not seen before, so it is better
+    # to check if None, rather than try/KeyError
+    text = _map.get(unicode_or_utf8_str)
+    if text is None:
+        if unicode_or_utf8_str.__class__ is unicode:
+            # The alternative policy is to do a regular UTF8 encoding
+            # and then escape only XML meta characters.
+            # Performance is equivalent once you use cache_utf8. *However*
+            # this makes the serialized texts incompatible with old versions
+            # of bzr. So no net gain. (Perhaps the read code would handle utf8
+            # better than entity escapes, but cElementTree seems to do just fine
+            # either way)
+            text = str(_unicode_re.sub(_unicode_escape_replace,
+                                       unicode_or_utf8_str)) + '"'
+        else:
+            # Plain strings are considered to already be in utf-8 so we do a
+            # slightly different method for escaping.
+            text = _utf8_re.sub(_utf8_escape_replace,
+                                unicode_or_utf8_str) + '"'
+        _map[unicode_or_utf8_str] = text
+    return text
+
+
+def _clear_cache():
+    """Clean out the unicode => escaped map"""
+    _to_escaped_map.clear()
+
+
+def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
+    elt_get = elt.get
+    file_id = elt_get('file_id')
+    revision = elt_get('revision')
+    # Check and see if we have already unpacked this exact entry
+    # Some timings for "repo.revision_trees(last_100_revs)"
+    #               bzr     mysql
+    #   unmodified  4.1s    40.8s
+    #   using lru   3.5s
+    #   using fifo  2.83s   29.1s
+    #   lru._cache  2.8s
+    #   dict        2.75s   26.8s
+    #   inv.add     2.5s    26.0s
+    #   no_copy     2.00s   20.5s
+    #   no_c,dict   1.95s   18.0s
+    # Note that a cache of 10k nodes is more than sufficient to hold all of
+    # the inventory for the last 100 revs for bzr, but not for mysql (20k
+    # is enough for mysql, which saves the same 2s as using a dict)
+
+    # Breakdown of mysql using time.clock()
+    #   4.1s    2 calls to element.get for file_id, revision_id
+    #   4.5s    cache_hit lookup
+    #   7.1s    InventoryFile.copy()
+    #   2.4s    InventoryDirectory.copy()
+    #   0.4s    decoding unique entries
+    #   1.6s    decoding entries after FIFO fills up
+    #   0.8s    Adding nodes to FIFO (including flushes)
+    #   0.1s    cache miss lookups
+    # Using an LRU cache
+    #   4.1s    2 calls to element.get for file_id, revision_id
+    #   9.9s    cache_hit lookup
+    #   10.8s   InventoryEntry.copy()
+    #   0.3s    cache miss lookus
+    #   1.2s    decoding entries
+    #   1.0s    adding nodes to LRU
+    if entry_cache is not None and revision is not None:
+        key = (file_id, revision)
+        try:
+            # We copy it, because some operations may mutate it
+            cached_ie = entry_cache[key]
+        except KeyError:
+            pass
+        else:
+            # Only copying directory entries drops us 2.85s => 2.35s
+            if return_from_cache:
+                if cached_ie.kind == 'directory':
+                    return cached_ie.copy()
+                return cached_ie
+            return cached_ie.copy()
+
+    kind = elt.tag
+    if not inventory.InventoryEntry.versionable_kind(kind):
+        raise AssertionError('unsupported entry kind %s' % kind)
+
+    file_id = get_utf8_or_ascii(file_id)
+    if revision is not None:
+        revision = get_utf8_or_ascii(revision)
+    parent_id = elt_get('parent_id')
+    if parent_id is not None:
+        parent_id = get_utf8_or_ascii(parent_id)
+
+    if kind == 'directory':
+        ie = inventory.InventoryDirectory(file_id,
+                                          elt_get('name'),
+                                          parent_id)
+    elif kind == 'file':
+        ie = inventory.InventoryFile(file_id,
+                                     elt_get('name'),
+                                     parent_id)
+        ie.text_sha1 = elt_get('text_sha1')
+        if elt_get('executable') == 'yes':
+            ie.executable = True
+        v = elt_get('text_size')
+        ie.text_size = v and int(v)
+    elif kind == 'symlink':
+        ie = inventory.InventoryLink(file_id,
+                                     elt_get('name'),
+                                     parent_id)
+        ie.symlink_target = elt_get('symlink_target')
+    else:
+        raise errors.UnsupportedInventoryKind(kind)
+    ie.revision = revision
+    if revision is not None and entry_cache is not None:
+        # We cache a copy() because callers like to mutate objects, and
+        # that would cause the item in cache to mutate as well.
+        # This has a small effect on many-inventory performance, because
+        # the majority fraction is spent in cache hits, not misses.
+        entry_cache[key] = ie.copy()
+
+    return ie
+
+
+def unpack_inventory_flat(elt, format_num, unpack_entry,
+            entry_cache=None, return_from_cache=False):
+    """Unpack a flat XML inventory.
+
+    :param elt: XML element for the inventory
+    :param format_num: Expected format number
+    :param unpack_entry: Function for unpacking inventory entries
+    :return: An inventory
+    :raise UnexpectedInventoryFormat: When unexpected elements or data is
+        encountered
+    """
+    if elt.tag != 'inventory':
+        raise errors.UnexpectedInventoryFormat('Root tag is %r' % elt.tag)
+    format = elt.get('format')
+    if format != format_num:
+        raise errors.UnexpectedInventoryFormat('Invalid format version %r'
+                                               % format)
+    revision_id = elt.get('revision_id')
+    if revision_id is not None:
+        revision_id = cache_utf8.encode(revision_id)
+    inv = inventory.Inventory(root_id=None, revision_id=revision_id)
+    for e in elt:
+        ie = unpack_entry(e, entry_cache, return_from_cache)
+        inv.add(ie)
+    return inv
+
+
+def serialize_inventory_flat(inv, append, root_id, supported_kinds, working):
+    """Serialize an inventory to a flat XML file.
+
+    :param inv: Inventory to serialize
+    :param append: Function for writing a line of output
+    :param working: If True skip history data - text_sha1, text_size,
+        reference_revision, symlink_target.    self._check_revisions(inv)
+    """
+    entries = inv.iter_entries()
+    # Skip the root
+    root_path, root_ie = entries.next()
+    for path, ie in entries:
+        if ie.parent_id != root_id:
+            parent_str = ' parent_id="'
+            parent_id  = encode_and_escape(ie.parent_id)
+        else:
+            parent_str = ''
+            parent_id  = ''
+        if ie.kind == 'file':
+            if ie.executable:
+                executable = ' executable="yes"'
+            else:
+                executable = ''
+            if not working:
+                append('<file%s file_id="%s name="%s%s%s revision="%s '
+                    'text_sha1="%s" text_size="%d" />\n' % (
+                    executable, encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name), parent_str, parent_id,
+                    encode_and_escape(ie.revision), ie.text_sha1,
+                    ie.text_size))
+            else:
+                append('<file%s file_id="%s name="%s%s%s />\n' % (
+                    executable, encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name), parent_str, parent_id))
+        elif ie.kind == 'directory':
+            if not working:
+                append('<directory file_id="%s name="%s%s%s revision="%s '
+                    '/>\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id,
+                    encode_and_escape(ie.revision)))
+            else:
+                append('<directory file_id="%s name="%s%s%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id))
+        elif ie.kind == 'symlink':
+            if not working:
+                append('<symlink file_id="%s name="%s%s%s revision="%s '
+                    'symlink_target="%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id,
+                    encode_and_escape(ie.revision),
+                    encode_and_escape(ie.symlink_target)))
+            else:
+                append('<symlink file_id="%s name="%s%s%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id))
+        elif ie.kind == 'tree-reference':
+            if ie.kind not in supported_kinds:
+                raise errors.UnsupportedInventoryKind(ie.kind)
+            if not working:
+                append('<tree-reference file_id="%s name="%s%s%s '
+                    'revision="%s reference_revision="%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id,
+                    encode_and_escape(ie.revision),
+                    encode_and_escape(ie.reference_revision)))
+            else:
+                append('<tree-reference file_id="%s name="%s%s%s />\n' % (
+                    encode_and_escape(ie.file_id),
+                    encode_and_escape(ie.name),
+                    parent_str, parent_id))
+        else:
+            raise errors.UnsupportedInventoryKind(ie.kind)
+    append('</inventory>\n')