[PATCH][MERGE] Improvements to is_ignored, take 2

Thu Jan 12 22:47:41 GMT 2006

Hello,

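I have improved on the is_ignored enhancements. The is_ignored_by method
should now work even when there are more than 100 patterns, because the
patterns are compiled in batches of 50, each batch into its own regular
expression. No test cases have been added to check that yet, though.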

The branch is at: <http://www.ucw.cz/~bulb/bzr/bzr.ignore/>. Please review
and comment. The diff follows again below.

=== modified file 'bzrlib/add.py'

--- bzrlib/add.py	
+++ bzrlib/add.py	
@@ -155,7 +155,7 @@
                 if subf == bzrlib.BZRDIR:
                     mutter("skip control directory %r", subp)
                 else:
-                    ignore_glob = tree.is_ignored(subp)
+                    ignore_glob = tree.is_ignored_by(subp)
                     if ignore_glob is not None:
                         mutter("skip ignored sub-file %r", subp)
                         if ignore_glob not in ignored:

=== modified file 'bzrlib/builtins.py'
--- bzrlib/builtins.py	
+++ bzrlib/builtins.py	
@@ -1135,7 +1135,7 @@
             if file_class != 'I':
                 continue
             ## XXX: Slightly inefficient since this was already calculated
-            pat = tree.is_ignored(path)
+            pat = tree.is_ignored_by(path)
             print '%-50s %s' % (path, pat)
 
 

=== modified file 'bzrlib/workingtree.py'
--- bzrlib/workingtree.py	
+++ bzrlib/workingtree.py	
@@ -113,6 +113,45 @@
     return gen_file_id('TREE_ROOT')
 
 
+def _glob_to_regex(pat):
+    r"""Convert a unix glob to regular expression.
+    
+    Patterns containing '/' or '\' need to match whole path; others match
+    against only the last component - as per requirement of
+    WorkingTree.is_ignored().
+    
+    Pattern is returned as string.
+    """
+    # TODO: For now we use fnmatch.translate, which is Broken(tm). New,
+    # correct, translator that would handle '**' for matching paths and other
+    # extended globbing stuff from cvs/rsync should be implemented.
+
+    # XXX: Shouldn't the globs be actually UNICODE?
+
+    # XXX: fnmatch is actually not quite what we want: it's only
+    # approximately the same as real Unix fnmatch, and doesn't
+    # treat dotfiles correctly and allows * to match /.
+    # Eventually it should be replaced with something more
+    # accurate.
+        
+    if '/' in pat or '\\' in pat:
+        if (pat[:2] == './') or (pat[:2] == '.\\'):
+            pat = pat[2:]
+        return fnmatch.translate(pat)
+    else:
+        # XXX: Is the path normalized? Should we match [/\\] ?
+        return '(?:.*/)?' + fnmatch.translate(pat)
+
+
+def _glob_list_to_regex(pats, wrap='(?:%s)'):
+    """Convert a list of unix globs to a regular expression.
+
+    The pattern is returned as string. The wrap is % format applied to each
+    individual glob pattern. It has to apply group.
+    """
+    return '|'.join([wrap % _glob_to_regex(x) for x in pats])
+
+
 class TreeEntry(object):
     """An entry that implements the minium interface used by commands.
 
@@ -712,11 +751,10 @@
 
 
     def ignored_files(self):
-        """Yield list of PATH, IGNORE_PATTERN"""
+        """Yield list of paths"""
         for subp in self.extras():
-            pat = self.is_ignored(subp)
-            if pat != None:
-                yield subp, pat
+            if self.is_ignored(subp):
+                yield subp
 
 
     def get_ignore_list(self):
@@ -734,6 +772,36 @@
         self._ignorelist = l
         return l
 
+    def _get_ignore_regex(self):
+        """Return a regular expression composed of ignore patterns.
+
+        Cached in the Tree object after the first call.
+        """
+        import re
+        if not hasattr(self, '_ignoreregex'):
+            self._ignoreregex = re.compile(
+                    _glob_list_to_regex(self.get_ignore_list()))
+        return self._ignoreregex
+
+    def _get_ignore_by_regex_list(self):
+        """Return regex list for is_ignored_by method.
+
+        Cached in the Tree object after the first call.
+
+        The return is a list of lists, each having pattern as the first
+        element, followed by list of globs it is composed from.
+        """
+        import re
+        if not hasattr(self, '_ignore_by_regex_list'):
+            pats = list(self.get_ignore_list()) # So we can shift...
+            self._ignore_by_regex_list = []
+            while pats:
+                self._ignore_by_regex_list.append(
+                        [re.compile(_glob_list_to_regex(pats[0:50],
+                                    wrap='(%s)'))]
+                        + pats[0:50])
+                pats[0:50] = ()
+        return self._ignore_by_regex_list
 
     def is_ignored(self, filename):
         r"""Check whether the filename matches an ignore pattern.
@@ -741,37 +809,27 @@
         Patterns containing '/' or '\' need to match the whole path;
         others match against only the last component.
 
-        If the file is ignored, returns the pattern which caused it to
-        be ignored, otherwise None.  So this can simply be used as a
-        boolean if desired."""
-
-        # TODO: Use '**' to match directories, and other extended
-        # globbing stuff from cvs/rsync.
-
-        # XXX: fnmatch is actually not quite what we want: it's only
-        # approximately the same as real Unix fnmatch, and doesn't
-        # treat dotfiles correctly and allows * to match /.
-        # Eventually it should be replaced with something more
-        # accurate.
-        
-        for pat in self.get_ignore_list():
-            if '/' in pat or '\\' in pat:
-                
-                # as a special case, you can put ./ at the start of a
-                # pattern; this is good to match in the top-level
-                # only;
-                
-                if (pat[:2] == './') or (pat[:2] == '.\\'):
-                    newpat = pat[2:]
-                else:
-                    newpat = pat
-                if fnmatch.fnmatchcase(filename, newpat):
-                    return pat
-            else:
-                if fnmatch.fnmatchcase(splitpath(filename)[-1], pat):
-                    return pat
-        else:
-            return None
+        If the file is ignored, returns a match object, otherwise None. So
+        this can simply be used as a boolean if desired. The match object is
+        really not very useful, because the individual patterns are not
+        captured.
+        """
+        pat = self._get_ignore_regex()
+        return pat.match(filename)
+
+    def is_ignored_by(self, filename):
+        r"""Check whether the filename matches and return the pattern it matches.
+
+        This method is similar to is_ignored, but makes the extra effort to
+        return the pattern that matched.
+        """
+
+        pats = self._get_ignore_by_regex_list()
+        for pat in pats:
+            m = pat[0].match(filename)
+            if m:
+                return pat[m.lastindex]
+        return None
 
     def kind(self, file_id):
         return file_kind(self.id2abspath(file_id))


-- 
						 Jan 'Bulb' Hudec <bulb at ucw.cz>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url : https://lists.ubuntu.com/archives/bazaar/attachments/20060112/b7a6efce/attachment.pgp