Rev 2488: First step towards custom parsing. in http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/knit_index_pyrex

Wed May 9 17:49:59 BST 2007

At http://bzr.arbash-meinel.com/branches/bzr/0.17-dev/knit_index_pyrex

------------------------------------------------------------
revno: 2488
revision-id: john at arbash-meinel.com-20070509164944-8dzfnsgdamvujo4b
parent: john at arbash-meinel.com-20070509152053-6wm1c1cg9wstaezq
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: knit_index_pyrex
timestamp: Wed 2007-05-09 11:49:44 -0500
message:
  First step towards custom parsing.
  By steping through the main string instead of splitting into lines
  we save approx 40%
    test_read_50k_index_c          617ms
    test_read_50k_index_c_again    592ms
    test_read_50k_index_py         895ms
    test_read_50k_index_py_again  1038ms
modified:
  bzrlib/knit_c.pyx              knit_c.pyx-20070509143944-u42gy8w387a10m0j-1
-------------- next part --------------
=== modified file 'bzrlib/knit_c.pyx'

--- a/bzrlib/knit_c.pyx	2007-05-09 14:40:42 +0000
+++ b/bzrlib/knit_c.pyx	2007-05-09 16:49:44 +0000
@@ -17,55 +17,214 @@
 """Pyrex extensions to knit parsing."""
 
 
+cdef extern from "stdlib.h":
+    long int strtol(char *nptr, char **endptr, int base)
+    unsigned long int strtoul(char *nptr, char **endptr, int base)
+
+
+cdef extern from "Python.h":
+    int PyDict_CheckExact(object)
+    void *PyDict_GetItem(object p, object key)
+    int PyDict_SetItem(object p, object key, object val) except -1
+
+    int PyList_Append(object lst, object item) except -1
+    void *PyList_GetItem_object_void "PyList_GET_ITEM" (object lst, int index)
+    object PyList_GET_ITEM (object lst, int index)
+    int PyList_CheckExact(object)
+
+    int PyTuple_CheckExact(object)
+    void *PyTuple_GetItem_void_void "PyTuple_GET_ITEM" (void* tpl, int index)
+    object PyTuple_New(int)
+    int PyTuple_SetItem(object tpl, int offset, object val)
+    void PyTuple_SET_ITEM(object tpl, int offset, object val)
+    object PyTuple_Pack(int n, ...)
+
+    char *PyString_AsString(object p)
+    char *PyString_AS_STRING_void "PyString_AS_STRING" (void *p)
+    object PyString_FromString(char *)
+    object PyString_FromStringAndSize(char *, int)
+    int PyString_Size(object p)
+    int PyString_GET_SIZE_void "PyString_GET_SIZE" (void *p)
+    int PyString_CheckExact(object p)
+
+    void Py_INCREF(object)
+    void Py_DECREF(object)
+
+
+cdef extern from "string.h":
+    char *strchr(char *s1, char c)
+    int strncmp(char *s1, char *s2, int len)
+    int strcmp(char *s1, char *s2)
+
+
+cdef class KnitIndexReader:
+
+    cdef object kndx
+    cdef object fp
+
+    cdef object cache
+    cdef object history
+
+    cdef object text
+    cdef char * text_str
+    cdef int text_size
+
+    cdef char * cur_str
+    cdef char * end_str
+
+    cdef int history_len
+
+    def __new__(self, kndx, fp):
+        self.kndx = kndx
+        self.fp = fp
+
+        self.cache = kndx._cache
+        self.history = kndx._history
+        self.text = None
+
+        self.text_str = NULL
+        self.text_size = 0
+        self.cur_str = NULL
+        self.end_str = NULL
+        self.history_len = 0
+
+    cdef void validate(self):
+        if not PyDict_CheckExact(self.cache):
+            raise TypeError('kndx._cache must be a python dict')
+        if not PyList_CheckExact(self.history):
+            raise TypeError('kndx._history must be a python list')
+
+    cdef void process_one_record(self, char *start, char *end):
+        """Take a simple string and split it into an index record."""
+        cdef char *version_id_str
+        cdef int version_id_size
+        cdef char *option_str
+        cdef int option_size
+        cdef char *pos_str
+        cdef int pos
+        cdef char *size_str
+        cdef int size
+        cdef char *parent_str
+        cdef int parent_size
+
+        version_id_str = start
+        option_str = strchr(version_id_str, c' ')
+        if option_str == NULL or option_str >= end:
+            # Short entry
+            return
+        version_id_size = <int>(option_str - version_id_str)
+        # Move past the space character
+        option_str = option_str + 1
+
+        pos_str = strchr(option_str, c' ')
+        if pos_str == NULL or pos_str >= end:
+            # Short entry
+            return
+        option_size = <int>(pos_str - option_str)
+        pos_str = pos_str + 1
+
+        size_str = strchr(pos_str, c' ')
+        if size_str == NULL or size_str >= end:
+            # Short entry
+            return
+        size_str = size_str + 1
+
+        # TODO: Make sure this works when there are no parents
+        parent_str = strchr(size_str, c' ')
+        if parent_str == NULL or parent_str >= end:
+            # Missing parents
+            return
+        parent_str = parent_str + 1
+
+        version_id = PyString_FromStringAndSize(version_id_str,
+                                                version_id_size)
+        options = PyString_FromStringAndSize(option_str, option_size)
+        options = options.split(',')
+
+        pos = strtol(pos_str, NULL, 10)
+        size = strtol(size_str, NULL, 10)
+
+        # TODO: Check that we are actually reading integers
+        parents = PyString_FromStringAndSize(parent_str,
+                                             <int>(end - parent_str))
+        parents = parents.split()
+        real_parents = []
+        for parent in parents:
+            if parent[0].startswith('.'):
+                real_parents.append(parent[1:])
+            else:
+                real_parents.append(self.history[int(parent)])
+
+        if version_id not in self.cache:
+            self.history.append(version_id)
+            index = self.history_len
+            self.history_len = self.history_len + 1
+        else:
+            index = self.cache[version_id][5]
+
+        self.cache[version_id] = (version_id,
+                                  options,
+                                  pos,
+                                  size,
+                                  real_parents,
+                                  index,
+                                 )
+
+    cdef void process_next_record(self):
+        """Process the next record in the file."""
+        cdef char *last
+        cdef char *start
+
+        start = self.cur_str
+        # Find the next newline
+        last = strchr(start, c'\n')
+        if last == NULL:
+            # Process until the end of the file
+            last = self.end_str-1
+            self.cur_str = self.end_str
+            line = PyString_FromStringAndSize(start, <int>(last - start))
+            ending = PyString_FromStringAndSize(last, 1)
+        else:
+            # The last character is right before the '\n'
+            # And the next string is right after it
+            line = PyString_FromStringAndSize(start, <int>(last - start))
+            self.cur_str = last + 1
+            last = last - 1
+            ending = PyString_FromStringAndSize(last, 3)
+
+        if last <= start or last[0] != c':':
+            # Incomplete record
+            return
+
+        self.process_one_record(start, last)
+
+    def read(self):
+        self.validate()
+
+        kndx = self.kndx
+        fp = self.fp
+        cache = self.cache
+        history = self.history
+
+        kndx.check_header(fp)
+
+        # We read the whole thing at once
+        # TODO: jam 2007-05-09 Consider reading incrementally rather than
+        #       having to have the whole thing read up front.
+        #       we already know that calling f.readlines() versus lots of
+        #       f.readline() calls is faster.
+        self.text = fp.read()
+        self.text_str = PyString_AsString(self.text)
+        self.text_size = PyString_Size(self.text)
+        self.cur_str = self.text_str
+        # This points to the last character in the string
+        self.end_str = self.text_str + self.text_size
+
+        while self.cur_str < self.end_str:
+            self.process_next_record()
+
+
 def _load_data_c(kndx, fp):
     """Load the knit index file into memory."""
-    cache = kndx._cache
-    history = kndx._history
-
-    kndx.check_header(fp)
-    # readlines reads the whole file at once:
-    # bad for transports like http, good for local disk
-    # we save 60 ms doing this one change (
-    # from calling readline each time to calling
-    # readlines once.
-    # probably what we want for nice behaviour on
-    # http is a incremental readlines that yields, or
-    # a check for local vs non local indexes,
-    history_top = len(history) - 1
-    for line in fp.readlines():
-        rec = line.split()
-        if len(rec) < 5 or rec[-1] != ':':
-            # corrupt line.
-            # FIXME: in the future we should determine if its a
-            # short write - and ignore it 
-            # or a different failure, and raise. RBC 20060407
-            continue
-
-        parents = []
-        for value in rec[4:-1]:
-            if value[0] == '.':
-                # uncompressed reference
-                parent_id = value[1:]
-            else:
-                parent_id = history[int(value)]
-            parents.append(parent_id)
-
-        version_id, options, pos, size = rec[:4]
-        version_id = version_id
-
-        # See kndx._cache_version
-        # only want the _history index to reference the 1st 
-        # index entry for version_id
-        if version_id not in cache:
-            history_top = history_top + 1
-            index = history_top
-            history.append(version_id)
-        else:
-            index = cache[version_id][5]
-        cache[version_id] = (version_id,
-                             options.split(','),
-                             int(pos),
-                             int(size),
-                             parents,
-                             index)
-        # end kndx._cache_version 
+    reader = KnitIndexReader(kndx, fp)
+    reader.read()