Rev 90: Change the code around again. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

John Arbash Meinel john at arbash-meinel.com
Tue Mar 3 16:32:50 GMT 2009


At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin

------------------------------------------------------------
revno: 90
revision-id: john at arbash-meinel.com-20090303163107-l4j0114btw2efmjp
parent: john at arbash-meinel.com-20090303160222-4bkou2s65s60h75a
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Tue 2009-03-03 10:31:07 -0600
message:
  Change the code around again.
  
  This time, the information about sources is maintained in the DeltaIndex object.
  And we pass that info down into create_delta_index, et al.
  
  Next step is to actually combine the delta indexes.
-------------- next part --------------
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx	2009-03-03 15:09:39 +0000
+++ b/_groupcompress_pyx.pyx	2009-03-03 16:31:07 +0000
@@ -24,14 +24,17 @@
     void memcpy(void *, void *, size_t)
 
 cdef extern from "delta.h":
+    struct source_info:
+        void *buf
+        unsigned long size
+        unsigned long agg_offset
     struct delta_index:
         unsigned long memsize
         void *src_buf
         unsigned long src_size
         unsigned int hash_mask
         # struct index_entry *hash[]
-    delta_index * create_delta_index(void *buf, unsigned long bufsize, unsigned
-                                     long agg_src_offset)
+    delta_index * create_delta_index(source_info *src)
     void free_delta_index(delta_index *index)
     void *create_delta(delta_index **indexes,
              unsigned int num_indexes,
@@ -84,9 +87,11 @@
     # isn't performance critical
     # cdef readonly list _sources
     cdef readonly object _sources
+    cdef source_info *_source_infos
     cdef delta_index **_indexes
     cdef readonly unsigned int _num_indexes
     cdef readonly unsigned int _max_num_indexes
+    cdef readonly unsigned int _max_num_sources
     cdef public unsigned long _source_offset
 
     def __repr__(self):
@@ -99,6 +104,9 @@
         self._max_num_indexes = 1024
         self._indexes = <delta_index**>safe_malloc(sizeof(delta_index*)
                                                    * self._max_num_indexes)
+        self._max_num_sources = 1024
+        self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
+                                                        * self._max_num_sources)
         self._num_indexes = 0
         self._source_offset = 0
 
@@ -107,6 +115,7 @@
 
     def __dealloc__(self):
         self._ensure_no_indexes()
+        safe_free(<void **>&self._source_infos)
 
     def add_source(self, source, unadded_bytes):
         """Add a new bit of source text to the delta indexes.
@@ -118,15 +127,22 @@
         cdef char *c_source
         cdef Py_ssize_t c_source_size
         cdef delta_index *index
+        cdef unsigned int source_location
+        cdef source_info *src
         cdef unsigned int num_indexes
-        cdef unsigned long agg_src_offset
 
         if not PyString_CheckExact(source):
             raise TypeError('source is not a str')
 
+        source_location = len(self._sources)
+        if source_location >= self._max_num_sources:
+            self._expand_sources()
         self._sources.append(source)
         c_source = PyString_AS_STRING(source)
         c_source_size = PyString_GET_SIZE(source)
+        src = self._source_infos + source_location
+        src.buf = c_source
+        src.size = c_source_size
 
         # TODO: Are usage is ultimately going to be different than the one that
         #       was originally designed. Specifically, we are going to want to
@@ -134,9 +150,9 @@
         #       fit just fine into the structure. But for now, we just wrap
         #       create_delta_index (For example, we could always reserve enough
         #       space to hash a 4MB string, etc.)
-        agg_src_offset = self._source_offset + unadded_bytes
-        index = create_delta_index(c_source, c_source_size, agg_src_offset)
-        self._source_offset = agg_src_offset + c_source_size
+        src.agg_offset = self._source_offset + unadded_bytes
+        index = create_delta_index(src)
+        self._source_offset = src.agg_offset + src.size
         if index != NULL:
             num_indexes = self._num_indexes + 1
             if num_indexes >= self._max_num_indexes:
@@ -150,6 +166,12 @@
                                                 sizeof(delta_index *)
                                                 * self._max_num_indexes)
 
+    cdef _expand_sources(self):
+        self._max_num_sources = self._max_num_sources * 2
+        self._source_infos = <source_info *>safe_realloc(self._source_infos,
+                                                sizeof(source_info)
+                                                * self._max_num_sources)
+
     cdef _ensure_no_indexes(self):
         cdef int i
 
@@ -192,38 +214,9 @@
 
 
 def make_delta(source_bytes, target_bytes):
-    """Create a delta from source_bytes => target_bytes."""
-    cdef char *source
-    cdef Py_ssize_t source_size
-    cdef char *target
-    cdef Py_ssize_t target_size
-    cdef delta_index *index
-    cdef void * delta
-    cdef unsigned long delta_size
-    cdef unsigned long max_delta_size
-
-    max_delta_size = 0 # Unlimited
-
-    if not PyString_CheckExact(source_bytes):
-        raise TypeError('source is not a str')
-    if not PyString_CheckExact(target_bytes):
-        raise TypeError('target is not a str')
-
-    source = PyString_AS_STRING(source_bytes)
-    source_size = PyString_GET_SIZE(source_bytes)
-    target = PyString_AS_STRING(target_bytes)
-    target_size = PyString_GET_SIZE(target_bytes)
-
-    result = None
-    index = create_delta_index(source, source_size, 0)
-    if index != NULL:
-        delta = create_delta(&index, 1, target, target_size,
-                             &delta_size, max_delta_size)
-        free_delta_index(index);
-        if delta:
-            result = PyString_FromStringAndSize(<char *>delta, delta_size)
-            free(delta)
-    return result
+    """Create a delta, this is a wrapper around DeltaIndex.make_delta."""
+    di = DeltaIndex(source_bytes)
+    return di.make_delta(target_bytes)
 
 
 def apply_delta(source_bytes, delta_bytes):
@@ -341,32 +334,3 @@
     # *dst_size = out - dst_buf;
     assert (out - dst_buf) == PyString_GET_SIZE(result)
     return result
-
-
-def apply_delta2(source_bytes, delta_bytes):
-    """Apply a delta generated by make_delta to source_bytes."""
-    # This defers to the patch-delta code rather than implementing it here
-    # If this is faster, we can bring the memory allocation and error handling
-    # into apply_delta(), and leave the primary loop in a separate C func.
-    cdef char *source, *delta, *target
-    cdef Py_ssize_t source_size, delta_size
-    cdef unsigned long target_size
-
-    if not PyString_CheckExact(source_bytes):
-        raise TypeError('source is not a str')
-    if not PyString_CheckExact(delta_bytes):
-        raise TypeError('delta is not a str')
-
-    source = PyString_AS_STRING(source_bytes)
-    source_size = PyString_GET_SIZE(source_bytes)
-    delta = PyString_AS_STRING(delta_bytes)
-    delta_size = PyString_GET_SIZE(delta_bytes)
-
-    target = <char *>patch_delta(source, source_size,
-                                 delta, delta_size,
-                                 &target_size)
-    if target == NULL:
-        return None
-    result = PyString_FromStringAndSize(target, target_size)
-    free(target)
-    return result

=== modified file 'delta.h'
--- a/delta.h	2009-03-02 18:52:36 +0000
+++ b/delta.h	2009-03-03 16:31:07 +0000
@@ -6,6 +6,13 @@
 /* opaque object for delta index */
 struct delta_index;
 
+struct source_info {
+	const void *buf; /* Pointer to the beginning of source data */
+	unsigned long size; /* Total length of source data */
+	unsigned long agg_offset; /* Start of source data as part of the
+								 aggregate source */
+};
+
 /*
  * create_delta_index: compute index data from given buffer
  *
@@ -16,8 +23,7 @@
  * using free_delta_index().
  */
 extern struct delta_index *
-create_delta_index(const void *buf, unsigned long bufsize,
-                   unsigned long agg_src_offset);
+create_delta_index(const struct source_info *src);
 
 /*
  * free_delta_index: free the index created by create_delta_index()
@@ -45,53 +51,19 @@
  */
 extern void *
 create_delta(struct delta_index **indexes,
-         unsigned int num_indexes,
-	     const void *buf, unsigned long bufsize,
-	     unsigned long *delta_size, unsigned long max_delta_size);
-
-/*
- * diff_delta: create a delta from source buffer to target buffer
- *
- * If max_delta_size is non-zero and the resulting delta is to be larger
- * than max_delta_size then NULL is returned.  On success, a non-NULL
- * pointer to the buffer with the delta data is returned and *delta_size is
- * updated with its size.  The returned buffer must be freed by the caller.
- */
-static inline void *
-diff_delta(const void *src_buf, unsigned long src_bufsize,
-	   const void *trg_buf, unsigned long trg_bufsize,
-	   unsigned long *delta_size, unsigned long max_delta_size)
-{
-	struct delta_index *index = create_delta_index(src_buf, src_bufsize, 0);
-	if (index) {
-		void *delta = create_delta(&index, 1, trg_buf, trg_bufsize,
-					   delta_size, max_delta_size);
-		free_delta_index(index);
-		return delta;
-	}
-	return NULL;
-}
-
-/*
- * patch_delta: recreate target buffer given source buffer and delta data
- *
- * On success, a non-NULL pointer to the target buffer is returned and
- * *trg_bufsize is updated with its size.  On failure a NULL pointer is
- * returned.  The returned buffer must be freed by the caller.
- */
-extern void *patch_delta(const void *src_buf, unsigned long src_size,
-			 const void *delta_buf, unsigned long delta_size,
-			 unsigned long *dst_size);
+		 unsigned int num_indexes,
+		 const void *buf, unsigned long bufsize,
+		 unsigned long *delta_size, unsigned long max_delta_size);
 
 /* the smallest possible delta size is 4 bytes */
-#define DELTA_SIZE_MIN	4
+#define DELTA_SIZE_MIN  4
 
 /*
  * This must be called twice on the delta data buffer, first to get the
  * expected source buffer size, and again to get the target buffer size.
  */
 static inline unsigned long get_delta_hdr_size(const unsigned char **datap,
-					       const unsigned char *top)
+						   const unsigned char *top)
 {
 	const unsigned char *data = *datap;
 	unsigned char cmd;

=== modified file 'diff-delta.c'
--- a/diff-delta.c	2009-03-03 16:02:22 +0000
+++ b/diff-delta.c	2009-03-03 16:31:07 +0000
@@ -112,13 +112,6 @@
 	0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 };
 
-struct source_info {
-	const void *src_buf; /* Pointer to the beginning of source data */
-	unsigned long src_size; /* Total length of source data */
-	unsigned long agg_src_offset; /* Start of source data as part of the
-									 aggregate source */
-};
-
 struct index_entry {
 	const unsigned char *ptr;
 	const struct source_info *src;
@@ -132,7 +125,7 @@
 
 struct delta_index {
 	unsigned long memsize; /* Total bytes pointed to by this index */
-	struct source_info src; /* Information about the referenced source */
+	struct source_info *src; /* Information about the referenced source */
 	unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
 							   entry */
 	unsigned int num_entries; /* The total number of entries in this index */
@@ -235,10 +228,8 @@
 		 * into consecutive array entries.
 		 */
 		packed_hash[i] = packed_entry;
-		for (entry = hash[i]; entry; entry = entry->next, ++packed_entry) {
-			*packed_entry = entry->entry;
-			packed_entry->src = &index->src;
-		}
+		for (entry = hash[i]; entry; entry = entry->next)
+			*packed_entry++ = entry->entry;
 	}
 
 	/* Sentinel value to indicate the length of the last hash bucket */
@@ -249,23 +240,23 @@
 }
 
 
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
-										unsigned long agg_src_offset)
+struct delta_index * create_delta_index(const struct source_info *src)
 {
 	unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
-	const unsigned char *data, *buffer = buf;
+	const unsigned char *data, *buffer;
 	struct delta_index *index;
 	struct unpacked_index_entry *entry, **hash;
 	void *mem;
 	unsigned long memsize;
 
-	if (!buf || !bufsize)
+	if (!src->buf || !src->size)
 		return NULL;
+	buffer = src->buf;
 
 	/* Determine index hash size.  Note that indexing skips the
 	   first byte to allow for optimizing the Rabin's polynomial
 	   initialization in create_delta(). */
-	entries = (bufsize - 1)  / RABIN_WINDOW;
+	entries = (src->size - 1)  / RABIN_WINDOW;
 	hsize = entries / 4;
 	for (i = 4; (1u << i) < hsize && i < 31; i++);
 	hsize = 1 << i;
@@ -307,6 +298,7 @@
 			i = val & hmask;
 			entry->entry.ptr = data + RABIN_WINDOW;
 			entry->entry.val = val;
+			entry->entry.src = src;
 			entry->next = hash[i];
 			hash[i] = entry++;
 			hash_count[i]++;
@@ -316,13 +308,11 @@
 	entries = limit_hash_buckets(hash, hash_count, hsize, entries);
 	free(hash_count);
 	index = pack_delta_index(hash, hsize, entries);
+	index->src = src;
 	free(hash);
 	if (!index) {
 		return NULL;
 	}
-	index->src.src_buf = buf;
-	index->src.src_size = bufsize;
-	index->src.agg_src_offset = agg_src_offset;
 	return index;
 }
 
@@ -376,10 +366,10 @@
 	index = indexes[0];
 	for (j = 0; j < num_indexes; ++j) {
 		index = indexes[j];
-		i += index->src.src_size;
+		i += index->src->size;
 	}
-	assert(i <= index->src.src_size + index->src.agg_src_offset);
-	i = index->src.src_size + index->src.agg_src_offset;
+	assert(i <= index->src->size + index->src->agg_offset);
+	i = index->src->size + index->src->agg_offset;
 	while (i >= 0x80) {
 		out[outpos++] = i | 0x80;
 		i >>= 7;
@@ -441,8 +431,8 @@
 						continue;
 					ref = entry->ptr;
 					src = data;
-					ref_data = entry->src->src_buf;
-					ref_top = ref_data + entry->src->src_size;
+					ref_data = entry->src->buf;
+					ref_top = ref_data + entry->src->size;
 					ref_size = ref_top - ref;
 					/* ref_size is the longest possible match that we could make
 					 * here. If ref_size <= msize, then we know that we cannot
@@ -490,7 +480,7 @@
 			unsigned char *op;
 
 			if (inscnt) {
-				ref_data = msource->src_buf;
+				ref_data = msource->buf;
 				while (moff && ref_data[moff-1] == data[-1]) {
 					/* we can match one byte back */
 					msize++;
@@ -517,7 +507,7 @@
 			/* moff is the offset in the local structure, for encoding, we need
 			 * to push it into the global offset
 			 */
-			moff += msource->agg_src_offset;
+			moff += msource->agg_offset;
 			if (moff & 0x000000ff)
 				out[outpos++] = moff >> 0,  i |= 0x01;
 			if (moff & 0x0000ff00)
@@ -529,7 +519,7 @@
 			/* Put it back into local coordinates, in case we have multiple
 			 * copies in a row.
 			 */
-			moff -= msource->agg_src_offset;
+			moff -= msource->agg_offset;
 
 			if (msize & 0x00ff)
 				out[outpos++] = msize >> 0, i |= 0x10;



More information about the bazaar-commits mailing list