Rev 90: Change the code around again. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
John Arbash Meinel
john at arbash-meinel.com
Tue Mar 3 16:32:50 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
------------------------------------------------------------
revno: 90
revision-id: john at arbash-meinel.com-20090303163107-l4j0114btw2efmjp
parent: john at arbash-meinel.com-20090303160222-4bkou2s65s60h75a
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Tue 2009-03-03 10:31:07 -0600
message:
Change the code around again.
This time, the information about sources is maintained in the DeltaIndex object.
And we pass that info down into create_delta_index, et al.
Next step is to actually combine the delta indexes.
-------------- next part --------------
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx 2009-03-03 15:09:39 +0000
+++ b/_groupcompress_pyx.pyx 2009-03-03 16:31:07 +0000
@@ -24,14 +24,17 @@
void memcpy(void *, void *, size_t)
cdef extern from "delta.h":
+ struct source_info:
+ void *buf
+ unsigned long size
+ unsigned long agg_offset
struct delta_index:
unsigned long memsize
void *src_buf
unsigned long src_size
unsigned int hash_mask
# struct index_entry *hash[]
- delta_index * create_delta_index(void *buf, unsigned long bufsize, unsigned
- long agg_src_offset)
+ delta_index * create_delta_index(source_info *src)
void free_delta_index(delta_index *index)
void *create_delta(delta_index **indexes,
unsigned int num_indexes,
@@ -84,9 +87,11 @@
# isn't performance critical
# cdef readonly list _sources
cdef readonly object _sources
+ cdef source_info *_source_infos
cdef delta_index **_indexes
cdef readonly unsigned int _num_indexes
cdef readonly unsigned int _max_num_indexes
+ cdef readonly unsigned int _max_num_sources
cdef public unsigned long _source_offset
def __repr__(self):
@@ -99,6 +104,9 @@
self._max_num_indexes = 1024
self._indexes = <delta_index**>safe_malloc(sizeof(delta_index*)
* self._max_num_indexes)
+ self._max_num_sources = 1024
+ self._source_infos = <source_info *>safe_malloc(sizeof(source_info)
+ * self._max_num_sources)
self._num_indexes = 0
self._source_offset = 0
@@ -107,6 +115,7 @@
def __dealloc__(self):
self._ensure_no_indexes()
+ safe_free(<void **>&self._source_infos)
def add_source(self, source, unadded_bytes):
"""Add a new bit of source text to the delta indexes.
@@ -118,15 +127,22 @@
cdef char *c_source
cdef Py_ssize_t c_source_size
cdef delta_index *index
+ cdef unsigned int source_location
+ cdef source_info *src
cdef unsigned int num_indexes
- cdef unsigned long agg_src_offset
if not PyString_CheckExact(source):
raise TypeError('source is not a str')
+ source_location = len(self._sources)
+ if source_location >= self._max_num_sources:
+ self._expand_sources()
self._sources.append(source)
c_source = PyString_AS_STRING(source)
c_source_size = PyString_GET_SIZE(source)
+ src = self._source_infos + source_location
+ src.buf = c_source
+ src.size = c_source_size
# TODO: Are usage is ultimately going to be different than the one that
# was originally designed. Specifically, we are going to want to
@@ -134,9 +150,9 @@
# fit just fine into the structure. But for now, we just wrap
# create_delta_index (For example, we could always reserve enough
# space to hash a 4MB string, etc.)
- agg_src_offset = self._source_offset + unadded_bytes
- index = create_delta_index(c_source, c_source_size, agg_src_offset)
- self._source_offset = agg_src_offset + c_source_size
+ src.agg_offset = self._source_offset + unadded_bytes
+ index = create_delta_index(src)
+ self._source_offset = src.agg_offset + src.size
if index != NULL:
num_indexes = self._num_indexes + 1
if num_indexes >= self._max_num_indexes:
@@ -150,6 +166,12 @@
sizeof(delta_index *)
* self._max_num_indexes)
+ cdef _expand_sources(self):
+ self._max_num_sources = self._max_num_sources * 2
+ self._source_infos = <source_info *>safe_realloc(self._source_infos,
+ sizeof(source_info)
+ * self._max_num_sources)
+
cdef _ensure_no_indexes(self):
cdef int i
@@ -192,38 +214,9 @@
def make_delta(source_bytes, target_bytes):
- """Create a delta from source_bytes => target_bytes."""
- cdef char *source
- cdef Py_ssize_t source_size
- cdef char *target
- cdef Py_ssize_t target_size
- cdef delta_index *index
- cdef void * delta
- cdef unsigned long delta_size
- cdef unsigned long max_delta_size
-
- max_delta_size = 0 # Unlimited
-
- if not PyString_CheckExact(source_bytes):
- raise TypeError('source is not a str')
- if not PyString_CheckExact(target_bytes):
- raise TypeError('target is not a str')
-
- source = PyString_AS_STRING(source_bytes)
- source_size = PyString_GET_SIZE(source_bytes)
- target = PyString_AS_STRING(target_bytes)
- target_size = PyString_GET_SIZE(target_bytes)
-
- result = None
- index = create_delta_index(source, source_size, 0)
- if index != NULL:
- delta = create_delta(&index, 1, target, target_size,
- &delta_size, max_delta_size)
- free_delta_index(index);
- if delta:
- result = PyString_FromStringAndSize(<char *>delta, delta_size)
- free(delta)
- return result
+ """Create a delta, this is a wrapper around DeltaIndex.make_delta."""
+ di = DeltaIndex(source_bytes)
+ return di.make_delta(target_bytes)
def apply_delta(source_bytes, delta_bytes):
@@ -341,32 +334,3 @@
# *dst_size = out - dst_buf;
assert (out - dst_buf) == PyString_GET_SIZE(result)
return result
-
-
-def apply_delta2(source_bytes, delta_bytes):
- """Apply a delta generated by make_delta to source_bytes."""
- # This defers to the patch-delta code rather than implementing it here
- # If this is faster, we can bring the memory allocation and error handling
- # into apply_delta(), and leave the primary loop in a separate C func.
- cdef char *source, *delta, *target
- cdef Py_ssize_t source_size, delta_size
- cdef unsigned long target_size
-
- if not PyString_CheckExact(source_bytes):
- raise TypeError('source is not a str')
- if not PyString_CheckExact(delta_bytes):
- raise TypeError('delta is not a str')
-
- source = PyString_AS_STRING(source_bytes)
- source_size = PyString_GET_SIZE(source_bytes)
- delta = PyString_AS_STRING(delta_bytes)
- delta_size = PyString_GET_SIZE(delta_bytes)
-
- target = <char *>patch_delta(source, source_size,
- delta, delta_size,
- &target_size)
- if target == NULL:
- return None
- result = PyString_FromStringAndSize(target, target_size)
- free(target)
- return result
=== modified file 'delta.h'
--- a/delta.h 2009-03-02 18:52:36 +0000
+++ b/delta.h 2009-03-03 16:31:07 +0000
@@ -6,6 +6,13 @@
/* opaque object for delta index */
struct delta_index;
+struct source_info {
+ const void *buf; /* Pointer to the beginning of source data */
+ unsigned long size; /* Total length of source data */
+ unsigned long agg_offset; /* Start of source data as part of the
+ aggregate source */
+};
+
/*
* create_delta_index: compute index data from given buffer
*
@@ -16,8 +23,7 @@
* using free_delta_index().
*/
extern struct delta_index *
-create_delta_index(const void *buf, unsigned long bufsize,
- unsigned long agg_src_offset);
+create_delta_index(const struct source_info *src);
/*
* free_delta_index: free the index created by create_delta_index()
@@ -45,53 +51,19 @@
*/
extern void *
create_delta(struct delta_index **indexes,
- unsigned int num_indexes,
- const void *buf, unsigned long bufsize,
- unsigned long *delta_size, unsigned long max_delta_size);
-
-/*
- * diff_delta: create a delta from source buffer to target buffer
- *
- * If max_delta_size is non-zero and the resulting delta is to be larger
- * than max_delta_size then NULL is returned. On success, a non-NULL
- * pointer to the buffer with the delta data is returned and *delta_size is
- * updated with its size. The returned buffer must be freed by the caller.
- */
-static inline void *
-diff_delta(const void *src_buf, unsigned long src_bufsize,
- const void *trg_buf, unsigned long trg_bufsize,
- unsigned long *delta_size, unsigned long max_delta_size)
-{
- struct delta_index *index = create_delta_index(src_buf, src_bufsize, 0);
- if (index) {
- void *delta = create_delta(&index, 1, trg_buf, trg_bufsize,
- delta_size, max_delta_size);
- free_delta_index(index);
- return delta;
- }
- return NULL;
-}
-
-/*
- * patch_delta: recreate target buffer given source buffer and delta data
- *
- * On success, a non-NULL pointer to the target buffer is returned and
- * *trg_bufsize is updated with its size. On failure a NULL pointer is
- * returned. The returned buffer must be freed by the caller.
- */
-extern void *patch_delta(const void *src_buf, unsigned long src_size,
- const void *delta_buf, unsigned long delta_size,
- unsigned long *dst_size);
+ unsigned int num_indexes,
+ const void *buf, unsigned long bufsize,
+ unsigned long *delta_size, unsigned long max_delta_size);
/* the smallest possible delta size is 4 bytes */
-#define DELTA_SIZE_MIN 4
+#define DELTA_SIZE_MIN 4
/*
* This must be called twice on the delta data buffer, first to get the
* expected source buffer size, and again to get the target buffer size.
*/
static inline unsigned long get_delta_hdr_size(const unsigned char **datap,
- const unsigned char *top)
+ const unsigned char *top)
{
const unsigned char *data = *datap;
unsigned char cmd;
=== modified file 'diff-delta.c'
--- a/diff-delta.c 2009-03-03 16:02:22 +0000
+++ b/diff-delta.c 2009-03-03 16:31:07 +0000
@@ -112,13 +112,6 @@
0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
};
-struct source_info {
- const void *src_buf; /* Pointer to the beginning of source data */
- unsigned long src_size; /* Total length of source data */
- unsigned long agg_src_offset; /* Start of source data as part of the
- aggregate source */
-};
-
struct index_entry {
const unsigned char *ptr;
const struct source_info *src;
@@ -132,7 +125,7 @@
struct delta_index {
unsigned long memsize; /* Total bytes pointed to by this index */
- struct source_info src; /* Information about the referenced source */
+ struct source_info *src; /* Information about the referenced source */
unsigned int hash_mask; /* val & hash_mask gives the hash index for a given
entry */
unsigned int num_entries; /* The total number of entries in this index */
@@ -235,10 +228,8 @@
* into consecutive array entries.
*/
packed_hash[i] = packed_entry;
- for (entry = hash[i]; entry; entry = entry->next, ++packed_entry) {
- *packed_entry = entry->entry;
- packed_entry->src = &index->src;
- }
+ for (entry = hash[i]; entry; entry = entry->next)
+ *packed_entry++ = entry->entry;
}
/* Sentinel value to indicate the length of the last hash bucket */
@@ -249,23 +240,23 @@
}
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
- unsigned long agg_src_offset)
+struct delta_index * create_delta_index(const struct source_info *src)
{
unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
- const unsigned char *data, *buffer = buf;
+ const unsigned char *data, *buffer;
struct delta_index *index;
struct unpacked_index_entry *entry, **hash;
void *mem;
unsigned long memsize;
- if (!buf || !bufsize)
+ if (!src->buf || !src->size)
return NULL;
+ buffer = src->buf;
/* Determine index hash size. Note that indexing skips the
first byte to allow for optimizing the Rabin's polynomial
initialization in create_delta(). */
- entries = (bufsize - 1) / RABIN_WINDOW;
+ entries = (src->size - 1) / RABIN_WINDOW;
hsize = entries / 4;
for (i = 4; (1u << i) < hsize && i < 31; i++);
hsize = 1 << i;
@@ -307,6 +298,7 @@
i = val & hmask;
entry->entry.ptr = data + RABIN_WINDOW;
entry->entry.val = val;
+ entry->entry.src = src;
entry->next = hash[i];
hash[i] = entry++;
hash_count[i]++;
@@ -316,13 +308,11 @@
entries = limit_hash_buckets(hash, hash_count, hsize, entries);
free(hash_count);
index = pack_delta_index(hash, hsize, entries);
+ index->src = src;
free(hash);
if (!index) {
return NULL;
}
- index->src.src_buf = buf;
- index->src.src_size = bufsize;
- index->src.agg_src_offset = agg_src_offset;
return index;
}
@@ -376,10 +366,10 @@
index = indexes[0];
for (j = 0; j < num_indexes; ++j) {
index = indexes[j];
- i += index->src.src_size;
+ i += index->src->size;
}
- assert(i <= index->src.src_size + index->src.agg_src_offset);
- i = index->src.src_size + index->src.agg_src_offset;
+ assert(i <= index->src->size + index->src->agg_offset);
+ i = index->src->size + index->src->agg_offset;
while (i >= 0x80) {
out[outpos++] = i | 0x80;
i >>= 7;
@@ -441,8 +431,8 @@
continue;
ref = entry->ptr;
src = data;
- ref_data = entry->src->src_buf;
- ref_top = ref_data + entry->src->src_size;
+ ref_data = entry->src->buf;
+ ref_top = ref_data + entry->src->size;
ref_size = ref_top - ref;
/* ref_size is the longest possible match that we could make
* here. If ref_size <= msize, then we know that we cannot
@@ -490,7 +480,7 @@
unsigned char *op;
if (inscnt) {
- ref_data = msource->src_buf;
+ ref_data = msource->buf;
while (moff && ref_data[moff-1] == data[-1]) {
/* we can match one byte back */
msize++;
@@ -517,7 +507,7 @@
/* moff is the offset in the local structure, for encoding, we need
* to push it into the global offset
*/
- moff += msource->agg_src_offset;
+ moff += msource->agg_offset;
if (moff & 0x000000ff)
out[outpos++] = moff >> 0, i |= 0x01;
if (moff & 0x0000ff00)
@@ -529,7 +519,7 @@
/* Put it back into local coordinates, in case we have multiple
* copies in a row.
*/
- moff -= msource->agg_src_offset;
+ moff -= msource->agg_offset;
if (msize & 0x00ff)
out[outpos++] = msize >> 0, i |= 0x10;
More information about the bazaar-commits
mailing list