Rev 80: Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 2 21:02:43 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
------------------------------------------------------------
revno: 80
revision-id: john at arbash-meinel.com-20090302210223-9ixutqay7sx8c1n3
parent: john at arbash-meinel.com-20090302202718-c7ojzhft35boi1kn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Mon 2009-03-02 15:02:23 -0600
message:
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
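
[Editor's note: for context, the point of this refactor is that a future
create_delta_index_from_delta can share the bucket-limiting and packing
stages. The following is a hedged sketch of that shape, not code from this
revision: create_delta_index_from_delta does not exist yet at rev 80, and
populate_hash_from_delta is an invented placeholder for the delta-scanning
stage. Only limit_hash_buckets and pack_delta_index below are real.]

struct delta_index *
create_delta_index_from_delta(const void *delta, unsigned long delta_size,
                              unsigned long agg_src_offset)
{
    unsigned int hsize, entries, *hash_count;
    struct unpacked_index_entry **hash;
    struct delta_index *index;

    /* Stage 1 (elided): scan the insert opcodes of 'delta' and chain
     * RABIN_WINDOW-sized blocks into hash/hash_count, much as
     * create_delta_index does for a raw source buffer.
     * populate_hash_from_delta is a hypothetical helper, not a real
     * function in diff-delta.c. */
    if (populate_hash_from_delta(delta, delta_size,
                                 &hash, &hash_count, &hsize, &entries) < 0)
        return NULL;

    /* Stages 2 and 3: the helpers this commit factors out, reused
     * verbatim. */
    entries = limit_hash_buckets(hash, hash_count, hsize, entries);
    free(hash_count);
    index = pack_delta_index(hash, hsize, entries);
    free(hash);
    if (!index)
        return NULL;
    index->src_buf = delta;
    index->src_size = delta_size;
    index->agg_src_offset = agg_src_offset;
    return index;
}

Only the population stage would differ between the two entry points; the
limiting and packing stages are reused unchanged, which is exactly what
the split in the patch below makes possible.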
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx 2009-03-02 19:36:29 +0000
+++ b/_groupcompress_pyx.pyx 2009-03-02 21:02:23 +0000
@@ -85,7 +85,7 @@
cdef delta_index **_indexes
cdef readonly unsigned int _num_indexes
cdef readonly unsigned int _max_num_indexes
- cdef readonly unsigned long _source_offset
+ cdef public unsigned long _source_offset
def __repr__(self):
return '%s(%d, %d, %d)' % (self.__class__.__name__,
=== modified file 'diff-delta.c'
--- a/diff-delta.c 2009-03-02 20:27:18 +0000
+++ b/diff-delta.c 2009-03-02 21:02:23 +0000
@@ -133,71 +133,13 @@
struct index_entry *hash[];
};
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
- unsigned long agg_src_offset)
+static unsigned int
+limit_hash_buckets(struct unpacked_index_entry **hash,
+ unsigned int *hash_count, unsigned int hsize,
+ unsigned int entries)
{
- unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
- const unsigned char *data, *buffer = buf;
- struct delta_index *index;
- struct unpacked_index_entry *entry, **hash;
- struct index_entry *packed_entry, **packed_hash;
- void *mem;
- unsigned long memsize;
-
- if (!buf || !bufsize)
- return NULL;
-
- /* Determine index hash size. Note that indexing skips the
- first byte to allow for optimizing the Rabin's polynomial
- initialization in create_delta(). */
- entries = (bufsize - 1) / RABIN_WINDOW;
- hsize = entries / 4;
- for (i = 4; (1u << i) < hsize && i < 31; i++);
- hsize = 1 << i;
- hmask = hsize - 1;
-
- /* allocate lookup index */
- memsize = sizeof(*hash) * hsize +
- sizeof(*entry) * entries;
- mem = malloc(memsize);
- if (!mem)
- return NULL;
- hash = mem;
- mem = hash + hsize;
- entry = mem;
-
- memset(hash, 0, hsize * sizeof(*hash));
-
- /* allocate an array to count hash entries */
- hash_count = calloc(hsize, sizeof(*hash_count));
- if (!hash_count) {
- free(hash);
- return NULL;
- }
-
- /* then populate the index */
- prev_val = ~0;
- for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
- data >= buffer;
- data -= RABIN_WINDOW) {
- unsigned int val = 0;
- for (i = 1; i <= RABIN_WINDOW; i++)
- val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
- if (val == prev_val) {
- /* keep the lowest of consecutive identical blocks */
- entry[-1].entry.ptr = data + RABIN_WINDOW;
- --entries;
- } else {
- prev_val = val;
- i = val & hmask;
- entry->entry.ptr = data + RABIN_WINDOW;
- entry->entry.val = val;
- entry->next = hash[i];
- hash[i] = entry++;
- hash_count[i]++;
- }
- }
-
+ struct unpacked_index_entry *entry;
+ unsigned int i;
/*
* Determine a limit on the number of entries in the same hash
* bucket. This guards us against pathological data sets causing
@@ -247,8 +189,18 @@
entry = entry->next;
} while (entry);
}
- free(hash_count);
+ return entries;
+}
+static struct delta_index *
+pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
+ unsigned int entries)
+{
+ unsigned int i, memsize;
+ struct unpacked_index_entry *entry;
+ struct delta_index *index;
+ struct index_entry *packed_entry, **packed_hash;
+ void *mem;
/*
* Now create the packed index in array form
* rather than linked lists.
@@ -258,16 +210,12 @@
+ sizeof(*packed_entry) * entries;
mem = malloc(memsize);
if (!mem) {
- free(hash);
return NULL;
}
index = mem;
index->memsize = memsize;
- index->src_buf = buf;
- index->src_size = bufsize;
- index->agg_src_offset = agg_src_offset;
- index->hash_mask = hmask;
+ index->hash_mask = hsize - 1;
mem = index->hash;
packed_hash = mem;
@@ -288,8 +236,84 @@
packed_hash[hsize] = packed_entry;
assert(packed_entry - (struct index_entry *)mem == entries);
- free(hash);
-
+ return index;
+}
+
+
+struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
+ unsigned long agg_src_offset)
+{
+ unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
+ const unsigned char *data, *buffer = buf;
+ struct delta_index *index;
+ struct unpacked_index_entry *entry, **hash;
+ void *mem;
+ unsigned long memsize;
+
+ if (!buf || !bufsize)
+ return NULL;
+
+ /* Determine index hash size. Note that indexing skips the
+ first byte to allow for optimizing the Rabin's polynomial
+ initialization in create_delta(). */
+ entries = (bufsize - 1) / RABIN_WINDOW;
+ hsize = entries / 4;
+ for (i = 4; (1u << i) < hsize && i < 31; i++);
+ hsize = 1 << i;
+ hmask = hsize - 1;
+
+ /* allocate lookup index */
+ memsize = sizeof(*hash) * hsize +
+ sizeof(*entry) * entries;
+ mem = malloc(memsize);
+ if (!mem)
+ return NULL;
+ hash = mem;
+ mem = hash + hsize;
+ entry = mem;
+
+ memset(hash, 0, hsize * sizeof(*hash));
+
+ /* allocate an array to count hash entries */
+ hash_count = calloc(hsize, sizeof(*hash_count));
+ if (!hash_count) {
+ free(hash);
+ return NULL;
+ }
+
+ /* then populate the index */
+ prev_val = ~0;
+ for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+ data >= buffer;
+ data -= RABIN_WINDOW) {
+ unsigned int val = 0;
+ for (i = 1; i <= RABIN_WINDOW; i++)
+ val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
+ if (val == prev_val) {
+ /* keep the lowest of consecutive identical blocks */
+ entry[-1].entry.ptr = data + RABIN_WINDOW;
+ --entries;
+ } else {
+ prev_val = val;
+ i = val & hmask;
+ entry->entry.ptr = data + RABIN_WINDOW;
+ entry->entry.val = val;
+ entry->next = hash[i];
+ hash[i] = entry++;
+ hash_count[i]++;
+ }
+ }
+
+ entries = limit_hash_buckets(hash, hash_count, hsize, entries);
+ free(hash_count);
+ index = pack_delta_index(hash, hsize, entries);
+ if (!index) {
+ free(hash);
+ return NULL;
+ }
+ index->src_buf = buf;
+ index->src_size = bufsize;
+ index->agg_src_offset = agg_src_offset;
return index;
}
@@ -326,6 +350,8 @@
if (!trg_buf || !trg_size)
return NULL;
+ if (num_indexes == 0)
+ return NULL;
outpos = 0;
outsize = 8192;
@@ -337,6 +363,7 @@
/* store reference buffer size */
i = 0;
+ index = indexes[0];
for (j = 0; j < num_indexes; ++j) {
index = indexes[j];
i += index->src_size;
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2009-03-02 20:16:09 +0000
+++ b/groupcompress.py 2009-03-02 21:02:23 +0000
@@ -178,6 +178,10 @@
new_chunks.insert(0, 'fulltext\n')
new_chunks.append('len:%s\n' % (input_len,))
unadded_bytes = sum(map(len, new_chunks))
+ deltas_unadded = (self.endpoint - self._delta_index._source_offset)
+ if deltas_unadded != 0:
+ import pdb; pdb.set_trace()
+ unadded_bytes += deltas_unadded
self._delta_index.add_source(target_text, unadded_bytes)
new_chunks.append(target_text)
else:
@@ -186,9 +190,10 @@
else:
new_chunks.insert(0, 'delta\n')
new_chunks.append('len:%s\n' % (len(delta),))
- unadded_bytes = sum(map(len, new_chunks))
- self._delta_index.add_source(delta, unadded_bytes)
+ # self._delta_index.add_source(delta, unadded_bytes)
new_chunks.append(delta)
+ unadded_bytes = sum(map(len, new_chunks))
+ self._delta_index._source_offset += unadded_bytes
delta_start = (self.endpoint, len(self.lines))
self.output_chunks(new_chunks)
self.input_bytes += input_len
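
[Editor's note: to make the populate/limit/pack split concrete outside the
patch, here is a small self-contained toy under assumed names; it is not
the real diff-delta code. A chained hash of ints is built, then capped and
flattened by two standalone helpers, mirroring what limit_hash_buckets and
pack_delta_index now do for index entries.]

#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

/* Stage 2 (cf. limit_hash_buckets): cap each bucket's chain at 'limit'
 * entries by cutting the chain, and return the updated entry count. */
static unsigned int limit_buckets(struct node **hash, unsigned int hsize,
                                  unsigned int limit, unsigned int entries)
{
    unsigned int i;
    for (i = 0; i < hsize; i++) {
        unsigned int kept = 0;
        struct node *e;
        for (e = hash[i]; e; e = e->next) {
            if (++kept == limit && e->next) {
                struct node *drop;
                for (drop = e->next; drop; drop = drop->next)
                    entries--;          /* account for what we cut off */
                e->next = NULL;
                break;
            }
        }
    }
    return entries;
}

/* Stage 3 (cf. pack_delta_index): flatten the surviving chains into one
 * contiguous array plus a bucket-start table with a sentinel at the end,
 * like packed_hash[hsize] in the real code. */
static int *pack_buckets(struct node **hash, unsigned int hsize,
                         unsigned int entries, unsigned int *starts)
{
    int *flat = malloc((entries ? entries : 1) * sizeof(*flat));
    unsigned int i, pos = 0;
    struct node *e;

    if (!flat)
        return NULL;
    for (i = 0; i < hsize; i++) {
        starts[i] = pos;
        for (e = hash[i]; e; e = e->next)
            flat[pos++] = e->val;
    }
    starts[hsize] = pos;
    return flat;
}

int main(void)
{
    enum { HSIZE = 4, N = 16, LIMIT = 3 };
    struct node pool[N], *hash[HSIZE] = {0};
    unsigned int starts[HSIZE + 1];
    unsigned int i, entries = N;
    int *flat;

    /* Stage 1: populate.  In diff-delta.c this is the Rabin-window scan;
     * here each int is simply chained into bucket i % HSIZE. */
    for (i = 0; i < N; i++) {
        pool[i].val = (int)i;
        pool[i].next = hash[i % HSIZE];
        hash[i % HSIZE] = &pool[i];
    }

    entries = limit_buckets(hash, HSIZE, LIMIT, entries);
    flat = pack_buckets(hash, HSIZE, entries, starts);
    if (!flat)
        return 1;
    for (i = 0; i < entries; i++)
        printf("%d ", flat[i]);
    printf("\n(%u entries kept of %u)\n", entries, (unsigned)N);
    free(flat);
    return 0;
}

With the stages separated like this, a second front end, say one that
hashes only the insert regions of an existing delta, only needs to supply
stage 1.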