Rev 80: Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta. in http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
John Arbash Meinel
john at arbash-meinel.com
Mon Mar 2 21:02:43 GMT 2009
At http://bazaar.launchpad.net/%7Ebzr/bzr-groupcompress/rabin
------------------------------------------------------------
revno: 80
revision-id: john at arbash-meinel.com-20090302210223-9ixutqay7sx8c1n3
parent: john at arbash-meinel.com-20090302202718-c7ojzhft35boi1kn
committer: John Arbash Meinel <john at arbash-meinel.com>
branch nick: rabin
timestamp: Mon 2009-03-02 15:02:23 -0600
message:
Refactor the code a bit, so that I can re-use bits for a create_delta_index_from_delta.
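
[Editor's note: for context, the point of this refactor is that a future
create_delta_index_from_delta can share the bucket-limiting and packing
stages. The following is a hedged sketch of that shape, not code from this
revision: create_delta_index_from_delta does not exist yet at rev 80, and
populate_hash_from_delta is an invented placeholder for the delta-scanning
stage. Only limit_hash_buckets and pack_delta_index below are real.]

struct delta_index *
create_delta_index_from_delta(const void *delta, unsigned long delta_size,
                              unsigned long agg_src_offset)
{
    unsigned int hsize, entries, *hash_count;
    struct unpacked_index_entry **hash;
    struct delta_index *index;

    /* Stage 1 (elided): scan the insert opcodes of 'delta' and chain
     * RABIN_WINDOW-sized blocks into hash/hash_count, much as
     * create_delta_index does for a raw source buffer.
     * populate_hash_from_delta is a hypothetical helper, not a real
     * function in diff-delta.c. */
    if (populate_hash_from_delta(delta, delta_size,
                                 &hash, &hash_count, &hsize, &entries) < 0)
        return NULL;

    /* Stages 2 and 3: the helpers this commit factors out, reused
     * verbatim. */
    entries = limit_hash_buckets(hash, hash_count, hsize, entries);
    free(hash_count);
    index = pack_delta_index(hash, hsize, entries);
    free(hash);
    if (!index)
        return NULL;
    index->src_buf = delta;
    index->src_size = delta_size;
    index->agg_src_offset = agg_src_offset;
    return index;
}

Only the population stage would differ between the two entry points; the
limiting and packing stages are reused unchanged, which is exactly what
the split in the patch below makes possible.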
=== modified file '_groupcompress_pyx.pyx'
--- a/_groupcompress_pyx.pyx 2009-03-02 19:36:29 +0000
+++ b/_groupcompress_pyx.pyx 2009-03-02 21:02:23 +0000
@@ -85,7 +85,7 @@
cdef delta_index **_indexes
cdef readonly unsigned int _num_indexes
cdef readonly unsigned int _max_num_indexes
- cdef readonly unsigned long _source_offset
+ cdef public unsigned long _source_offset
def __repr__(self):
return '%s(%d, %d, %d)' % (self.__class__.__name__,
=== modified file 'diff-delta.c'
--- a/diff-delta.c 2009-03-02 20:27:18 +0000
+++ b/diff-delta.c 2009-03-02 21:02:23 +0000
@@ -133,71 +133,13 @@
struct index_entry *hash[];
};
-struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
- unsigned long agg_src_offset)
+static unsigned int
+limit_hash_buckets(struct unpacked_index_entry **hash,
+ unsigned int *hash_count, unsigned int hsize,
+ unsigned int entries)
{
- unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
- const unsigned char *data, *buffer = buf;
- struct delta_index *index;
- struct unpacked_index_entry *entry, **hash;
- struct index_entry *packed_entry, **packed_hash;
- void *mem;
- unsigned long memsize;
-
- if (!buf || !bufsize)
- return NULL;
-
- /* Determine index hash size. Note that indexing skips the
- first byte to allow for optimizing the Rabin's polynomial
- initialization in create_delta(). */
- entries = (bufsize - 1) / RABIN_WINDOW;
- hsize = entries / 4;
- for (i = 4; (1u << i) < hsize && i < 31; i++);
- hsize = 1 << i;
- hmask = hsize - 1;
-
- /* allocate lookup index */
- memsize = sizeof(*hash) * hsize +
- sizeof(*entry) * entries;
- mem = malloc(memsize);
- if (!mem)
- return NULL;
- hash = mem;
- mem = hash + hsize;
- entry = mem;
-
- memset(hash, 0, hsize * sizeof(*hash));
-
- /* allocate an array to count hash entries */
- hash_count = calloc(hsize, sizeof(*hash_count));
- if (!hash_count) {
- free(hash);
- return NULL;
- }
-
- /* then populate the index */
- prev_val = ~0;
- for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
- data >= buffer;
- data -= RABIN_WINDOW) {
- unsigned int val = 0;
- for (i = 1; i <= RABIN_WINDOW; i++)
- val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
- if (val == prev_val) {
- /* keep the lowest of consecutive identical blocks */
- entry[-1].entry.ptr = data + RABIN_WINDOW;
- --entries;
- } else {
- prev_val = val;
- i = val & hmask;
- entry->entry.ptr = data + RABIN_WINDOW;
- entry->entry.val = val;
- entry->next = hash[i];
- hash[i] = entry++;
- hash_count[i]++;
- }
- }
-
+ struct unpacked_index_entry *entry;
+ unsigned int i;
/*
* Determine a limit on the number of entries in the same hash
* bucket. This guards us against pathological data sets causing
@@ -247,8 +189,18 @@
entry = entry->next;
} while (entry);
}
- free(hash_count);
+ return entries;
+}
+static struct delta_index *
+pack_delta_index(struct unpacked_index_entry **hash, unsigned int hsize,
+ unsigned int entries)
+{
+ unsigned int i, memsize;
+ struct unpacked_index_entry *entry;
+ struct delta_index *index;
+ struct index_entry *packed_entry, **packed_hash;
+ void *mem;
/*
* Now create the packed index in array form
* rather than linked lists.
@@ -258,16 +210,12 @@
+ sizeof(*packed_entry) * entries;
mem = malloc(memsize);
if (!mem) {
- free(hash);
return NULL;
}
index = mem;
index->memsize = memsize;
- index->src_buf = buf;
- index->src_size = bufsize;
- index->agg_src_offset = agg_src_offset;
- index->hash_mask = hmask;
+ index->hash_mask = hsize - 1;
mem = index->hash;
packed_hash = mem;
@@ -288,8 +236,84 @@
packed_hash[hsize] = packed_entry;
assert(packed_entry - (struct index_entry *)mem == entries);
- free(hash);
-
+ return index;
+}
+
+
+struct delta_index * create_delta_index(const void *buf, unsigned long bufsize,
+ unsigned long agg_src_offset)
+{
+ unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
+ const unsigned char *data, *buffer = buf;
+ struct delta_index *index;
+ struct unpacked_index_entry *entry, **hash;
+ void *mem;
+ unsigned long memsize;
+
+ if (!buf || !bufsize)
+ return NULL;
+
+ /* Determine index hash size. Note that indexing skips the
+ first byte to allow for optimizing the Rabin's polynomial
+ initialization in create_delta(). */
+ entries = (bufsize - 1) / RABIN_WINDOW;
+ hsize = entries / 4;
+ for (i = 4; (1u << i) < hsize && i < 31; i++);
+ hsize = 1 << i;
+ hmask = hsize - 1;
+
+ /* allocate lookup index */
+ memsize = sizeof(*hash) * hsize +
+ sizeof(*entry) * entries;
+ mem = malloc(memsize);
+ if (!mem)
+ return NULL;
+ hash = mem;
+ mem = hash + hsize;
+ entry = mem;
+
+ memset(hash, 0, hsize * sizeof(*hash));
+
+ /* allocate an array to count hash entries */
+ hash_count = calloc(hsize, sizeof(*hash_count));
+ if (!hash_count) {
+ free(hash);
+ return NULL;
+ }
+
+ /* then populate the index */
+ prev_val = ~0;
+ for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
+ data >= buffer;
+ data -= RABIN_WINDOW) {
+ unsigned int val = 0;
+ for (i = 1; i <= RABIN_WINDOW; i++)
+ val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
+ if (val == prev_val) {
+ /* keep the lowest of consecutive identical blocks */
+ entry[-1].entry.ptr = data + RABIN_WINDOW;
+ --entries;
+ } else {
+ prev_val = val;
+ i = val & hmask;
+ entry->entry.ptr = data + RABIN_WINDOW;
+ entry->entry.val = val;
+ entry->next = hash[i];
+ hash[i] = entry++;
+ hash_count[i]++;
+ }
+ }
+
+ entries = limit_hash_buckets(hash, hash_count, hsize, entries);
+ free(hash_count);
+ index = pack_delta_index(hash, hsize, entries);
+ if (!index) {
+ free(hash);
+ return NULL;
+ }
+ index->src_buf = buf;
+ index->src_size = bufsize;
+ index->agg_src_offset = agg_src_offset;
return index;
}
@@ -326,6 +350,8 @@
if (!trg_buf || !trg_size)
return NULL;
+ if (num_indexes == 0)
+ return NULL;
outpos = 0;
outsize = 8192;
@@ -337,6 +363,7 @@
/* store reference buffer size */
i = 0;
+ index = indexes[0];
for (j = 0; j < num_indexes; ++j) {
index = indexes[j];
i += index->src_size;
=== modified file 'groupcompress.py'
--- a/groupcompress.py 2009-03-02 20:16:09 +0000
+++ b/groupcompress.py 2009-03-02 21:02:23 +0000
@@ -178,6 +178,10 @@
new_chunks.insert(0, 'fulltext\n')
new_chunks.append('len:%s\n' % (input_len,))
unadded_bytes = sum(map(len, new_chunks))
+ deltas_unadded = (self.endpoint - self._delta_index._source_offset)
+ if deltas_unadded != 0:
+ import pdb; pdb.set_trace()
+ unadded_bytes += deltas_unadded
self._delta_index.add_source(target_text, unadded_bytes)
new_chunks.append(target_text)
else:
@@ -186,9 +190,10 @@
else:
new_chunks.insert(0, 'delta\n')
new_chunks.append('len:%s\n' % (len(delta),))
- unadded_bytes = sum(map(len, new_chunks))
- self._delta_index.add_source(delta, unadded_bytes)
+ # self._delta_index.add_source(delta, unadded_bytes)
new_chunks.append(delta)
+ unadded_bytes = sum(map(len, new_chunks))
+ self._delta_index._source_offset += unadded_bytes
delta_start = (self.endpoint, len(self.lines))
self.output_chunks(new_chunks)
self.input_bytes += input_len
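
[Editor's note: to make the populate/limit/pack split concrete outside the
patch, here is a small self-contained toy under assumed names; it is not
the real diff-delta code. A chained hash of ints is built, then capped and
flattened by two standalone helpers, mirroring what limit_hash_buckets and
pack_delta_index now do for index entries.]

#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

/* Stage 2 (cf. limit_hash_buckets): cap each bucket's chain at 'limit'
 * entries by cutting the chain, and return the updated entry count. */
static unsigned int limit_buckets(struct node **hash, unsigned int hsize,
                                  unsigned int limit, unsigned int entries)
{
    unsigned int i;
    for (i = 0; i < hsize; i++) {
        unsigned int kept = 0;
        struct node *e;
        for (e = hash[i]; e; e = e->next) {
            if (++kept == limit && e->next) {
                struct node *drop;
                for (drop = e->next; drop; drop = drop->next)
                    entries--;          /* account for what we cut off */
                e->next = NULL;
                break;
            }
        }
    }
    return entries;
}

/* Stage 3 (cf. pack_delta_index): flatten the surviving chains into one
 * contiguous array plus a bucket-start table with a sentinel at the end,
 * like packed_hash[hsize] in the real code. */
static int *pack_buckets(struct node **hash, unsigned int hsize,
                         unsigned int entries, unsigned int *starts)
{
    int *flat = malloc((entries ? entries : 1) * sizeof(*flat));
    unsigned int i, pos = 0;
    struct node *e;

    if (!flat)
        return NULL;
    for (i = 0; i < hsize; i++) {
        starts[i] = pos;
        for (e = hash[i]; e; e = e->next)
            flat[pos++] = e->val;
    }
    starts[hsize] = pos;
    return flat;
}

int main(void)
{
    enum { HSIZE = 4, N = 16, LIMIT = 3 };
    struct node pool[N], *hash[HSIZE] = {0};
    unsigned int starts[HSIZE + 1];
    unsigned int i, entries = N;
    int *flat;

    /* Stage 1: populate.  In diff-delta.c this is the Rabin-window scan;
     * here each int is simply chained into bucket i % HSIZE. */
    for (i = 0; i < N; i++) {
        pool[i].val = (int)i;
        pool[i].next = hash[i % HSIZE];
        hash[i % HSIZE] = &pool[i];
    }

    entries = limit_buckets(hash, HSIZE, LIMIT, entries);
    flat = pack_buckets(hash, HSIZE, entries, starts);
    if (!flat)
        return 1;
    for (i = 0; i < entries; i++)
        printf("%d ", flat[i]);
    printf("\n(%u entries kept of %u)\n", entries, (unsigned)N);
    free(flat);
    return 0;
}

With the stages separated like this, a second front end, say one that
hashes only the insert regions of an existing delta, only needs to supply
stage 1.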