Rev 19: Merge John. in http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

Robert Collins robertc at robertcollins.net
Fri Jul 25 04:13:27 BST 2008


At http://people.ubuntu.com/~robertc/baz2.0/plugins/groupcompress/trunk

------------------------------------------------------------
revno: 19
revision-id: robertc at robertcollins.net-20080725031326-2c1xq6xusfoefxsi
parent: robertc at robertcollins.net-20080725000927-d88agg6z2bfzcks2
parent: john at arbash-meinel.com-20080725014613-6pe4o1mjuhajfkwb
committer: Robert Collins <robertc at robertcollins.net>
branch nick: trunk
timestamp: Fri 2008-07-25 13:13:26 +1000
message:
  Merge John.
modified:
  _groupcompress_c.pyx           _groupcompress_c.pyx-20080724041824-yelg6ii7c7zxt4z0-1
  tests/test__groupcompress_c.py test__groupcompress_-20080724145854-koifwb7749cfzrvj-1
    ------------------------------------------------------------
    revno: 14.1.33
    revision-id: john at arbash-meinel.com-20080725014613-6pe4o1mjuhajfkwb
    parent: john at arbash-meinel.com-20080725013428-hi3lvvasnz8qorp8
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: groupcompress
    timestamp: Thu 2008-07-24 20:46:13 -0500
    message:
      Use the cached match from the previous run, drops time from 2.4s => 2.0s on inventory.py
    modified:
      _groupcompress_c.pyx           _groupcompress_c.pyx-20080724041824-yelg6ii7c7zxt4z0-1
    ------------------------------------------------------------
    revno: 14.1.32
    revision-id: john at arbash-meinel.com-20080725013428-hi3lvvasnz8qorp8
    parent: john at arbash-meinel.com-20080724230854-rly3bvlhfl0p06jq
    committer: John Arbash Meinel <john at arbash-meinel.com>
    branch nick: groupcompress
    timestamp: Thu 2008-07-24 20:34:28 -0500
    message:
      Switch away from += for older versions of pyrex,
      and increase the internal hash array free space.
    modified:
      _groupcompress_c.pyx           _groupcompress_c.pyx-20080724041824-yelg6ii7c7zxt4z0-1
      tests/test__groupcompress_c.py test__groupcompress_-20080724145854-koifwb7749cfzrvj-1
=== modified file '_groupcompress_c.pyx'
--- a/_groupcompress_c.pyx	2008-07-25 00:09:27 +0000
+++ b/_groupcompress_c.pyx	2008-07-25 03:13:26 +0000
@@ -175,7 +175,7 @@
         # size hash table. However, any collision would then have a long way to
         # traverse before it could find a 'free' slot.
         # So we set the minimum size to give us 33% empty slots.
-        min_size = <Py_ssize_t>(needed * 1.5)
+        min_size = <Py_ssize_t>(needed * 2)
         hash_size = 1
         while hash_size < min_size:
             hash_size = hash_size << 1
@@ -199,11 +199,11 @@
         # We start off with a 8k hash (after doubling), because there isn't a
         # lot of reason to go smaller than that (this class isn't one you'll be
         # instantiating thousands of, and you are always likely to grow here.)
-        hash_size = 4096
+        hash_size = 2048
         while hash_size < needed:
             hash_size = hash_size << 1
-        # And we always give at least 2x blank space
-        hash_size = hash_size << 1
+        # And we always give at least 4x blank space
+        hash_size = hash_size << 2
         return hash_size
 
     def _py_compute_recommended_hash_size(self, needed):
@@ -543,10 +543,30 @@
     cdef int i
     lst = []
     for i from 0 <= i < count:
-        lst.append(array[i])
+        PyList_Append(lst, array[i])
     return lst
 
 
+cdef Py_ssize_t list_as_array(object lst, Py_ssize_t **array) except -1:
+    """Convert a Python sequence to an integer array.
+
+    :return: The size of the output, -1 on error
+    """
+    cdef int i
+    cdef PyObject *seq
+    cdef Py_ssize_t size
+
+    seq = PySequence_Fast(lst, "expected a sequence for lst")
+    try:
+        size = PySequence_Fast_GET_SIZE(seq)
+        array[0] = <Py_ssize_t*>safe_malloc(sizeof(Py_ssize_t) * size)
+        for i from 0 <= i < size:
+            array[0][i] = <object>PySequence_Fast_GET_ITEM(seq, i)
+    finally:
+        Py_DECREF(seq)
+    return size
+
+
 def _get_longest_match(equivalence_table, pos, max_pos, locations):
     """Get the longest possible match for the current position."""
     cdef EquivalenceTable eq
@@ -573,13 +593,16 @@
     copy_count = 0
 
     # TODO: Handle when locations is not None
+    if locations is not None:
+        match_count = list_as_array(locations, &matches)
     while cpos < cmax_pos:
         # TODO: Instead of grabbing the full list of matches and then doing an
         #       intersection, use a helper that does the intersection
         #       as-it-goes.
-        line = PyList_GET_ITEM(eq._right_lines, cpos)
-        safe_free(<void**>&matches)
-        match_count = eq.get_raw_matches(line, &matches)
+        if matches == NULL:
+            line = PyList_GET_ITEM(eq._right_lines, cpos)
+            safe_free(<void**>&matches)
+            match_count = eq.get_raw_matches(line, &matches)
         if match_count == 0:
             # No more matches, just return whatever we have, but we know that
             # this last position is not going to match anything

=== modified file 'tests/test__groupcompress_c.py'
--- a/tests/test__groupcompress_c.py	2008-07-24 23:08:54 +0000
+++ b/tests/test__groupcompress_c.py	2008-07-25 01:34:28 +0000
@@ -57,10 +57,10 @@
 
     def test_minimum_hash_size(self):
         eq = self._gc_module.EquivalenceTable([])
-        # We request at least 33% free space in the hash (to make collisions
+        # We request at least 50% free space in the hash (to make collisions
         # more bearable)
-        self.assertEqual(1024, eq._py_compute_minimum_hash_size(683))
-        self.assertEqual(2048, eq._py_compute_minimum_hash_size(684))
+        self.assertEqual(1024, eq._py_compute_minimum_hash_size(512))
+        self.assertEqual(2048, eq._py_compute_minimum_hash_size(513))
         self.assertEqual(2048, eq._py_compute_minimum_hash_size(1000))
         self.assertEqual(2048, eq._py_compute_minimum_hash_size(1024))
 
@@ -70,11 +70,11 @@
         self.assertEqual(8192, eq._py_compute_recommended_hash_size(10))
         self.assertEqual(8192, eq._py_compute_recommended_hash_size(1000))
         self.assertEqual(8192, eq._py_compute_recommended_hash_size(2000))
-        self.assertEqual(8192, eq._py_compute_recommended_hash_size(4000))
+        self.assertEqual(16384, eq._py_compute_recommended_hash_size(4000))
 
-        # And we recommend at least 50% free slots
-        self.assertEqual(8192, eq._py_compute_recommended_hash_size(4096))
-        self.assertEqual(16384, eq._py_compute_recommended_hash_size(4097))
+        # And we recommend at least 75% free slots
+        self.assertEqual(16384, eq._py_compute_recommended_hash_size(4096))
+        self.assertEqual(32768, eq._py_compute_recommended_hash_size(4097))
 
     def test__raw_lines(self):
         eq = self._gc_module.EquivalenceTable([1, 2, 3])




More information about the bazaar-commits mailing list