[Bf-blender-cvs] [1181dda] temp-ghash-experiments: GHash: fix/hugely enhance GHash tests.

Bastien Montagne noreply at git.blender.org
Sat Feb 21 00:31:50 CET 2015


Commit: 1181dda62f2961d1a6dab47aea672f1591dec00b
Author: Bastien Montagne
Date:   Fri Feb 20 20:00:52 2015 +0100
Branches: temp-ghash-experiments
https://developer.blender.org/rB1181dda62f2961d1a6dab47aea672f1591dec00b

GHash: fix/hugely enhance GHash tests.

Also, fix some stupid mistakes in some new murmur hashing helpers,
and rework quality function to simply compute variance in buckets occupation.

Note latest change has some quite funny effects, looks like variance is always
very close to load of the ghash...

As for performances:
* Globally, ghash and murmur have very similar results, both in speed and quality.
* For integers, ghash usually quicker.
* For strings, oddly murmur is much quicker when adding to ghash, but quite slower
  when lookingup in the ghash...

Anyway, still much to do in this area!

===================================================================

M	source/blender/blenlib/BLI_ghash.h
M	source/blender/blenlib/intern/BLI_ghash.c
M	tests/gtests/blenlib/BLI_ghash_test.cc

===================================================================

diff --git a/source/blender/blenlib/BLI_ghash.h b/source/blender/blenlib/BLI_ghash.h
index 15aae5f..0af320d 100644
--- a/source/blender/blenlib/BLI_ghash.h
+++ b/source/blender/blenlib/BLI_ghash.h
@@ -64,6 +64,7 @@ GHash *BLI_ghash_new_ex(GHashHashFP hashfp, GHashCmpFP cmpfp, const char *info,
 GHash *BLI_ghash_new(GHashHashFP hashfp, GHashCmpFP cmpfp, const char *info) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT;
 void   BLI_ghash_free(GHash *gh, GHashKeyFreeFP keyfreefp, GHashValFreeFP valfreefp);
 void   BLI_ghash_insert(GHash *gh, void *key, void *val);
+bool   BLI_ghash_add(GHash *gh, void *key, void *val);
 bool   BLI_ghash_reinsert(GHash *gh, void *key, void *val, GHashKeyFreeFP keyfreefp, GHashValFreeFP valfreefp);
 void  *BLI_ghash_lookup(GHash *gh, const void *key) ATTR_WARN_UNUSED_RESULT;
 void  *BLI_ghash_lookup_default(GHash *gh, const void *key, void *val_default) ATTR_WARN_UNUSED_RESULT;
@@ -134,21 +135,30 @@ bool            BLI_ghashutil_strcmp(const void *a, const void *b);
                 CHECK_TYPE_ANY(&(key), int *, const int *), \
                 BLI_ghashutil_uinthash((unsigned int)key))
 unsigned int    BLI_ghashutil_uinthash(unsigned int key);
+unsigned int    BLI_ghashutil_inthash_p(const void *ptr);
+unsigned int    BLI_ghashutil_inthash_p_murmur(const void *ptr);
+bool            BLI_ghashutil_intcmp(const void *a, const void *b);
+
+
+unsigned int    BLI_ghashutil_uinthash_v4(const unsigned int key[4]);
 #define         BLI_ghashutil_inthash_v4(key) ( \
                 CHECK_TYPE_ANY(key, int *, const int *), \
                 BLI_ghashutil_uinthash_v4((const unsigned int *)key))
-unsigned int    BLI_ghashutil_uinthash_v4(const unsigned int key[4]);
 #define         BLI_ghashutil_inthash_v4_p \
    ((GSetHashFP)BLI_ghashutil_uinthash_v4)
 #define         BLI_ghashutil_uinthash_v4_p \
    ((GSetHashFP)BLI_ghashutil_uinthash_v4)
-unsigned int    BLI_ghashutil_uinthash_v4_p_murmur(const void *ptr);
+unsigned int    BLI_ghashutil_uinthash_v4_murmur(const unsigned int key[4]);
+#define         BLI_ghashutil_inthash_v4_murmur(key) ( \
+                CHECK_TYPE_ANY(key, int *, const int *), \
+                BLI_ghashutil_uinthash_v4_murmur((const unsigned int *)key))
+#define         BLI_ghashutil_inthash_v4_p_murmur \
+   ((GSetHashFP)BLI_ghashutil_uinthash_v4_murmur)
+#define         BLI_ghashutil_uinthash_v4_p_murmur \
+   ((GSetHashFP)BLI_ghashutil_uinthash_v4_murmur)
 bool            BLI_ghashutil_uinthash_v4_cmp(const void *a, const void *b);
 #define         BLI_ghashutil_inthash_v4_cmp \
                 BLI_ghashutil_uinthash_v4_cmp
-unsigned int    BLI_ghashutil_inthash_p(const void *ptr);
-unsigned int    BLI_ghashutil_inthash_p_murmur(const void *ptr);
-bool            BLI_ghashutil_intcmp(const void *a, const void *b);
 
 /** \} */
 
@@ -235,8 +245,8 @@ BLI_INLINE bool BLI_gsetIterator_done(GSetIterator *gsi) { return BLI_ghashItera
 	     BLI_gsetIterator_step(&gs_iter_), i_++)
 
 //~ #ifdef DEBUG
-double BLI_ghash_calc_quality(GHash *gh);
-double BLI_gset_calc_quality(GSet *gs);
+double BLI_ghash_calc_quality(GHash *gh, double *r_load, int *r_min_bucket, int *r_max_bucket);
+double BLI_gset_calc_quality(GSet *gs, double *r_load, int *r_min_bucket, int *r_max_bucket);
 //~ #endif
 
 #ifdef __cplusplus
diff --git a/source/blender/blenlib/intern/BLI_ghash.c b/source/blender/blenlib/intern/BLI_ghash.c
index a8aacbe..e0c2e4b 100644
--- a/source/blender/blenlib/intern/BLI_ghash.c
+++ b/source/blender/blenlib/intern/BLI_ghash.c
@@ -361,6 +361,26 @@ void BLI_ghash_insert(GHash *gh, void *key, void *val)
 }
 
 /**
+ * Like #BLI_ghash_insert, but does nothing in case \a key is already in the \a gh.
+ *
+ * Avoids #BLI_ghash_haskey, #BLI_ghash_insert calls (double lookups)
+ *
+ * \returns true if a new key has been added.
+ */
+bool BLI_ghash_add(GHash *gh, void *key, void *val)
+{
+	const unsigned int hash = ghash_keyhash(gh, key);
+	Entry *e = ghash_lookup_entry_ex(gh, key, hash);
+	if (e) {
+		return false;
+	}
+	else {
+		ghash_insert_ex(gh, key, val, hash);
+		return true;
+	}
+}
+
+/**
  * Inserts a new value to a key that may already be in ghash.
  *
  * Avoids #BLI_ghash_remove, #BLI_ghash_insert calls (double lookups)
@@ -717,11 +737,9 @@ unsigned int BLI_ghashutil_uinthash_v4(const unsigned int key[4])
 	hash += key[3];
 	return hash;
 }
-unsigned int BLI_ghashutil_uinthash_v4_p_murmur(const void *ptr)
+unsigned int BLI_ghashutil_uinthash_v4_murmur(const unsigned int key[4])
 {
-	const unsigned int *key = ptr;
-
-	return BLI_hash_mm2((const unsigned char *)key, sizeof(*key) * 4, 0);
+	return BLI_hash_mm2((const unsigned char *)key, sizeof(key), 0);
 }
 
 bool BLI_ghashutil_uinthash_v4_cmp(const void *a, const void *b)
@@ -757,9 +775,9 @@ unsigned int BLI_ghashutil_inthash_p(const void *ptr)
 
 unsigned int BLI_ghashutil_inthash_p_murmur(const void *ptr)
 {
-	const unsigned int *key = ptr;
+	uintptr_t key = (uintptr_t)ptr;
 
-	return BLI_hash_mm2((const unsigned char *)key, sizeof(*key), 0);
+	return BLI_hash_mm2((const unsigned char *)&key, sizeof(key), 0);
 }
 
 bool BLI_ghashutil_intcmp(const void *a, const void *b)
@@ -802,7 +820,7 @@ unsigned int BLI_ghashutil_strhash_p_murmur(const void *ptr)
 {
 	const unsigned char *key = ptr;
 
-	return BLI_hash_mm2(key, strlen((const char *)key), 0);
+	return BLI_hash_mm2(key, strlen((const char *)key) + 1, 0);
 }
 bool BLI_ghashutil_strcmp(const void *a, const void *b)
 {
@@ -1044,37 +1062,68 @@ GSet *BLI_gset_pair_new(const char *info)
 
 /** \name Debugging & Introspection
  * \{ */
-//~ #ifdef DEBUG
+
+#include "BLI_math.h"
 
 /**
- * Measure how well the hash function performs
- * (1.0 is approx as good as random distribution).
+ * Measure how well the hash function performs, i.e. variance of the distribution of the entries in the buckets.
+ * (0.0 is approx as good as even distribution).
  *
  * Smaller is better!
  */
-double BLI_ghash_calc_quality(GHash *gh)
+double BLI_ghash_calc_quality(GHash *gh, double *r_load, int *r_min_bucket, int *r_max_bucket)
 {
-	uint64_t sum = 0;
+	double sum = 0.0;
+	double mean;
 	unsigned int i;
 
-	if (gh->nentries == 0)
-		return -1.0;
+	if (gh->nentries == 0) {
+		if (r_load) {
+			*r_load = 0.0;
+		}
+		if (r_min_bucket) {
+			*r_min_bucket = 0;
+		}
+		if (r_max_bucket) {
+			*r_max_bucket = 0;
+		}
 
+		return 0.0;
+	}
+
+	mean = (double)gh->nentries / (double)gh->nbuckets;
+	if (r_load) {
+		*r_load = mean;
+	}
+	if (r_min_bucket) {
+		*r_min_bucket = INT_MAX;
+	}
+	if (r_max_bucket) {
+		*r_max_bucket = 0;
+	}
+
+	/* We already know our mean (i.e. load factor), easy to compute variance.
+	 * See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
+	 */
 	for (i = 0; i < gh->nbuckets; i++) {
-		uint64_t count = 0;
+		int count = 0;
 		Entry *e;
 		for (e = gh->buckets[i]; e; e = e->next) {
-			count += 1;
+			count++;
+		}
+		if (r_min_bucket) {
+			*r_min_bucket = min_ii(*r_min_bucket, (int)count);
+		}
+		if (r_max_bucket) {
+			*r_max_bucket = max_ii(*r_max_bucket, (int)count);
 		}
-		sum += count * (count + 1);
+		sum += ((double)count - mean) * ((double)count - mean);
 	}
-	return ((double)sum * (double)gh->nbuckets /
-	        ((double)gh->nentries * (gh->nentries + 2 * gh->nbuckets - 1)));
+	return sum / (double)(gh->nbuckets - 1);
 }
-double BLI_gset_calc_quality(GSet *gs)
+double BLI_gset_calc_quality(GSet *gs, double *r_load, int *r_min_bucket, int *r_max_bucket)
 {
-	return BLI_ghash_calc_quality((GHash *)gs);
+	return BLI_ghash_calc_quality((GHash *)gs, r_load, r_min_bucket, r_max_bucket);
 }
 
-//~ #endif
 /** \} */
diff --git a/tests/gtests/blenlib/BLI_ghash_test.cc b/tests/gtests/blenlib/BLI_ghash_test.cc
index 0a27443..b45de8f 100644
--- a/tests/gtests/blenlib/BLI_ghash_test.cc
+++ b/tests/gtests/blenlib/BLI_ghash_test.cc
@@ -2,11 +2,11 @@
 
 #include "testing/testing.h"
 
-#define BLI_INLINE
-
 extern "C" {
 #include "MEM_guardedalloc.h"
+#include "BLI_utildefines.h"
 #include "BLI_ghash.h"
+#include "BLI_rand.h"
 #include "BLI_string.h"
 #include "PIL_time_utildefines.h"
 }
@@ -128,164 +128,250 @@ Vivamus eu ligula blandit, imperdiet arcu at, rutrum sem. Aliquam erat volutpat.
 Nullam eget malesuada justo, at molestie quam. Sed consequat massa eu faucibus maximus. Curabitur placerat orci sapien, sit amet semper magna sodales non. Ut fermentum accumsan odio in consectetur. Morbi neque mi, vulputate nec mi ut, cursus scelerisque lectus. Nulla sapien enim, finibus id ipsum luctus, consequat ullamcorper lectus. Sed volutpat sed massa in sodales. Morbi lacinia diam eu commodo vulputate. Fusce aliquet pulvinar dolor in egestas. Fusce molestie commodo leo eu ultricies [...]
 Praesent luctus vitae nunc vitae pellentesque. Praesent faucibus sed urna ut lacinia. Vivamus id justo quis dolor porta rutrum nec nec odio. Cras euismod tortor quis diam ultrices, eu mattis nisi consectetur. Fusce mattis nisi vel condimentum molestie. Fusce fringilla ut nibh volutpat elementum. Mauris posuere consectetur leo a aliquet. Donec quis sodales sapien. Maecenas ut felis tempus, eleifend mauris et, faucibus mi. Quisque fringilla orci arcu, sit amet porta risus hendrerit non. Ae [...]
 
+/* Using http://corpora.uni-leipzig.de/downloads/eng_wikipedia_2010_1M-text.tar.gz (1 million of words, about 122MB of text)
+ * from http://corpora.informatik.uni-leipzig.de/download.html */
+#define TEXT_CORPUS_PATH "/home/i74700deb64/Téléchargements/eng_wikipedia_2010_1M-text/eng_wikipedia_2010_1M-sentences.txt"
 
-TEST(ghash, TextGHash)
+#define PRINTF_GHASH_ST

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list