diff options
author | jsdelfino <jsdelfino@13f79535-47bb-0310-9956-ffa450edef68> | 2010-04-12 06:57:32 +0000 |
---|---|---|
committer | jsdelfino <jsdelfino@13f79535-47bb-0310-9956-ffa450edef68> | 2010-04-12 06:57:32 +0000 |
commit | 76361f54a82714220a1aa8e317a0eb3be1f08211 (patch) | |
tree | dfd26d3aa068ec7f62014e96147bf88c9211afc9 /sca-cpp/trunk | |
parent | bd76aa29d2df9104d6989d6aa7b81dc3db1d455c (diff) |
Add various utility hash functions, useful to hash and retrieve keys from a datastore.
git-svn-id: http://svn.us.apache.org/repos/asf/tuscany@933115 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'sca-cpp/trunk')
-rw-r--r-- | sca-cpp/trunk/kernel/Makefile.am | 6 | ||||
-rwxr-xr-x | sca-cpp/trunk/kernel/hash-test | bin | 0 -> 186664 bytes | |||
-rw-r--r-- | sca-cpp/trunk/kernel/hash-test.cpp | 135 | ||||
-rw-r--r-- | sca-cpp/trunk/kernel/hash.hpp | 207 |
4 files changed, 346 insertions, 2 deletions
diff --git a/sca-cpp/trunk/kernel/Makefile.am b/sca-cpp/trunk/kernel/Makefile.am index a8828b0958..f7db7821a3 100644 --- a/sca-cpp/trunk/kernel/Makefile.am +++ b/sca-cpp/trunk/kernel/Makefile.am @@ -38,6 +38,8 @@ xml_test_LDFLAGS = -lxml2 xsd_test_SOURCES = xsd-test.cpp xsd_test_LDFLAGS = -lxml2 -noinst_PROGRAMS = string-test kernel-test mem-test parallel-test xml-test xsd-test -TESTS = string-test kernel-test mem-test parallel-test xml-test +hash_test_SOURCES = hash-test.cpp + +noinst_PROGRAMS = string-test kernel-test hash-test mem-test parallel-test xml-test xsd-test +TESTS = string-test kernel-test hash-test mem-test parallel-test xml-test diff --git a/sca-cpp/trunk/kernel/hash-test b/sca-cpp/trunk/kernel/hash-test Binary files differnew file mode 100755 index 0000000000..00d1ff3a2a --- /dev/null +++ b/sca-cpp/trunk/kernel/hash-test diff --git a/sca-cpp/trunk/kernel/hash-test.cpp b/sca-cpp/trunk/kernel/hash-test.cpp new file mode 100644 index 0000000000..b794e38920 --- /dev/null +++ b/sca-cpp/trunk/kernel/hash-test.cpp @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* $Rev$ $Date$ */ + +/** + * Test hash functions. + */ + +#include <assert.h> +#include "string.hpp" +#include "hash.hpp" +#include "perf.hpp" + +namespace tuscany { + +bool testCrc32hash() { + const string key("This is a test key"); + unsigned int h = crc32hash(c_str(key), length(key)); + assert(h != 0); + return true; +} + +bool testTimes33hash() { + const string key("This is a test key"); + unsigned int h = times33hash(c_str(key), length(key)); + assert(h != 0); + return true; +} + +bool testMurmurhash() { + const string key("This is a test key"); + unsigned int h = murmurhash(c_str(key), length(key)); + assert(h != 0); + return true; +} + +bool testPortablemurmurhash() { + const string key("This is a test key"); + unsigned int h = portablemurmurhash(c_str(key), length(key)); + assert(h != 0); + return true; +} + +struct crc32hashTest { + const string key; + crc32hashTest(const string& key) : key(key) { + } + bool operator()() const { + unsigned int h = crc32hash(c_str(key), length(key)); + assert(h != 0); + return true; + } +}; + +struct times33hashTest { + const string key; + times33hashTest(const string& key) : key(key) { + } + bool operator()() const { + unsigned int h = times33hash(c_str(key), length(key)); + assert(h != 0); + return true; + } +}; + +struct murmurhashTest { + const string key; + murmurhashTest(const string& key) : key(key) { + } + bool operator()() const { + unsigned int h = murmurhash(c_str(key), length(key)); + assert(h != 0); + return true; + } +}; + +struct portablemurmurhashTest { + const string key; + portablemurmurhashTest(const string& key) : key(key) { + } + bool operator()() const { + unsigned int h = portablemurmurhash(c_str(key), length(key)); + assert(h != 0); + return true; + } +}; + +bool testHashPerf() { + const string key("This is a test key"); + const int count = 100000; + + const lambda<bool()> crc32 = crc32hashTest(key); + cout << "crc32hash test " << time(crc32, 5, count) << " ms" << endl; + const lambda<bool()> times33 = times33hashTest(key); + cout << "times33hash test " << time(times33, 5, count) << " ms" << endl; + const lambda<bool()> murmur = murmurhashTest(key); + cout << "murmurhash test " << time(murmur, 5, count) << " ms" << endl; + const lambda<bool()> portablemurmur = portablemurmurhashTest(key); + cout << "portable murmurhash test " << time(portablemurmur, 5, count) << " ms" << endl; + + return true; +} + +} + +int main() { + tuscany::cout << "Testing..." << tuscany::endl; + + tuscany::testCrc32hash(); + tuscany::testTimes33hash(); + tuscany::testMurmurhash(); + tuscany::testPortablemurmurhash(); + tuscany::testHashPerf(); + + tuscany::cout << "OK" << tuscany::endl; + + return 0; +} diff --git a/sca-cpp/trunk/kernel/hash.hpp b/sca-cpp/trunk/kernel/hash.hpp new file mode 100644 index 0000000000..5cd213f09e --- /dev/null +++ b/sca-cpp/trunk/kernel/hash.hpp @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* $Rev$ $Date$ */ + +#ifndef tuscany_hash_hpp +#define tuscany_hash_hpp + +/** + * Fast hash functions. + */ + +#include <apr_hash.h> +#include <apr_memcache.h> + +namespace tuscany +{ + +/** + * Apache apr-util CRC32 hash function. + * + * See srclib/apr-util/memcache/apr_memcache.c from the Apache HTTPD + * source tree. Reproducing the comments from apr_memcache.c here: + * + * The crc32 functions and data were originally written by Spencer + * Garrett <srg@quick.com> and were gleaned from the PostgreSQL source + * tree at contrib/ltree/crc32.[ch] and from FreeBSD at + * src/usr.bin/cksum/crc32.c. + */ +const unsigned int crc32hash(const char* data, const unsigned int len) { + return apr_memcache_hash_default(NULL, data, len); +} + +/** + * Apache apr tables default hash function. + * + * See srclib/apr/tables/apr_hash.c from the Apache HTTPD source tree. + * Reproducing the comments from apr_hash.c here: + * + * This is the popular `times 33' hash algorithm which is used by + * perl and also appears in Berkeley DB. This is one of the best + * known hash functions for strings because it is both computed + * very fast and distributes very well. + * + * The originator may be Dan Bernstein but the code in Berkeley DB + * cites Chris Torek as the source. The best citation I have found + * is "Chris Torek, Hash function for text in C, Usenet message + * <27038@mimsy.umd.edu> in comp.lang.c , October, 1990." in Rich + * Salz's USENIX 1992 paper about INN which can be found at + * <http://citeseer.nj.nec.com/salz92internetnews.html>. + * + * The magic of number 33, i.e. why it works better than many other + * constants, prime or not, has never been adequately explained by + * anyone. So I try an explanation: if one experimentally tests all + * multipliers between 1 and 256 (as I did while writing a low-level + * data structure library some time ago) one detects that even + * numbers are not useable at all. The remaining 128 odd numbers + * (except for the number 1) work more or less all equally well. + * They all distribute in an acceptable way and this way fill a hash + * table with an average percent of approx. 86%. + * + * If one compares the chi^2 values of the variants (see + * Bob Jenkins ``Hashing Frequently Asked Questions'' at + * http://burtleburtle.net/bob/hash/hashfaq.html for a description + * of chi^2), the number 33 not even has the best value. But the + * number 33 and a few other equally good numbers like 17, 31, 63, + * 127 and 129 have nevertheless a great advantage to the remaining + * numbers in the large set of possible multipliers: their multiply + * operation can be replaced by a faster operation based on just one + * shift plus either a single addition or subtraction operation. And + * because a hash function has to both distribute good _and_ has to + * be very fast to compute, those few numbers should be preferred. + * + * -- Ralf S. Engelschall <rse@engelschall.com> + */ +const unsigned int times33hash(const char* data, const unsigned int len) { + apr_ssize_t l = len; + return apr_hashfunc_default(data, &l); +} + +/** + * A very fast, non-cryptographic hash suitable for general hash-based + * lookup. See http://murmurhash.googlepages.com/ for more details. + * + * Original code by Austin Appleby, released to the public domain and under + * the MIT license. + * + * Compiles down to ~52 instructions on x86. + * Passes chi^2 tests for practically all keysets & bucket sizes. + * Excellent avalanche behavior. Maximum bias is under 0.5%. + * Passes Bob Jenkin's frog.c torture-test. No collisions possible for 4 byte + * keys, no small 1 to 7 bit differentials. + */ +const unsigned int murmurhash(const char* key, const unsigned int klen) { + unsigned int len = klen; + const unsigned int seed = 0; + + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + const unsigned char* data = (const unsigned char*)key; + while(len >= 4) { + unsigned int k = *(unsigned int*)data; + k *= m; + k ^= k >> r; + k *= m; + h *= m; + h ^= k; + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + switch(len) { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +/** + * An endian and alignment neutral, but half the speed, version of + * the murmur hash. + */ +const unsigned int portablemurmurhash(const char* key, const unsigned int klen) { + unsigned int len = klen; + const unsigned int seed = 0; + + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + const unsigned char* data = (const unsigned char *)key; + while(len >= 4) { + unsigned int k; + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + k *= m; + k ^= k >> r; + k *= m; + h *= m; + h ^= k; + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + switch(len) { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +const unsigned int hashselect(const unsigned int hash, const unsigned int max) { + return hash % max; +} + +} +#endif /* tuscany_hash_hpp */ |