From 9ce2359f893870e3db5fea3c4d3fbbec4bad5a06 Mon Sep 17 00:00:00 2001 From: Yoni Fogel Date: Fri, 1 Feb 2008 21:52:10 +0000 Subject: [PATCH] Added hash table git-svn-id: file:///svn/tokudb@2039 c7de825b-a66e-492c-adef-691d508d4ae1 --- src/hash_table/hashtable.c | 342 +++++++++++++++++++++++++++++++++++++ src/hash_table/hashtable.h | 100 +++++++++++ src/hash_table/hashtest.c | 264 ++++++++++++++++++++++++++++ 3 files changed, 706 insertions(+) create mode 100755 src/hash_table/hashtable.c create mode 100755 src/hash_table/hashtable.h create mode 100755 src/hash_table/hashtest.c diff --git a/src/hash_table/hashtable.c b/src/hash_table/hashtable.c new file mode 100755 index 00000000000..361d9051aa7 --- /dev/null +++ b/src/hash_table/hashtable.c @@ -0,0 +1,342 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." + +/* Hash table with chaining. */ +#include "hashtable.h" +#include "memory.h" +#include "primes.h" +// #include "../include/ydb-constants.h" +#include +#include +#include +#include + +#include "key.h" +#include "yerror.h" +#include "hashfun.h" + +static int hashelt_list_verify(struct hashelt_list *helist) { + HASHELT e = helist->head; + if (e == 0) + return helist->tail == 0; + while (e->next) + e = e->next; + return helist->tail == e; +} + +static inline void hashelt_list_init(struct hashelt_list *helist) { + helist->head = helist->tail = 0; +} + +static inline void hashelt_list_append(struct hashelt_list *helist, HASHELT e) { + assert(hashelt_list_verify(helist)); + e->next = 0; + if (helist->tail) + helist->tail->next = e; + else + helist->head = e; + helist->tail = e; + assert(hashelt_list_verify(helist)); +} + +static inline HASHELT hashelt_list_pop(struct hashelt_list *helist) { + assert(hashelt_list_verify(helist)); + HASHELT e = helist->head; + if (e) { + helist->head = e->next; + if (helist->head == 0) + helist->tail = 0; + assert(hashelt_list_verify(helist)); + } + return e; +} + +static inline HASHELT hashelt_list_peek(struct hashelt_list *helist) { + return helist->head; +} + +int toku_hashtable_create (HASHTABLE *h) { + HASHTABLE MALLOC(tab); + unsigned int i; + if (tab==0) return -1; + tab->n_keys=0; + tab->primeidx=0; + tab->arraysize=toku_get_prime(tab->primeidx); + assert(sizeof(*tab->array)==sizeof(void*)); + tab->array = toku_calloc(tab->arraysize, sizeof(*tab->array)); + for (i=0; iarraysize; i++) tab->array[i]=0; + tab->allow_dups = 1; + *h=tab; + return 0; +} + +int toku_hashtable_set_dups (HASHTABLE tab, unsigned int allow_dups) { + tab->allow_dups = allow_dups; + return 0; +} + +static void hash_find_internal (HASHTABLE tab, unsigned int hash, const unsigned char *key, ITEMLEN keylen, HASHDUP *dup_ptr, HASHDUP **prev_ptr) { + unsigned int h = hash % tab->arraysize; + HASHDUP dup; + HASHDUP *prev = &tab->array[h]; + for (dup=*prev; dup; prev=&dup->next, dup=*prev) { + HASHELT he = hashelt_list_peek(&dup->kdlist); assert(he); + if (keylen==he->keylen && memcmp(key, he->keyval, keylen)==0) { + *prev_ptr = prev; + *dup_ptr = dup; + return; + } + } + *prev_ptr = prev; + *dup_ptr = 0; +} + +int toku_hash_find_idx (HASHTABLE tab, bytevec key, ITEMLEN keylen, int idx, bytevec *data, ITEMLEN *datalen, int *type) { + HASHDUP dup, *prev; + hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev); + if (dup==0) { + return -1; + } else { + HASHELT he = hashelt_list_peek(&dup->kdlist); + int i; + for (i=0; inext; + if (he == 0) + return -2; + } + *data = &he->keyval[he->keylen]; + *datalen = he->vallen; + *type = he->type; + return 0; + } +} + +int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen, int *type) { + HASHDUP dup, *prev; + hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev); + if (dup==0) { + return -1; + } else { + HASHELT he = hashelt_list_peek(&dup->kdlist); + *data = &he->keyval[he->keylen]; + *datalen = he->vallen; + *type = he->type; + return 0; + } +} + +int toku_hash_rehash_everything (HASHTABLE tab, unsigned int primeindexdelta) { + int newprimeindex = primeindexdelta+tab->primeidx; + assert(newprimeindex>=0); + unsigned int newarraysize = toku_get_prime(newprimeindex); + HASHDUP *newarray = toku_calloc(newarraysize, sizeof(*tab->array)); + unsigned int i; + //printf("%s:%d newarraysize=%d\n", __FILE__, __LINE__, newarraysize); + assert(newarray!=0); + tab->primeidx=newprimeindex; + for (i=0; iarraysize; i++) { + HASHDUP dup; + while ((dup=tab->array[i])!=0) { + HASHELT he = hashelt_list_peek(&dup->kdlist); assert(he); + //unsigned int hk = hash_key((unsigned char *)he->key, he->keylen); + unsigned int h = he->hash%newarraysize; + //assert(he->hash==hk); + tab->array[i] = dup->next; + dup->next = newarray[h]; + newarray[h] = dup; + } + } + toku_free(tab->array); + // printf("Freed\n"); + tab->array=newarray; + tab->arraysize=newarraysize; + //printf("Done growing or shrinking\n"); + return 0; +} + +int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void *val, ITEMLEN vallen, int type) +{ + unsigned int hk = hash_key (key,keylen); + unsigned int h = hk%tab->arraysize; + HASHDUP dup,*prev_ptr; + hash_find_internal(tab, hk, key, keylen, &dup, &prev_ptr); + if (dup == 0) { + dup = toku_malloc(sizeof *dup); + assert(dup); + hashelt_list_init(&dup->kdlist); + + dup->next = tab->array[h]; + tab->array[h]=dup; + } else if (!tab->allow_dups) + return BRT_ALREADY_THERE; + + HASHELT he=toku_malloc(sizeof(*he)+keylen+vallen); + assert(he); // ????? + he->type = type; + he->keylen = keylen; + he->vallen = vallen; + memmove(&he->keyval[0], key, keylen); + memmove(&he->keyval[keylen], val, vallen); + he->hash = hk; + + hashelt_list_append(&dup->kdlist, he); + + tab->n_keys++; + if (tab->n_keys > tab->arraysize) { + return toku_hash_rehash_everything(tab, +1); + } + return BRT_OK; +} + +int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen) { + HASHDUP dup, *prev_ptr; + //printf("%s:%d deleting %s (bucket %d)\n", __FILE__, __LINE__, key, hash_key(key,keylen)%tab->arraysize); + hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev_ptr); + if (dup==0) return DB_NOTFOUND; + else { + assert(*prev_ptr==dup); + + HASHELT he = hashelt_list_pop(&dup->kdlist); + assert(he); + //printf("%s:%d deleting %s %s\n", __FILE__, __LINE__, he->key, he->val); + toku_free_n(he, sizeof(*he)+he->keylen+he->vallen); + tab->n_keys--; + + if (!hashelt_list_peek(&dup->kdlist)) { + /* delete the dups from the hash list */ + *prev_ptr = dup->next; + toku_free_n(dup, sizeof *dup); + } + + if ((tab->n_keys * 4 < tab->arraysize) && tab->primeidx>0) { + return toku_hash_rehash_everything(tab, -1); + } + return BRT_OK; + } +} + +int toku_hash_delete_all (HASHTABLE tab, const void *key, ITEMLEN keylen) { + HASHDUP dup, *prev_ptr; + //printf("%s:%d deleting %s (bucket %d)\n", __FILE__, __LINE__, key, hash_key(key,keylen)%tab->arraysize); + hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev_ptr); + if (dup==0) return DB_NOTFOUND; + else { + assert(*prev_ptr==dup); + /* delete the dups from the hash list */ + *prev_ptr = dup->next; + + /* delete all of the kd pairs in the dup list */ + HASHELT he; + while ((he = hashelt_list_pop(&dup->kdlist)) != 0 ) { + //printf("%s:%d deleting %s %s\n", __FILE__, __LINE__, he->key, he->val); + toku_free_n(he, sizeof(*he)+he->keylen+he->vallen); + tab->n_keys--; + } + + toku_free_n(dup, sizeof *dup); + + if ((tab->n_keys * 4 < tab->arraysize) && tab->primeidx>0) { + return toku_hash_rehash_everything(tab, -1); + } + return BRT_OK; + } +} + + +int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, int *type, long int *randomnumber) { + unsigned int i; + unsigned int usei = (*randomnumber)%h->arraysize; + for (i=0; iarraysize; i++, usei++) { + if (usei>=h->arraysize) usei=0; + HASHDUP dup=h->array[usei]; + if (dup) { + HASHELT he = hashelt_list_peek(&dup->kdlist); assert(he); + *key = &he->keyval[0]; + *keylen = he->keylen; + *data = &he->keyval[he->keylen]; + *datalen = he->vallen; + *type = he->type; + *randomnumber = usei; + return 0; + } + } + return -1; +} + +#if 0 +int hashtable_find_last(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen) { + bytevec best_k=0, best_d; + ITEMLEN best_kl, best_dl; + HASHTABLE_ITERATE(h, this_k, this_kl, this_d, this_dl, + ({ + if (best_k==0 || toku_keycompare(best_k, best_kl, this_k, this_kl)<0) { + best_k = this_k; + best_kl = this_kl; + best_d = this_d; + best_dl = this_dl; + } + })); + if (best_k) { + *key = best_k; + *keylen = best_kl; + *data = best_d; + *datalen = best_dl; + return 0; + } else { + return -1; + } +} +#endif + +void toku_hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, void*args), void* args) { + /* + int i; + for (i=0; iarraysize; i++) { + HASHELT he; + for (he=tab->array[i]; he; he=he->next) { + f(he->key, he->keylen, he->val, he->vallen, args); + } + } + */ + HASHTABLE_ITERATE(tab, key, keylen, val, vallen, type, f(key,keylen,val,vallen,type,args)); +} + +int toku_hashtable_n_entries(HASHTABLE tab) { + return tab->n_keys; +} + +/* Frees the list, but doesn't free the keys. */ +static void hasheltlist_free (HASHELT elt) { + if (elt==0) return; + else { + hasheltlist_free(elt->next); + toku_free_n(elt, sizeof(*elt)+elt->keylen+elt->vallen); + } +} + +/* Frees the table, but doesn't do anything to the contents of the table. The keys are still alloc'd. The internal storage of the hashtable is freed. */ +void toku_hashtable_free(HASHTABLE *tab) { + //printf("%s:%d free hashtable %p\n", __FILE__, __LINE__, tab); + toku_hashtable_clear(*tab); + //printf("%s:%d free %p\n", __FILE__, __LINE__, tab);n + toku_free((*tab)->array); + toku_free_n(*tab, sizeof(**tab)); + *tab=0; +} + + +void toku_hashtable_clear(HASHTABLE tab) { + unsigned int i; + for (i=0; iarraysize; i++) { + HASHDUP dup = tab->array[i]; + while (dup) { + HASHDUP nextdup = dup->next; + hasheltlist_free(hashelt_list_peek(&dup->kdlist)); + toku_free_n(dup, sizeof *dup); + dup = nextdup; + } + tab->array[i]=0; + } + tab->n_keys = 0; +} diff --git a/src/hash_table/hashtable.h b/src/hash_table/hashtable.h new file mode 100755 index 00000000000..314e31d0dfd --- /dev/null +++ b/src/hash_table/hashtable.h @@ -0,0 +1,100 @@ +#ifndef HASHTABLE_H +#define HASHTABLE_H + +#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." + +#include "brttypes.h" +/* Hash table with chaining. */ +/* The keys and values are byte sequences. */ +/* The keys and values are malloc'd by the hashtable. */ +/* Duplicate keys are allowed by default and are stored in a FIFO list */ + +typedef struct hashtable *HASHTABLE; + +int toku_hashtable_create (HASHTABLE*); + +/* Configure the hash table for duplicate keys. + allow_dups != 0 -> duplications allowed, allow_dups == 0 -> no duplicates */ + +int toku_hashtable_set_dups (HASHTABLE, unsigned int allow_dups); + +/* Return 0 if the key is found in the hashtable, -1 otherwise. */ +/* Warning: The data returned points to the internals of the hashtable. It is set to "const" to try to prevent you from messing it up. */ +int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen, int *type); + +/* match on key, index on duplicates */ +int toku_hash_find_idx (HASHTABLE tab, bytevec key, ITEMLEN keylen, int idx, bytevec *data, ITEMLEN *datalen, int *type); + +/* Insert the key/data pair into the hash table. + If the key is not in the hash table then insert it. + If the key already exists and duplicates are allowed then append it to the list of duplicates. + If the key already exists and duplicates are not allowed then return an error */ + +int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type); + +/* Delete the first entry with the given key + It is OK to delete something that isn't there. */ + +int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen); + +/* Delete all entries with the given key */ + +int toku_hash_delete_all (HASHTABLE tab, const void *key, ITEMLEN keylen); + +void toku_hashtable_free(HASHTABLE *tab); +int toku_hashtable_n_entries(HASHTABLE); + +void toku_hashtable_clear(HASHTABLE); + +int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, int *type, long int *randomnumber); +//int hashtable_find_last(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen); + +typedef struct hashelt *HASHELT; +struct hashelt { + HASHELT next; + unsigned int hash; + int type; + ITEMLEN keylen; + ITEMLEN vallen; + char keyval[]; /* the first KEYLEN bytes are the key. The next bytes are the value. */ +}; + +struct hashelt_list { + HASHELT head; + HASHELT tail; +}; + +typedef struct hashdup *HASHDUP; +struct hashdup { + HASHDUP next; + struct hashelt_list kdlist; +}; + +struct hashtable { + HASHDUP *array; + unsigned int n_keys; + unsigned int arraysize; + unsigned int primeidx; + unsigned int allow_dups; +}; + +/* You cannot add or delete elements from the hashtable while iterating. */ +void toku_hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, void*), void*); + +// If you don't want to use something, do something like use "key __attribute__((__unused__))" for keyvar. +#define HASHTABLE_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,typevar,body) ({ \ + unsigned int hi_counter; \ + for (hi_counter=0; hi_counterarraysize; hi_counter++) { \ + HASHDUP hi_dup; \ + for (hi_dup=table->array[hi_counter]; hi_dup; hi_dup=hi_dup->next) { \ + HASHELT hi_he; \ + for (hi_he=hi_dup->kdlist.head; hi_he; hi_he=hi_he->next) { \ + const char *keyvar = &hi_he->keyval[0]; \ + ITEMLEN keylenvar = hi_he->keylen; \ + const char *datavar = &hi_he->keyval[hi_he->keylen]; \ + ITEMLEN datalenvar = hi_he->vallen; \ + int typevar = hi_he->type; \ + body; \ + }}}}) + +#endif diff --git a/src/hash_table/hashtest.c b/src/hash_table/hashtest.c new file mode 100755 index 00000000000..9f4f7238719 --- /dev/null +++ b/src/hash_table/hashtest.c @@ -0,0 +1,264 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." + +#include "key.h" +#include "hashtable.h" +#include "memory.h" +#include "primes.h" +#include +#include +#include +#include +#include + +void verify_hash_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl, + int N, int *data, char *saw) { + char *kv = (char*)kv_v; + char *dv = (char*)dv_v; + int num, k; + assert(kv[0]=='k'); + assert(dv[0]=='d'); + assert(strcmp(kv+1, dv+1)==0); + assert(strlen(kv)+1==kl); + assert(strlen(dv)+1==dl); + num = atoi(kv+1); + for (k=0; k