From 2f8ec84fb873345973c7671f0bf455687b72b982 Mon Sep 17 00:00:00 2001 From: "Bradley C. Kuszmaul" Date: Fri, 13 Jul 2007 19:37:47 +0000 Subject: [PATCH] Rename git-svn-id: file:///svn/tokudb@4 c7de825b-a66e-492c-adef-691d508d4ae1 --- Makefile | 2 + include/db.h | 189 ++++ include/ydb-constants.h | 27 + newbrt/Makefile | 50 + newbrt/brt-internal.h | 89 ++ newbrt/brt-serialize-test.c | 68 ++ newbrt/brt-serialize.c | 441 +++++++++ newbrt/brt-test.c | 695 ++++++++++++++ newbrt/brt.c | 1544 +++++++++++++++++++++++++++++++ newbrt/brt.h | 35 + newbrt/brttypes.h | 8 + newbrt/cachetable-test.c | 277 ++++++ newbrt/cachetable.c | 453 +++++++++ newbrt/cachetable.h | 59 ++ newbrt/hashtable.c | 216 +++++ newbrt/hashtable.h | 58 ++ newbrt/hashtest.c | 113 +++ newbrt/header-io.c | 112 +++ newbrt/key.c | 27 + newbrt/key.h | 5 + newbrt/mdict-test.c | 89 ++ newbrt/mdict.c | 64 ++ newbrt/mdict.h | 83 ++ newbrt/memory.c | 191 ++++ newbrt/memory.h | 24 + newbrt/myassert.c | 9 + newbrt/myassert.h | 11 + newbrt/pma-internal.h | 38 + newbrt/pma-test.c | 444 +++++++++ newbrt/pma.c | 548 +++++++++++ newbrt/pma.h | 80 ++ newbrt/randdb4.c | 133 +++ newbrt/ybt-test.c | 55 ++ newbrt/ybt.c | 44 + newbrt/ybt.h | 12 + newbrt/yerror.h | 3 + pma/Makefile | 5 + src-bdbwrap/Makefile | 46 + src-bdbwrap/README | 121 +++ src-bdbwrap/bdbw.c | 750 +++++++++++++++ src-bdbwrap/bdbw.h | 213 +++++ src-bdbwrap/wrapper-design.text | 25 + src-bdbwrap/ydb-uniq.h | 129 +++ src-bdbwrap/ydb.c | 117 +++ src/Makefile | 6 + src/README | 91 ++ src/ydb.c | 420 +++++++++ utils/ydb_dump.c | 10 + 48 files changed, 8229 insertions(+) create mode 100644 Makefile create mode 100644 include/db.h create mode 100644 include/ydb-constants.h create mode 100644 newbrt/Makefile create mode 100644 newbrt/brt-internal.h create mode 100644 newbrt/brt-serialize-test.c create mode 100644 newbrt/brt-serialize.c create mode 100644 newbrt/brt-test.c create mode 100644 newbrt/brt.c create mode 100644 newbrt/brt.h create mode 100644 
newbrt/brttypes.h create mode 100644 newbrt/cachetable-test.c create mode 100644 newbrt/cachetable.c create mode 100644 newbrt/cachetable.h create mode 100644 newbrt/hashtable.c create mode 100644 newbrt/hashtable.h create mode 100644 newbrt/hashtest.c create mode 100644 newbrt/header-io.c create mode 100644 newbrt/key.c create mode 100644 newbrt/key.h create mode 100644 newbrt/mdict-test.c create mode 100644 newbrt/mdict.c create mode 100644 newbrt/mdict.h create mode 100644 newbrt/memory.c create mode 100644 newbrt/memory.h create mode 100644 newbrt/myassert.c create mode 100644 newbrt/myassert.h create mode 100644 newbrt/pma-internal.h create mode 100644 newbrt/pma-test.c create mode 100644 newbrt/pma.c create mode 100644 newbrt/pma.h create mode 100644 newbrt/randdb4.c create mode 100644 newbrt/ybt-test.c create mode 100644 newbrt/ybt.c create mode 100644 newbrt/ybt.h create mode 100644 newbrt/yerror.h create mode 100644 pma/Makefile create mode 100644 src-bdbwrap/Makefile create mode 100644 src-bdbwrap/README create mode 100644 src-bdbwrap/bdbw.c create mode 100644 src-bdbwrap/bdbw.h create mode 100644 src-bdbwrap/wrapper-design.text create mode 100644 src-bdbwrap/ydb-uniq.h create mode 100644 src-bdbwrap/ydb.c create mode 100644 src/Makefile create mode 100644 src/README create mode 100644 src/ydb.c create mode 100644 utils/ydb_dump.c diff --git a/Makefile b/Makefile new file mode 100644 index 00000000000..08f92f36323 --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +TAGS: */*.c */*.h + etags */*.c */*.h diff --git a/include/db.h b/include/db.h new file mode 100644 index 00000000000..6ee5645ed7a --- /dev/null +++ b/include/db.h @@ -0,0 +1,189 @@ +#ifndef _YOBI_DB_H +#define _YOBI_DB_H + +#include "ydb-constants.h" + +#if defined(__cplusplus) +extern "C" { +#if 0 +} +#endif +#endif + +#include +#include + +typedef enum { + DB_BTREE=1, + // DB_HASH=2, + // DB_RECNO=3, + // DB_QUEUE=4, + // DB_UNKNOWN=5 /* Figure it out on open. 
*/ +} DBTYPE; + +typedef enum { + DB_NOTICE_LOGFILE_CHANGED +} db_notices; + +enum { + DB_VERB_CHKPOINT = 0x0001, + DB_VERB_DEADLOCK = 0x0002, + DB_VERB_RECOVERY = 0x0004 + +}; + +typedef struct yobi_db DB; +typedef struct yobi_db_btree_stat DB_BTREE_STAT; +typedef struct yobi_db_env DB_ENV; +typedef struct yobi_db_key_range DB_KEY_RANGE; +typedef struct yobi_db_lsn DB_LSN; +typedef struct yobi_db_txn DB_TXN; +typedef struct yobi_db_txn_active DB_TXN_ACTIVE; +typedef struct yobi_db_txn_stat DB_TXN_STAT; +typedef struct yobi_dbc DBC; +typedef struct yobi_dbt DBT; + +struct yobi_db { + void *app_private; + int (*close) (DB *, u_int32_t); + int (*cursor) (DB *, DB_TXN *, DBC **, u_int32_t); + int (*del) (DB *, DB_TXN *, DBT *, u_int32_t); + int (*get) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); + int (*key_range) (DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t); + int (*open) (DB *, DB_TXN *, + const char *, const char *, DBTYPE, u_int32_t, int); + int (*put) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); + int (*remove) (DB *, const char *, const char *, u_int32_t); + int (*rename) (DB *, const char *, const char *, const char *, u_int32_t); + int (*set_bt_compare) (DB *, + int (*)(DB *, const DBT *, const DBT *)); + int (*set_flags) (DB *, u_int32_t); + int (*stat) (DB *, void *, u_int32_t); + + struct ydb_db_internal *i; +}; +enum { + DB_DBT_MALLOC = 0x002, + DB_DBT_REALLOC = 0x010, + DB_DBT_USERMEM = 0x020, + DB_DBT_DUPOK = 0x040 +}; +struct yobi_dbt { + void *app_private; + void *data; + u_int32_t flags; + u_int32_t size; + u_int32_t ulen; +}; +struct yobi_db_txn { + int (*commit) (DB_TXN*, u_int32_t); + u_int32_t (*id) (DB_TXN *); + // internal stuff + struct yobi_db_txn_internal *i; +}; +struct yobi_dbc { + int (*c_get) (DBC *, DBT *, DBT *, u_int32_t); + int (*c_close) (DBC *); + int (*c_del) (DBC *, u_int32_t); + struct yobi_dbc_internal *i; +}; +struct yobi_db_env { + // Methods used by MYSQL + void (*err) (const DB_ENV *, int, const char *, ...); + int (*open) 
(DB_ENV *, const char *, u_int32_t, int); + int (*close) (DB_ENV *, u_int32_t); + int (*txn_checkpoint) (DB_ENV *, u_int32_t, u_int32_t, u_int32_t); + int (*log_flush) (DB_ENV *, const DB_LSN *); + void (*set_errcall) (DB_ENV *, void (*)(const char *, char *)); + void (*set_errpfx) (DB_ENV *, const char *); + void (*set_noticecall) (DB_ENV *, void (*)(DB_ENV *, db_notices)); + int (*set_flags) (DB_ENV *, u_int32_t, int); + int (*set_data_dir) (DB_ENV *, const char *); + int (*set_tmp_dir) (DB_ENV *, const char *); + int (*set_verbose) (DB_ENV *, u_int32_t, int); + int (*set_lg_bsize) (DB_ENV *, u_int32_t); + int (*set_lg_dir) (DB_ENV *, const char *); + int (*set_lg_max) (DB_ENV *, u_int32_t); + int (*set_cachesize) (DB_ENV *, u_int32_t, u_int32_t, int); + int (*set_lk_detect) (DB_ENV *, u_int32_t); + int (*set_lk_max) (DB_ENV *, u_int32_t); + int (*log_archive) (DB_ENV *, char **[], u_int32_t); + int (*txn_stat) (DB_ENV *, DB_TXN_STAT **, u_int32_t); + int (*txn_begin) (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t); + // Internal state + struct db_env_ydb_internal *i; +}; +struct yobi_db_key_range { + double less,equal,grater; +}; +struct yobi_db_btree_stat { + u_int32_t bt_ndata; + u_int32_t bt_nkeys; +}; +struct yobi_db_txn_stat { + u_int32_t st_nactive; + DB_TXN_ACTIVE *st_txnarray; +}; +struct yobi_db_lsn { + int hello; +}; +struct yobi_db_txn_active { + DB_LSN lsn; + u_int32_t txnid; +}; + +#ifndef _YDB_WRAP_H +#define DB_VERSION_STRING "Yobiduck: Fractal DB (November 19, 2006)" +#else +#define DB_VERSION_STRING_ydb "Yobiduck: Fractal DB (November 19, 2006) (wrapped bdb)" +#endif + +enum { + DB_ARCH_ABS = 0x001, + DB_ARCH_LOG = 0x004 +}; + +enum { + DB_CREATE = 0x0000001, + DB_RDONLY = 0x0000010, + DB_RECOVER = 0x0000020, + DB_THREAD = 0x0000040, + DB_TXN_NOSYNC = 0x0000100, + + DB_PRIVATE = 0x0100000 +}; + +enum { + DB_LOCK_DEFAULT = 1, + DB_LOCK_OLDEST = 7, + DB_LOCK_RANDOM = 8 +}; + +enum { + DB_DUP = 0x000002 +}; + +enum { + DB_NOOVERWRITE = 23 +}; + +enum { 
+ DB_INIT_LOCK = 0x001000, + DB_INIT_LOG = 0x002000, + DB_INIT_MPOOL = 0x004000, + DB_INIT_TXN = 0x008000 +}; + +int db_create (DB **, DB_ENV *, u_int32_t); +int db_env_create (DB_ENV **, u_int32_t); + +int txn_begin (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t); +int txn_commit (DB_TXN *, u_int32_t); +int txn_abort (DB_TXN *); + +int log_compare (const DB_LSN *, const DB_LSN *); + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/include/ydb-constants.h b/include/ydb-constants.h new file mode 100644 index 00000000000..e9560c6cccb --- /dev/null +++ b/include/ydb-constants.h @@ -0,0 +1,27 @@ +#ifndef _YDB_CONSTANTS_H +#define _YDB_CONSTANTS_H + +enum { + DB_KEYEMPTY = -30998, + DB_KEYEXIST = -30997, + DB_LOCK_DEADLOCK = -30996, + DB_NOTFOUND = -30991, + + // Private + DB_BADFORMAT = -31000 +}; + +enum { + //DB_AFTER = 1, + DB_FIRST = 10, + DB_GET_BOTH = 11, + DB_LAST = 18, + DB_NEXT = 19, + DB_NEXT_DUP = 20, + DB_PREV = 27, + DB_SET = 30, + DB_SET_RANGE = 32, + DB_RMW = 0x40000000 +}; + +#endif diff --git a/newbrt/Makefile b/newbrt/Makefile new file mode 100644 index 00000000000..26dd52d98de --- /dev/null +++ b/newbrt/Makefile @@ -0,0 +1,50 @@ +# GCOV_FLAGS = -fprofile-arcs -ftest-coverage +#PROF_FLAGS = -pg +#OPTFLAGS = -O2 +CFLAGS = -Wall -W $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -Werror +LDFLAGS = $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) + +default: bins +BINS = pma-test brt-test cachetable-test brt-serialize-test randbrt randdb4 hashtest ybt-test +bins: $(BINS) +check: bins + ./ybt-test + ./mdict-test + ./pma-test + ./cachetable-test + ./brt-serialize-test + ./brt-test + ./hashtest + +# pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage +key.o: brttypes.h key.h +pma-test.o: pma-internal.h pma.h yerror.h memory.h ../include/ydb-constants.h +pma-test: pma.o memory.o key.o ybt.o +pma.o: pma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h ../include/ydb-constants.h +ybt.o: ybt.h brttypes.h +ybt-test: ybt-test.o ybt.o memory.o +cachetable.o: 
cachetable.h +brt-test: brt.o hashtable.o pma.o memory.o brt-serialize.o cachetable.o header-io.o ybt.o key.o +brt-test.o brt.o: brt.h cachetable.h brttypes.h +brt-serialize-test.o: pma.h yerror.h brt.h memory.h hashtable.h brttypes.h brt-internal.h +brt.o: brt.h mdict.h pma.h brttypes.h memory.h brt-internal.h cachetable.h +mdict.o: pma.h +hashtable.o: hashtable.h brttypes.h memory.h key.h yerror.h ../include/ydb-constants.h +memory.o: memory.h +hashtest: hashtable.o memory.o +brt-serialize.o: brt.h cachetable.h memory.h mdict.h pma.h brttypes.h brt-internal.h +header-io.o: brttypes.h brt-internal.h memory.h +mdict-test: hashtable.o pma.o memory.o + +brt-serialize-test: brt-serialize-test.o brt-serialize.o memory.o hashtable.o pma.o key.o ybt.o + +cachetable-test.o: cachetable.h memory.h +cachetable-test: cachetable.o memory.o cachetable-test.o + +clean: + rm -rf *.o hashtest brt-test cachetable-test randbrt randdb4 *.bb *.bbg *.da + +randdb4: LOADLIBES=-ldb +randbrt: brt.o hashtable.o cachetable.o memory.o brt-serialize.o +TAGS: ../*/*.c ../*/*.h + etags ../*/*.c ../*/*.h diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h new file mode 100644 index 00000000000..4bb2e80b902 --- /dev/null +++ b/newbrt/brt-internal.h @@ -0,0 +1,89 @@ +#include "cachetable.h" +#include "hashtable.h" +#include "pma.h" +#include "brt.h" +//#include "pma.h" + +typedef long long diskoff; /* Offset in a disk. -1 is the NULL pointer. */ + +enum { TREE_FANOUT = 16 }; //, NODESIZE=1<<20 }; +enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */ +struct nodeheader_in_file { + int n_in_buffer; + +}; +enum { BUFFER_HEADER_SIZE = (4 // height// + + 4 // n_children + + TREE_FANOUT * 8 // children + ) }; +typedef struct brtnode *BRTNODE; +/* Internal nodes. */ +struct brtnode { + enum typ_tag tag; + unsigned int nodesize; + diskoff thisnodename; + int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. 
*/ + union node { + struct nonleaf { + int n_children; /* if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced. */ + bytevec childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1]. + Note: It is possible that Child 1's keys are == to child 0's key's, so it is + not necessarily true that child 1's keys are > childkeys[0]. + However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */ + unsigned int childkeylens[TREE_FANOUT]; + unsigned int totalchildkeylens; + diskoff children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */ + HASHTABLE htables[TREE_FANOUT+1]; + unsigned int n_bytes_in_hashtable[TREE_FANOUT+1]; /* how many bytes are in each hashtable (including overheads) */ + unsigned int n_bytes_in_hashtables; + } n; + struct leaf { + PMA buffer; + unsigned int n_bytes_in_buffer; + } l; + } u; +}; + +struct brt_header { + int dirty; + unsigned int nodesize; + diskoff freelist; + diskoff unused_memory; + diskoff unnamed_root; + int n_named_roots; /* -1 if the only one is unnamed */ + char **names; + diskoff *roots; +}; + + +struct brt { + CACHEFILE cf; + char *database_name; + // The header is shared. It is also ephemeral. + struct brt_header *h; + + BRT_CURSOR cursors_head, cursors_tail; +}; + +/* serialization code */ +void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node); +int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesize); +unsigned int serialize_brtnode_size(BRTNODE node); /* How much space will it take? 
*/ +unsigned int brtnode_which_child (BRTNODE node, bytevec key, ITEMLEN keylen); +int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); + +void verify_counts(BRTNODE); + +int serialize_brt_header_to (int fd, struct brt_header *h); +int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth); + +static inline int brtnode_n_hashtables(BRTNODE node) { if (node->height==0) return 1; else return node->u.n.n_children; } + +//int write_brt_header (int fd, struct brt_header *header); + +#if 1 +#define DEADBEEF ((void*)0xDEADBEEF) +#else +#define DEADBEEF ((void*)0xDEADBEEFDEADBEEF) +#endif + diff --git a/newbrt/brt-serialize-test.c b/newbrt/brt-serialize-test.c new file mode 100644 index 00000000000..1fb58fba79f --- /dev/null +++ b/newbrt/brt-serialize-test.c @@ -0,0 +1,68 @@ +#include "brt.h" +#include "memory.h" +#include "brt-internal.h" + +#include +#include +#include + +void test_serialize(void) { +// struct brt source_brt; + struct brtnode sn,*dn; /* Source node, Dest node */ + int fd = open("brt-serialize-test.brt", O_RDWR|O_CREAT, 0777); + int r; + assert(fd>=0); + +// source_brt.fd=fd; + sn.nodesize = 1024; + sn.thisnodename = sn.nodesize*20; + sn.height = 1; + sn.u.n.n_children = 2; + sn.u.n.childkeys[0] = strdup("hello"); + sn.u.n.childkeylens[0] = 6; + sn.u.n.totalchildkeylens = 6; + sn.u.n.children[0] = sn.nodesize*30; + sn.u.n.children[1] = sn.nodesize*35; + r = hashtable_create(&sn.u.n.htables[0]); assert(r==0); + r = hashtable_create(&sn.u.n.htables[1]); assert(r==0); + r = hash_insert(sn.u.n.htables[0], "a", 2, "aval", 5); assert(r==0); + r = hash_insert(sn.u.n.htables[0], "b", 2, "bval", 5); assert(r==0); + r = hash_insert(sn.u.n.htables[1], "x", 2, "xval", 5); assert(r==0); + sn.u.n.n_bytes_in_hashtables = 3*(KEY_VALUE_OVERHEAD+2+5); + + serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn); + + deserialize_brtnode_from(fd, sn.nodesize*20, &dn, sn.nodesize); + assert(dn->thisnodename==sn.nodesize*20); + 
assert(dn->height == 1); + assert(dn->u.n.n_children==2); + assert(strcmp(dn->u.n.childkeys[0], "hello")==0); + assert(dn->u.n.childkeylens[0]==6); + assert(dn->u.n.totalchildkeylens==6); + assert(dn->u.n.children[0]==sn.nodesize*30); + assert(dn->u.n.children[1]==sn.nodesize*35); + { + bytevec data; ITEMLEN datalen; + int r = hash_find(dn->u.n.htables[0], "a", 2, &data, &datalen); + assert(r==0); + assert(strcmp(data,"aval")==0); + assert(datalen==5); + + r=hash_find(dn->u.n.htables[0], "b", 2, &data, &datalen); + assert(r==0); + assert(strcmp(data,"bval")==0); + assert(datalen==5); + + r=hash_find(dn->u.n.htables[1], "x", 2, &data, &datalen); + assert(r==0); + assert(strcmp(data,"xval")==0); + assert(datalen==5); + + } +} + +int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { + memory_check = 1; + test_serialize(); + return 0; +} diff --git a/newbrt/brt-serialize.c b/newbrt/brt-serialize.c new file mode 100644 index 00000000000..315d17d6b7e --- /dev/null +++ b/newbrt/brt-serialize.c @@ -0,0 +1,441 @@ +#define _XOPEN_SOURCE 500 + +#include "brt.h" +#include "memory.h" +//#include "pma.h" +#include "brt-internal.h" + +#include +#include +#include +#include +#include +#include + +struct cursor { + unsigned char *buf; + unsigned int size; + unsigned int ndone; +}; + +void wbuf_char (struct cursor *w, int ch) { + assert(w->ndonesize); + w->buf[w->ndone++]=ch; +} + +void wbuf_int (struct cursor *w, unsigned int i) { + wbuf_char(w, (i>>24)&0xff); + wbuf_char(w, (i>>16)&0xff); + wbuf_char(w, (i>>8)&0xff); + wbuf_char(w, (i>>0)&0xff); +} + +void wbuf_bytes (struct cursor *w, bytevec bytes_bv, int nbytes) { + const unsigned char *bytes=bytes_bv; + int i; + wbuf_int(w, nbytes); + for (i=0; i>32); + wbuf_int(w, off&0xFFFFFFFF); +} + +unsigned int rbuf_char (struct cursor *r) { + assert(r->ndonesize); + return r->buf[r->ndone++]; +} + +unsigned int rbuf_int (struct cursor *r) { + unsigned char c0 = rbuf_char(r); + unsigned char c1 = 
rbuf_char(r); + unsigned char c2 = rbuf_char(r); + unsigned char c3 = rbuf_char(r); + return ((c0<<24)| + (c1<<16)| + (c2<<8)| + (c3<<0)); +} + +/* Return a pointer into the middle of the buffer. */ +void rbuf_bytes (struct cursor *r, bytevec *bytes, unsigned int *n_bytes) +{ + *n_bytes = rbuf_int(r); + *bytes = &r->buf[r->ndone]; + r->ndone+=*n_bytes; + assert(r->ndone<=r->size); +} + +diskoff rbuf_diskoff (struct cursor *r) { + unsigned i0 = rbuf_int(r); + unsigned i1 = rbuf_int(r); + return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1)); +} + +static unsigned int serialize_brtnode_size_slow(BRTNODE node) { + unsigned int size=4+4; /* size+height */ + if (node->height>0) { + unsigned int hsize=0; + unsigned int csize=0; + int i; + size+=4; /* n_children */ + for (i=0; iu.n.n_children-1; i++) { + size+=4; + csize+=node->u.n.childkeylens[i]; + } + for (i=0; iu.n.n_children; i++) { + size+=8; + } + int n_hashtables = brtnode_n_hashtables(node); + size+=4; /* n_entries */ + for (i=0; i< n_hashtables; i++) { + HASHTABLE_ITERATE(node->u.n.htables[i], + key __attribute__((__unused__)), keylen, + data __attribute__((__unused__)), datalen, + (hsize+=8+keylen+datalen)); + } + assert(hsize==node->u.n.n_bytes_in_hashtables); + assert(csize==node->u.n.totalchildkeylens); + return size+hsize+csize; + } else { + unsigned int hsize=0; + PMA_ITERATE(node->u.l.buffer, + key __attribute__((__unused__)), keylen, + data __attribute__((__unused__)), datalen, + (hsize+=8+keylen+datalen)); + assert(hsize==node->u.l.n_bytes_in_buffer); + hsize+=4; /* add n entries in buffer table. */ + return size+hsize; + } + +} + +unsigned int serialize_brtnode_size (BRTNODE node) { + unsigned int result = 4+4; /* size+height */ + assert(sizeof(off_t)==8); + if (node->height>0) { + result+=4; /* n_children */ + result+=4*(node->u.n.n_children-1); /* key lengths */ + result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. 
*/ + result+=8*(node->u.n.n_children); /* child offsets. */ + result+=4; /* n_entries in hash table. */ + result+=node->u.n.n_bytes_in_hashtables; + } else { + result+=4; /* n_entries in buffer table. */ + result+=node->u.l.n_bytes_in_buffer; + if (memory_check) { + unsigned int slowresult = serialize_brtnode_size_slow(node); + if (result!=slowresult) printf("%s:%d result=%d slowresult=%d\n", __FILE__, __LINE__, result, slowresult); + assert(result==slowresult); + } + } + return result; +} + +void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) { + struct cursor w; + int i; + unsigned int calculated_size = serialize_brtnode_size(node); + assert(size>0); + w.buf=my_malloc(size); + w.size=size; + w.ndone=0; + //printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]); + wbuf_int(&w, calculated_size); + wbuf_int(&w, node->height); + //printf("%s:%d w.ndone=%d n_children=%d\n", __FILE__, __LINE__, w.ndone, node->n_children); + if (node->height>0) { + wbuf_int(&w, node->u.n.n_children); + //printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone); + for (i=0; iu.n.n_children-1; i++) { + wbuf_bytes(&w, node->u.n.childkeys[i], node->u.n.childkeylens[i]); + //printf("%s:%d w.ndone=%d (childkeylen[%d]=%d\n", __FILE__, __LINE__, w.ndone, i, node->childkeylens[i]); + } + for (i=0; iu.n.n_children; i++) { + wbuf_diskoff(&w, node->u.n.children[i]); + //printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone); + } + + { + int n_entries=0; + int n_hash_tables = brtnode_n_hashtables(node); + for (i=0; i< n_hash_tables; i++) { + //printf("%s:%d p%d=%p n_entries=%d\n", __FILE__, __LINE__, i, node->mdicts[i], mdict_n_entries(node->mdicts[i])); + n_entries += hashtable_n_entries(node->u.n.htables[i]); + } + //printf("%s:%d n_entries=%d\n", __FILE__, __LINE__, n_entries); + wbuf_int(&w, n_entries); + for (i=0; i< n_hash_tables; i++) { + HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, + 
(wbuf_bytes(&w, key, keylen), + wbuf_bytes(&w, data, datalen))); + } + } + } else { + wbuf_int(&w, pma_n_entries(node->u.l.buffer)); + PMA_ITERATE(node->u.l.buffer, key, keylen, data, datalen, + (wbuf_bytes(&w, key, keylen), + wbuf_bytes(&w, data, datalen))); + } + assert(w.ndone<=w.size); + { + ssize_t r=pwrite(fd, w.buf, w.ndone, off); + if (r<0) printf("r=%d errno=%d\n", r, errno); + assert((size_t)r==w.ndone); + } + + //printf("%s:%d w.done=%d r=%d\n", __FILE__, __LINE__, w.ndone, r); + assert(calculated_size==w.ndone); + + //printf("%s:%d wrote %d bytes for %lld size=%lld\n", __FILE__, __LINE__, w.ndone, off, size); + assert(w.ndone<=size); + my_free(w.buf); +} + +int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesize) { + TAGMALLOC(BRTNODE, result); + struct cursor rc; + int i; + uint32_t datasize; + int r; + if (errno!=0) { + r=errno; + if (0) { died0: my_free(result); } + return r; + } + { + uint32_t datasize_n; + int r = pread(fd, &datasize_n, sizeof(datasize_n), off); + //printf("%s:%d r=%d the datasize=%d\n", __FILE__, __LINE__, r, ntohl(datasize_n)); + if (r!=sizeof(datasize_n)) { + if (r==-1) r=errno; + else r = DB_BADFORMAT; + goto died0; + } + datasize = ntohl(datasize_n); + if (datasize<=0 || datasize>(1<<30)) { r = DB_BADFORMAT; goto died0; } + } + rc.buf=my_malloc(datasize); + if (errno!=0) { + if (0) { died1: my_free(rc.buf); } + r=errno; + goto died0; + } + rc.size=datasize; + assert(rc.size>0); + rc.ndone=0; + //printf("Deserializing %lld datasize=%d\n", off, datasize); + { + ssize_t r=pread(fd, rc.buf, datasize, off); + if ((size_t)r!=datasize) { r=errno; goto died1; } + //printf("Got %d %d %d %d\n", rc.buf[0], rc.buf[1], rc.buf[2], rc.buf[3]); + } + { + unsigned int stored_size = rbuf_int(&rc); + if (stored_size!=datasize) { r=DB_BADFORMAT; goto died1; } + } + result->nodesize = nodesize; // How to compute the nodesize? 
+ result->thisnodename = off; + result->height = rbuf_int(&rc); + //printf("height==%d\n", result->height); + if (result->height>0) { + result->u.n.totalchildkeylens=0; + for (i=0; iu.n.childkeys[i]=0; result->u.n.childkeylens[i]=0; } + for (i=0; iu.n.children[i]=0; result->u.n.htables[i]=0; result->u.n.n_bytes_in_hashtable[i]=0; } + result->u.n.n_children = rbuf_int(&rc); + //printf("n_children=%d\n", result->n_children); + assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT); + for (i=0; iu.n.n_children-1; i++) { + bytevec childkeyptr; + rbuf_bytes(&rc, &childkeyptr, &result->u.n.childkeylens[i]); /* Returns a pointer into the rbuf. */ + result->u.n.childkeys[i] = memdup(childkeyptr, result->u.n.childkeylens[i]); + //printf(" key %d length=%d data=%s\n", i, result->childkeylens[i], result->childkeys[i]); + result->u.n.totalchildkeylens+=result->u.n.childkeylens[i]; + } + for (i=0; iu.n.n_children; i++) { + result->u.n.children[i] = rbuf_diskoff(&rc); + //printf("Child %d at %lld\n", i, result->children[i]); + } + for (i=0; iu.n.n_bytes_in_hashtable[i] = 0; + } + result->u.n.n_bytes_in_hashtables = 0; + for (i=0; iu.n.htables[i]); + if (r!=0) { + int j; + if (0) { died_12: j=brtnode_n_hashtables(result); } + for (j=0; ju.n.htables[j]); + goto died1; + } + } + { + int n_in_hash = rbuf_int(&rc); + //printf("%d in hash\n", n_in_hash); + + for (i=0; iu.n.htables[childnum], key, keylen, val, vallen); /* Copies the data into the hash table. 
*/ + if (r!=0) { goto died_12; } + } + diff = keylen + vallen + KEY_VALUE_OVERHEAD; + result->u.n.n_bytes_in_hashtables += diff; + result->u.n.n_bytes_in_hashtable[childnum] += diff; + //printf("Inserted\n"); + } + } + } else { + int n_in_buf = rbuf_int(&rc); + result->u.l.n_bytes_in_buffer = 0; + int r=pma_create(&result->u.l.buffer); + if (r!=0) { + if (0) { died_21: pma_free(&result->u.l.buffer); } + goto died1; + } + //printf("%s:%d r PMA= %p\n", __FILE__, __LINE__, result->u.l.buffer); + for (i=0; iu.l.buffer, key, keylen, val, vallen); + if (r!=0) goto died_21; + } + result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD; + } + } + //printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children); + my_free(rc.buf); + *brtnode = result; + verify_counts(result); + return 0; +} + +unsigned int brtnode_which_child (BRTNODE node, bytevec key, ITEMLEN keylen) { + int i; + assert(node->height>0); + for (i=0; iu.n.n_children-1; i++) { + if (keycompare(key, keylen, node->u.n.childkeys[i], node->u.n.childkeylens[i])<=0) { + return i; + } + } + return node->u.n.n_children-1; +} + +void verify_counts (BRTNODE node) { + if (node->height==0) { + assert(node->u.l.buffer); + } else { + unsigned int sum = 0; + int i; + for (i=0; iu.n.n_children; i++) + sum += node->u.n.n_bytes_in_hashtable[i]; + for (; iu.n.n_bytes_in_hashtable[i]==0); + } + assert(sum==node->u.n.n_bytes_in_hashtables); + } +} + +int serialize_brt_header_to (int fd, struct brt_header *h) { + struct cursor w; + int i; + unsigned int size=0; /* I don't want to mess around calculating it exactly. */ + size += 4+4+8+8+4; /* this size, the tree's nodesize, freelist, unused_memory, nnamed_rootse. 
*/
+    /* Budget the root section.  This test must agree, branch for branch, with
+     * the write below and with deserialize_brtheader_from:
+     * n_named_roots<0 means a single unnamed root, >=0 means that many named roots. */
+    if (h->n_named_roots<0) {
+        size+=8; /* the unnamed root's diskoff */
+    } else {
+        for (i=0; i<h->n_named_roots; i++) {
+            size+=12 + 1 + strlen(h->names[i]); /* 8 (root diskoff) + 4 (length prefix) + name + its NUL */
+        }
+    }
+    w.buf = my_malloc(size);
+    w.size = size;
+    w.ndone = 0;
+    wbuf_int    (&w, size);
+    wbuf_int    (&w, h->nodesize);
+    wbuf_diskoff(&w, h->freelist);
+    wbuf_diskoff(&w, h->unused_memory);
+    wbuf_int    (&w, h->n_named_roots);
+    /* Was ">0": with zero named roots the unnamed root was written even though
+     * no space had been budgeted for it, and the reader (which tests ">=0")
+     * would not consume it. */
+    if (h->n_named_roots>=0) {
+        for (i=0; i<h->n_named_roots; i++) {
+            char *s = h->names[i];
+            unsigned int l = 1+strlen(s); /* the name is stored with its terminating NUL */
+            wbuf_diskoff(&w, h->roots[i]);
+            wbuf_bytes  (&w, s, l);
+            assert(l>0 && s[l-1]==0);
+        }
+    } else {
+        wbuf_diskoff(&w, h->unnamed_root);
+    }
+    assert(w.ndone==size); /* the budget above must match what was written exactly */
+    {
+        /* The header is always written at file offset 0. */
+        ssize_t r = pwrite(fd, w.buf, w.ndone, 0);
+        assert((size_t)r==w.ndone);
+    }
+    my_free(w.buf);
+    return 0;
+}
+
+/* Read the header that serialize_brt_header_to wrote at offset OFF of FD
+ * (OFF must be 0) and store a newly allocated header in *BRTH.
+ * Returns 0 on success.  Returns -1, after freeing the header, if the first
+ * pread returns 0 bytes.  Every other short read or format mismatch trips an assert. */
+int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) {
+    struct brt_header *MALLOC(h);
+    struct cursor rc;
+    int size;
+    int sizeagain;
+    assert(off==0);
+    {
+        /* The first four bytes hold the total header size in network byte order. */
+        uint32_t size_n;
+        ssize_t r = pread(fd, &size_n, sizeof(size_n), off);
+        if (r==0) { my_free(h); return -1; }
+        assert(r==sizeof(size_n));
+        size = ntohl(size_n);
+    }
+    rc.buf = my_malloc(size);
+    rc.size=size;
+    assert(rc.size>0);
+    rc.ndone=0;
+    {
+        /* Re-read from OFF so the buffer includes the size word itself. */
+        ssize_t r = pread(fd, rc.buf, size, off);
+        assert(r==size);
+    }
+    h->dirty=0;
+    sizeagain = rbuf_int(&rc);
+    assert(sizeagain==size);
+    h->nodesize      = rbuf_int(&rc);
+    h->freelist      = rbuf_diskoff(&rc);
+    h->unused_memory = rbuf_diskoff(&rc);
+    h->n_named_roots = rbuf_int(&rc);
+    if (h->n_named_roots>=0) {
+        int i;
+        MALLOC_N(h->n_named_roots, h->roots);
+        MALLOC_N(h->n_named_roots, h->names);
+        for (i=0; i<h->n_named_roots; i++) {
+            bytevec nameptr;
+            unsigned int len;
+            h->roots[i] = rbuf_diskoff(&rc);
+            rbuf_bytes(&rc, &nameptr, &len); /* nameptr points into rc.buf, so copy it before rc.buf is freed */
+            assert(strlen(nameptr)+1==len);
+            h->names[i] = memdup(nameptr,len);
+        }
+        h->unnamed_root = -1;
+    } else {
+        h->roots = 0;
+        h->names = 0;
+        h->unnamed_root = rbuf_diskoff(&rc);
+    }
+    assert(rc.ndone==rc.size); /* every byte of the header must have been consumed */
+    my_free(rc.buf);
+    *brth = h;
+    return 0;
+}
diff --git a/newbrt/brt-test.c b/newbrt/brt-test.c new file mode 100644 index 00000000000..b2408d26f8d --- /dev/null +++ b/newbrt/brt-test.c @@ -0,0 +1,695 @@ +#include "brt.h" + +#include "memory.h" +#include +#include +#include +#include +#include +#include +#include + +extern long long n_items_malloced; + +static void test0 (void) { + BRT t; + int r; + CACHETABLE ct; + char fname[]="testbrt.brt"; + printf("%s:%d test0\n", __FILE__, __LINE__); + memory_check=1; + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); + assert(r==0); + printf("%s:%d test0\n", __FILE__, __LINE__); + unlink(fname); + r = open_brt(fname, 0, 1, &t, 1024, ct); + assert(r==0); + printf("%s:%d test0\n", __FILE__, __LINE__); + printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced); + r = close_brt(t); assert(r==0); + printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced); assert(r==0); + r = cachetable_close(ct); + assert(r==0); + memory_check_all_free(); +} + +static void test1 (void) { + BRT t; + int r; + CACHETABLE ct; + char fname[]="testbrt.brt"; + memory_check=1; + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); + assert(r==0); + unlink(fname); + r = open_brt(fname, 0, 1, &t, 1024, ct); + assert(r==0); + brt_insert(t, "hello", 6, "there", 6); + { + bytevec val; ITEMLEN vallen; + r = brt_lookup(t, "hello", 6, &val, &vallen); + assert(r==0); + assert(strcmp(val, "there")==0); + assert(vallen==6); + } + r = close_brt(t); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); + printf("test1 ok\n"); +} + +static void test2 (int memcheck) { + BRT t; + int r; + int i; + CACHETABLE ct; + char fname[]="testbrt.brt"; + memory_check=memcheck; + printf("%s:%d checking\n", __FILE__, __LINE__); + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); assert(r==0); + unlink(fname); + r = open_brt(fname, 0, 1, &t, 1024, ct); + printf("%s:%d did setup\n", __FILE__, __LINE__); + assert(r==0); + for 
(i=0; i<2048; i++) { + char key[100],val[100]; + snprintf(key,100,"hello%d",i); + snprintf(val,100,"there%d",i); + brt_insert(t, key, 1+strlen(key), val, 1+strlen(val)); + //printf("%s:%d did insert %d\n", __FILE__, __LINE__, i); + if (0) { + brt_flush(t); + { + int n = get_n_items_malloced(); + printf("%s:%d i=%d n_items_malloced=%d\n", __FILE__, __LINE__, i, n); + if (n!=3) print_malloced_items(); + assert(n==3); + } + } + } + printf("%s:%d inserted\n", __FILE__, __LINE__); + r = close_brt(t); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); + printf("test2 ok\n"); +} + +static void test3 (int nodesize, int count, int memcheck) { + BRT t; + int r; + struct timeval t0,t1; + int i; + CACHETABLE ct; + char fname[]="testbrt.brt"; + memory_check=memcheck; + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); assert(r==0); + gettimeofday(&t0, 0); + unlink(fname); + r = open_brt(fname, 0, 1, &t, nodesize, ct); + assert(r==0); + for (i=0; i=0) { + char key[100], valexpected[100]; + bytevec val; + ITEMLEN vallen; + if (i%1000==0) printf("r"); fflush(stdout); + snprintf(key, 100, "key%d", rk); + snprintf(valexpected, 100, "val%d", values[rk]); + r = brt_lookup(t, key, 1+strlen(key), &val, &vallen); + assert(r==0); + assert(vallen==(1+strlen(valexpected))); + assert(memcmp(val,valexpected,vallen)==0); + } + } + printf("\n"); + my_free(values); + r = close_brt(t); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); +} + +static void test_dump_empty_db (void) { + BRT t; + CACHETABLE ct; + int r; + char fname[]="testbrt.brt"; + memory_check=1; + r = brt_create_cachetable(&ct, 0); + assert(r==0); + unlink(fname); + r = open_brt(fname, 0, 1, &t, 1024, ct); + assert(r==0); + dump_brt(t); + r = close_brt(t); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); +} + +/* Test running multiple trees in different files */ +static void test_multiple_files_of_size (int size) { + 
const char *n0 = "test0.brt"; + const char *n1 = "test1.brt"; + CACHETABLE ct; + BRT t0,t1; + int r,i; + printf("test_multiple_files_of_size(%d)\n", size); + unlink(n0); + unlink(n1); + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); assert(r==0); + r = open_brt(n0, 0, 1, &t0, size, ct); assert(r==0); + r = open_brt(n1, 0, 1, &t1, size, ct); assert(r==0); + for (i=0; i<10000; i++) { + char key[100],val[100]; + snprintf(key, 100, "key%d", i); + snprintf(val, 100, "val%d", i); + brt_insert(t0, key, 1+strlen(key), val, 1+strlen(val)); + snprintf(val, 100, "Val%d", i); + brt_insert(t1, key, 1+strlen(key), val, 1+strlen(val)); + } + //verify_brt(t0); + //dump_brt(t0); + //dump_brt(t1); + verify_brt(t0); + verify_brt(t1); + + r = close_brt(t0); assert(r==0); + r = close_brt(t1); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); + + /* Now see if the data is all there. */ + r = brt_create_cachetable(&ct, 0); assert(r==0); + r = open_brt(n0, 0, 0, &t0, 1<<12, ct); + printf("%s:%d r=%d\n", __FILE__, __LINE__,r); + assert(r==0); + r = open_brt(n1, 0, 0, &t1, 1<<12, ct); assert(r==0); + + for (i=0; i<10000; i++) { + char key[100],val[100]; + bytevec actualval; + ITEMLEN actuallen; + snprintf(key, 100, "key%d", i); + snprintf(val, 100, "val%d", i); + r=brt_lookup(t0, key, 1+strlen(key), &actualval, &actuallen); + assert(r==0); + assert(strcmp(val,actualval)==0); + assert(actuallen==1+strlen(val)); + snprintf(val, 100, "Val%d", i); + r=brt_lookup(t1, key, 1+strlen(key), &actualval, &actuallen); + assert(r==0); + assert(strcmp(val,actualval)==0); + assert(actuallen==1+strlen(val)); + } + + r = close_brt(t0); assert(r==0); + r = close_brt(t1); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); +} + +static void test_multiple_files (void) { + test_multiple_files_of_size (1<<12); + test_multiple_files_of_size (1<<20); +} + +static void test_named_db (void) { + const char *n0 = "test0.brt"; + const char 
*n1 = "test1.brt"; + CACHETABLE ct; + BRT t0; + int r; + printf("test_named_db\n"); + unlink(n0); + unlink(n1); + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); assert(r==0); + r = open_brt(n0, "db1", 1, &t0, 1<<12, ct); assert(r==0); + + brt_insert(t0, "good", 5, "day", 4); assert(r==0); + + r = close_brt(t0); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); + + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); assert(r==0); + r = open_brt(n0, "db1", 0, &t0, 1<<12, ct); assert(r==0); + + { + bytevec val; + ITEMLEN vallen; + r = brt_lookup(t0, "good", 5, &val, &vallen); + assert(r==0); + assert(vallen==4); + assert(strcmp(val,"day")==0); + } + + r = close_brt(t0); assert(r==0); + r = cachetable_close(ct); assert(r==0); + memory_check_all_free(); +} + +static void test_multiple_dbs (void) { + const char *n0 = "test0.brt"; + const char *n1 = "test1.brt"; + CACHETABLE ct; + BRT t0,t1; + int r; + printf("test_multiple_dbs: "); + unlink(n0); + unlink(n1); + memory_check_all_free(); + r = brt_create_cachetable(&ct, 0); assert(r==0); + r = open_brt(n0, "db1", 1, &t0, 1<<12, ct); assert(r==0); + r = open_brt(n1, "db2", 1, &t1, 1<<12, ct); assert(r==0); + + brt_insert(t0, "good", 5, "grief", 6); assert(r==0); + brt_insert(t1, "bad", 4, "night", 6); assert(r==0); + + r = close_brt(t0); assert(r==0); + r = close_brt(t1); assert(r==0); + r = cachetable_close(ct); assert(r==0); + + memory_check_all_free(); + + r = brt_create_cachetable(&ct, 0); assert(r==0); + r = open_brt(n0, "db1", 0, &t0, 1<<12, ct); assert(r==0); + r = open_brt(n1, "db2", 0, &t1, 1<<12, ct); assert(r==0); + + { + bytevec val; + ITEMLEN vallen; + r = brt_lookup(t0, "good", 5, &val, &vallen); + assert(r==0); + assert(vallen==6); + assert(strcmp(val,"grief")==0); + + r = brt_lookup(t1, "good", 5, &val, &vallen); + assert(r!=0); + + r = brt_lookup(t0, "bad", 4, &val, &vallen); + assert(r!=0); + + r = brt_lookup(t1, "bad", 4, &val, &vallen); + 
assert(r==0); + assert(vallen==6); + assert(strcmp(val,"night")==0); + } + + r = close_brt(t0); assert(r==0); + r = close_brt(t1); assert(r==0); + r = cachetable_close(ct); assert(r==0); + + memory_check_all_free(); + printf("ok\n"); +} + +/* Test to see a single file can contain many databases. */ +static void test_multiple_dbs_many (void) { + enum { MANYN = 16 }; + int i, r; + const char *name = "test.brt"; + CACHETABLE ct; + BRT trees[MANYN]; + printf("test_multiple_dbs_many:\n"); + memory_check_all_free(); + unlink(name); + r = brt_create_cachetable(&ct, MANYN+4); assert(r==0); + for (i=0; i +#include +#include +#include +#include +#include + +extern long long n_items_malloced; + +/* Frees a node, including all the stuff in the hash table. */ +void brtnode_free (BRTNODE node) { + int i; + //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, node, node->mdicts[0]); + if (node->height>0) { + for (i=0; iu.n.n_children-1; i++) { + my_free((void*)node->u.n.childkeys[i]); + } + for (i=0; iu.n.n_children; i++) { + if (node->u.n.htables[i]) { + hashtable_free(&node->u.n.htables[i]); + } + } + } else { + if (node->u.l.buffer) // The buffer may have been freed already, in some cases. 
+ pma_free(&node->u.l.buffer); + } + my_free(node); +} + +void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnode_v, int write_me, int keep_me) { + BRTNODE brtnode = brtnode_v; + if (0) { + printf("%s:%d brtnode_flush_callback %p keep_me=%d height=%d", __FILE__, __LINE__, brtnode, keep_me, brtnode->height); + if (brtnode->height==0) printf(" pma=%p", brtnode->u.l.buffer); + printf("\n"); + } + assert(brtnode->thisnodename==nodename); + //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, brtnode, brtnode->mdicts[0]); + if (write_me) { + serialize_brtnode_to(cachefile_fd(cachefile), brtnode->thisnodename, brtnode->nodesize, brtnode); + } + //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, brtnode, brtnode->mdicts[0]); + if (!keep_me) { + brtnode_free(brtnode); + } + //printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced); +} + +int brtnode_fetch_callback (CACHEFILE cachefile, diskoff nodename, void **brtnode_pv,void*extraargs) { + long nodesize=(long)extraargs; + BRTNODE *result=(BRTNODE*)brtnode_pv; + return deserialize_brtnode_from(cachefile_fd(cachefile), nodename, result, nodesize); +} + +void brtheader_flush_callback (CACHEFILE cachefile, diskoff nodename, void *header_v, int write_me, int keep_me) { + struct brt_header *h = header_v; + assert(nodename==0); + assert(!h->dirty); // shouldn't be dirty once it is unpinned. 
+ if (write_me) { + serialize_brt_header_to(cachefile_fd(cachefile), h); + } + if (!keep_me) { + if (h->n_named_roots>0) { + int i; + for (i=0; in_named_roots; i++) { + my_free(h->names[i]); + } + my_free(h->names); + my_free(h->roots); + } + my_free(h); + } +} + +int brtheader_fetch_callback (CACHEFILE cachefile, diskoff nodename, void **headerp_v, void*extraargs __attribute__((__unused__))) { + struct brt_header **h = (struct brt_header **)headerp_v; + assert(nodename==0); + return deserialize_brtheader_from(cachefile_fd(cachefile), nodename, h); +} + +int read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) { + void *header_p; + int r = cachetable_get_and_pin(cf, 0, &header_p, + brtheader_flush_callback, brtheader_fetch_callback, 0); + if (r!=0) return r; + *header = header_p; + return 0; +} + +int unpin_brt_header (BRT brt) { + int r = cachetable_unpin(brt->cf, 0, brt->h->dirty); + brt->h->dirty=0; + brt->h=0; + return r; +} + + +typedef struct kvpair { + bytevec key; + unsigned int keylen; + bytevec val; + unsigned int vallen; +} *KVPAIR; + +int kvpair_compare (const void *av, const void *bv) { + const KVPAIR a = (const KVPAIR)av; + const KVPAIR b = (const KVPAIR)bv; + int r = keycompare(a->key, a->keylen, b->key, b->keylen); + //printf("keycompare(%s,\n %s)-->%d\n", a->key, b->key, r); + return r; +} + +#if 0 +/* in a leaf, they are already sorted because they are in a PMA */ +static void brtleaf_make_sorted_kvpairs (BRTNODE node, KVPAIR *pairs, int *n_pairs) { + int n_entries = mdict_n_entries(node->mdicts[0]); + KVPAIR result=my_calloc(n_entries, sizeof(*result)); + int resultcounter=0; + assert(node->n_children==0 && node->height==0); + MDICT_ITERATE(node->mdicts[0], key, keylen, data, datalen, ({ + result[resultcounter].key = key; + result[resultcounter].keylen = keylen; + result[resultcounter].val = data; + result[resultcounter].vallen = datalen; + resultcounter++; + })); + assert(resultcounter==n_entries); + qsort(result, resultcounter, 
sizeof(*result), kvpair_compare); + *pairs = result; + *n_pairs = resultcounter; +// { +// innt i; +// printf("Sorted pairs (sizeof *result=%d):\n", sizeof(*result)); +// for (i=0; ih->unused_memory; + brt->h->unused_memory+=size; + return result; +} + +diskoff malloc_diskblock (BRT brt, int size) { +#if 0 + int r = read_and_pin_brt_header(brt->fd, &brt->h); + assert(r==0); + { + diskoff result = malloc_diskblock_header_is_in_memory(brt, size); + r = write_brt_header(brt->fd, &brt->h); + assert(r==0); + return result; + } +#else + return malloc_diskblock_header_is_in_memory(brt,size); +#endif +} + +static void initialize_brtnode (BRT t, BRTNODE n, diskoff nodename, int height) { + int i; + n->tag = TYP_BRTNODE; + n->nodesize = t->h->nodesize; + n->thisnodename = nodename; + n->height = height; + assert(height>=0); + if (height>0) { + n->u.n.n_children = 0; + for (i=0; iu.n.childkeys[i] = 0; + n->u.n.childkeylens[i] = 0; + } + n->u.n.totalchildkeylens = 0; + for (i=0; iu.n.children[i] = 0; + n->u.n.htables[i] = 0; + n->u.n.n_bytes_in_hashtable[i] = 0; + } + n->u.n.n_bytes_in_hashtables = 0; + } else { + int r = pma_create(&n->u.l.buffer); + static int rcount=0; + assert(r==0); + //printf("%s:%d n PMA= %p (rcount=%d)\n", __FILE__, __LINE__, n->u.l.buffer, rcount); + rcount++; + n->u.l.n_bytes_in_buffer = 0; + } +} + +static void create_new_brtnode (BRT t, BRTNODE *result, int height) { + TAGMALLOC(BRTNODE, n); + int r; + diskoff name = malloc_diskblock(t, t->h->nodesize); + assert(n); + assert(t->h->nodesize>0); + //printf("%s:%d malloced %lld (and malloc again=%lld)\n", __FILE__, __LINE__, name, malloc_diskblock(t, t->nodesize)); + initialize_brtnode(t, n, name, height); + *result = n; + assert(n->nodesize>0); + r=cachetable_put(t->cf, n->thisnodename, n, + brtnode_flush_callback, brtnode_fetch_callback, (void*)t->h->nodesize); + assert(r==0); +} + +void delete_node (BRT t, BRTNODE node) { + int i; + assert(node->height>=0); + if (node->height==0) { + if 
(node->u.l.buffer) { + pma_free(&node->u.l.buffer); + } + node->u.l.n_bytes_in_buffer=0; + } else { + /* Nonleaf: release every child's hashtable and clear that child's byte count. */ for (i=0; i<node->u.n.n_children; i++) { + if (node->u.n.htables[i]) { + hashtable_free(&node->u.n.htables[i]); + } + node->u.n.n_bytes_in_hashtable[i]=0; /* per-child slot i (was always slot 0) */ + } + node->u.n.n_bytes_in_hashtables = 0; + node->u.n.totalchildkeylens=0; + node->u.n.n_children=0; + node->height=0; + node->u.l.buffer=0; /* It's a leaf now (height==0) so set the buffer to NULL. */ + } + cachetable_remove(t->cf, node->thisnodename, 0); /* Don't write it back to disk. */ +} + + +static void insert_to_buffer_in_leaf (BRTNODE node, bytevec key, unsigned int keylen, bytevec val, unsigned int vallen) { + unsigned int n_bytes_added = KEY_VALUE_OVERHEAD + keylen + vallen; + int r = pma_insert(node->u.l.buffer, key, keylen, val, vallen); + assert(r==0); + node->u.l.n_bytes_in_buffer += n_bytes_added; +} + +static int insert_to_hash_in_nonleaf (BRTNODE node, int childnum, bytevec key, unsigned int keylen, bytevec val, unsigned int vallen) { + unsigned int n_bytes_added = KEY_VALUE_OVERHEAD + keylen + vallen; + int r = hash_insert(node->u.n.htables[childnum], key, keylen, val, vallen); + if (r!=0) return r; + node->u.n.n_bytes_in_hashtable[childnum] += n_bytes_added; + node->u.n.n_bytes_in_hashtables += n_bytes_added; + return 0; +} + + +int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen) { + int did_split=0; + BRTNODE A,B; + assert(node->height==0); + assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. 
*/ + create_new_brtnode(t, &A, 0); + create_new_brtnode(t, &B, 0); + //printf("%s:%d A PMA= %p\n", __FILE__, __LINE__, A->u.l.buffer); + //printf("%s:%d B PMA= %p\n", __FILE__, __LINE__, A->u.l.buffer); + assert(A->nodesize>0); + assert(B->nodesize>0); + assert(node->nodesize>0); + //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename); + //printf("%s:%d B is at %lld nodesize=%d\n", __FILE__, __LINE__, B->thisnodename, B->nodesize); + assert(node->height>0 || node->u.l.buffer!=0); + PMA_ITERATE(node->u.l.buffer, key, keylen, val, vallen, + ({ + if (!did_split) { + insert_to_buffer_in_leaf(A, key, keylen, val, vallen); + if (A->u.l.n_bytes_in_buffer *2 >= node->u.l.n_bytes_in_buffer) { + *splitkey = memdup(key, keylen); + *splitkeylen = keylen; + did_split=1; + } + } else { + insert_to_buffer_in_leaf(B, key, keylen, val, vallen); + } + })); + assert(node->height>0 || node->u.l.buffer!=0); + /* Remove it from the cache table, and free its storage. */ + //printf("%s:%d old pma = %p\n", __FILE__, __LINE__, node->u.l.buffer); + delete_node(t, node); + + assert(did_split==1); + *nodea = A; + *nodeb = B; + assert(serialize_brtnode_size(A)nodesize); + assert(serialize_brtnode_size(B)nodesize); + return 0; +} + +void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen) { + int n_children_in_a = node->u.n.n_children/2; + BRTNODE A,B; + assert(node->height>0); + assert(node->u.n.n_children>=2); // Otherwise, how do we split? We need at least two children to split. */ + assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */ + create_new_brtnode(t, &A, node->height); + create_new_brtnode(t, &B, node->height); + A->u.n.n_children=n_children_in_a; + B->u.n.n_children=node->u.n.n_children-n_children_in_a; + //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename); + { + /* The first n_children_in_a go into node a. 
+ * That means that the first n_children_in_a-1 keys go into node a. + * The splitter key is key number n_children_in_a */ + int i; + for (i=0; iu.n.children[i] = node->u.n.children[i]; + A->u.n.htables[i] = node->u.n.htables[i]; + A->u.n.n_bytes_in_hashtables += (A->u.n.n_bytes_in_hashtable[i] = node->u.n.n_bytes_in_hashtable[i]); + + node->u.n.htables[i] = 0; + node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i]; + node->u.n.n_bytes_in_hashtable[i] = 0; + } + for (i=n_children_in_a; iu.n.n_children; i++) { + int targchild = i-n_children_in_a; + B->u.n.children[targchild] = node->u.n.children[i]; + B->u.n.htables[targchild] = node->u.n.htables[i]; + B->u.n.n_bytes_in_hashtables += (B->u.n.n_bytes_in_hashtable[targchild] = node->u.n.n_bytes_in_hashtable[i]); + + node->u.n.htables[i] = 0; + node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i]; + node->u.n.n_bytes_in_hashtable[i] = 0; + } + for (i=0; iu.n.childkeys[i] = node->u.n.childkeys[i]; + A->u.n.childkeylens[i] = node->u.n.childkeylens[i]; + A->u.n.totalchildkeylens += node->u.n.childkeylens[i]; + node->u.n.totalchildkeylens -= node->u.n.childkeylens[i]; + node->u.n.childkeys[i] = 0; + node->u.n.childkeylens[i] = 0; + } + *splitkey = node->u.n.childkeys[n_children_in_a-1]; + *splitkeylen = node->u.n.childkeylens[n_children_in_a-1]; + node->u.n.totalchildkeylens -= node->u.n.childkeylens[n_children_in_a-1]; + node->u.n.childkeys[n_children_in_a-1]=0; + node->u.n.childkeylens[n_children_in_a-1]=0; + for (i=n_children_in_a; iu.n.n_children-1; i++) { + B->u.n.childkeys[i-n_children_in_a] = node->u.n.childkeys[i]; + B->u.n.childkeylens[i-n_children_in_a] = node->u.n.childkeylens[i]; + B->u.n.totalchildkeylens += node->u.n.childkeylens[i]; + node->u.n.totalchildkeylens -= node->u.n.childkeylens[i]; + node->u.n.childkeys[i] = 0; + node->u.n.childkeylens[i] = 0; + } + assert(node->u.n.totalchildkeylens==0); + } + + { + int i; + for (i=0; iu.n.htables[i]==0); + 
assert(node->u.n.n_bytes_in_hashtable[i]==0); + } + assert(node->u.n.n_bytes_in_hashtables==0); + } + /* The buffer is all divied up between them, since just moved the hashtables over. */ + + *nodea = A; + *nodeb = B; + + /* Remove it from the cache table, and free its storage. */ + //printf("%s:%d removing %lld\n", __FILE__, __LINE__, node->thisnodename); + delete_node(t, node); + assert(serialize_brtnode_size(A)nodesize); + assert(serialize_brtnode_size(B)nodesize); +} + +void find_heaviest_child (BRTNODE node, int *childnum) { + int max_child = 0; + int max_weight = node->u.n.n_bytes_in_hashtable[0]; + int i; + + assert(node->u.n.n_children>0); + for (i=1; iu.n.n_children; i++) { + int this_weight = node->u.n.n_bytes_in_hashtable[i]; + if (max_weight < this_weight) { + max_child = i; + max_weight = this_weight; + } + } + *childnum = max_child; +} + +#if 0 +void find_heaviest_data (BRTNODE node, int *childnum_ret, KVPAIR *pairs_ret, int *n_pairs_ret) { + int child_weights[node->n_children]; + int child_counts[node->n_children]; + int i; + for (i=0; in_children; i++) child_weights[i] = child_counts[i] = 0; + + HASHTABLE_ITERATE(node->hashtable, key, keylen, data __attribute__((__unused__)), datalen, + ({ + int cnum; + for (cnum=0; cnumn_children-1; cnum++) { + if (keycompare(key, keylen, node->childkeys[cnum], node->childkeylens[cnum])<=0) + break; + } + child_weights[cnum] += keylen + datalen + KEY_VALUE_OVERHEAD; + child_counts[cnum]++; + })); + { + int maxchild=0, maxchildweight=child_weights[0]; + for (i=1; in_children; i++) { + if (maxchildweighthashtable, key, keylen, data, datalen, ({ + int cnum; + for (cnum=0; cnumn_children-1; cnum++) { + if (keycompare(key, keylen, node->childkeys[cnum], node->childkeylens[cnum])<=0) + break; + } + if (cnum==maxchild) { + pairs[pairs_count].key = key; + pairs[pairs_count].keylen = keylen; + pairs[pairs_count].val = data; + pairs[pairs_count].vallen = datalen; + pairs_count++; + } + })); + } + /* Now we have the pairs. 
*/ + *childnum_ret = maxchild; + *pairs_ret = pairs; + *n_pairs_ret = maxchildcount; + } + } +} +#endif + +static int brtnode_insert (BRT t, BRTNODE node, bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen, + int debug); + +/* key is not in the hashtable in node. Either put the key-value pair in the child, or put it in the node. */ +static int push_kvpair_down_only_if_it_wont_push_more_else_put_here (BRT t, BRTNODE node, BRTNODE child, + bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, + int childnum_of_node) { + assert(node->height>0); /* Not a leaf. */ + int to_child=serialize_brtnode_size(child)+keylen+vallen+KEY_VALUE_OVERHEAD <= child->nodesize; + if (brt_debug_mode) { + printf("%s:%d pushing %s to %s %d", __FILE__, __LINE__, (char*)key, to_child? "child" : "hash", childnum_of_node); + if (childnum_of_node+1u.n.n_children) { + printf(" nextsplitkey=%s\n", (char*)node->u.n.childkeys[childnum_of_node]); + assert(keycompare(key, keylen, node->u.n.childkeys[childnum_of_node], node->u.n.childkeylens[childnum_of_node])<=0); + } else { + printf("\n"); + } + } + if (to_child) { + int again_split=-1; BRTNODE againa,againb; bytevec againkey; ITEMLEN againlen; + //printf("%s:%d hello!\n", __FILE__, __LINE__); + int r = brtnode_insert(t, child, key, keylen, val, vallen, + &again_split, &againa, &againb, &againkey, &againlen, + 0); + if (r!=0) return r; + assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. 
*/ + return r; + } else { + int r=insert_to_hash_in_nonleaf(node, childnum_of_node, key, keylen, val, vallen); + return r; + } +} + +static int push_a_kvpair_down (BRT t, BRTNODE node, BRTNODE child, int childnum, + bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, + int *child_did_split, BRTNODE *childa, BRTNODE *childb, bytevec*childsplitkey, ITEMLEN *childsplitkeylen) { + //if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, ""); + //printf("%s:%d hello!\n", __FILE__, __LINE__); + assert(node->height>0); + + { + int r = brtnode_insert(t, child, key, keylen, val, vallen, + child_did_split, childa, childb, childsplitkey, childsplitkeylen, + 0); + if (r!=0) return r; + } + //if (debug) printf("%s:%d %*sinserted down child_did_split=%d\n", __FILE__, __LINE__, debug, "", child_did_split); + { + int r = hash_delete(node->u.n.htables[childnum], key, keylen); // Must delete after doing the insert, to avoid operating on a freed key + if (r!=0) return r; + } + { + int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD); + node->u.n.n_bytes_in_hashtables -= n_bytes_removed; + node->u.n.n_bytes_in_hashtable[childnum] -= n_bytes_removed; + } + return 0; +} + +int split_count=0; + +/* NODE is a node with a child. + * childnum was split into two nodes childa, and childb. + * We must slide things around, & move things from the old table to the new tables. + * We also move things to the new children as much as we can without doing any pushdowns or splitting of the child. + * We must delete the old hashtable (but the old child is already deleted.) + * We also unpin the new children. 
+ */ +static int handle_split_of_child (BRT t, BRTNODE node, int childnum, + BRTNODE childa, BRTNODE childb, bytevec childsplitkey, ITEMLEN childsplitkeylen, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen) { + assert(node->height>0); + HASHTABLE old_h = node->u.n.htables[childnum]; + int old_count = node->u.n.n_bytes_in_hashtable[childnum]; + int cnum; + int r; + assert(node->u.n.n_children<=TREE_FANOUT); + + if (brt_debug_mode) { + int i; + printf("%s:%d Child %d did split on %s\n", __FILE__, __LINE__, childnum, (char*)childsplitkey); + printf("%s:%d oldsplitkeys:", __FILE__, __LINE__); + for(i=0; iu.n.n_children-1; i++) printf(" %s", (char*)node->u.n.childkeys[i]); + printf("\n"); + } + + // Slide the children over. + for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) { + node->u.n.children[cnum] = node->u.n.children[cnum-1]; + node->u.n.htables[cnum] = node->u.n.htables[cnum-1]; + node->u.n.n_bytes_in_hashtable[cnum] = node->u.n.n_bytes_in_hashtable[cnum-1]; + } + node->u.n.children[childnum] = childa->thisnodename; + node->u.n.children[childnum+1] = childb->thisnodename; + hashtable_create(&node->u.n.htables[childnum]); + hashtable_create(&node->u.n.htables[childnum+1]); + node->u.n.n_bytes_in_hashtable[childnum] = 0; + node->u.n.n_bytes_in_hashtable[childnum+1] = 0; + // Slide the keys over + for (cnum=node->u.n.n_children-1; cnum>childnum; cnum--) { + node->u.n.childkeys[cnum] = node->u.n.childkeys[cnum-1]; + node->u.n.childkeylens[cnum] = node->u.n.childkeylens[cnum-1]; + } + node->u.n.childkeys[childnum]=childsplitkey; + node->u.n.childkeylens[childnum]= childsplitkeylen; + node->u.n.totalchildkeylens += childsplitkeylen; + node->u.n.n_children++; + + if (brt_debug_mode) { + int i; + printf("%s:%d splitkeys:", __FILE__, __LINE__); + for(i=0; iu.n.n_children-1; i++) printf(" %s", (char*)node->u.n.childkeys[i]); + printf("\n"); + } + + node->u.n.n_bytes_in_hashtables -= old_count; /* By default, they are all 
removed. We might add them back in. */ + /* Keep pushing to the children, but not if the children would require a pushdown */ + HASHTABLE_ITERATE(old_h, skey, skeylen, sval, svallen, ({ + if (keycompare(skey, skeylen, childsplitkey, childsplitkeylen)<=0) { + r=push_kvpair_down_only_if_it_wont_push_more_else_put_here(t, node, childa, skey, skeylen, sval, svallen, childnum); + } else { + r=push_kvpair_down_only_if_it_wont_push_more_else_put_here(t, node, childb, skey, skeylen, sval, svallen, childnum+1); + } + if (r!=0) return r; + })); + hashtable_free(&old_h); + + r=cachetable_unpin(t->cf, childa->thisnodename, 1); + assert(r==0); + r=cachetable_unpin(t->cf, childb->thisnodename, 1); + assert(r==0); + + + verify_counts(node); + verify_counts(childa); + verify_counts(childb); + + if (node->u.n.n_children>TREE_FANOUT) { + //printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs); + brt_nonleaf_split(t, node, nodea, nodeb, splitkey, splitkeylen); + //printf("%s:%d did split\n", __FILE__, __LINE__); + split_count++; + *did_split=1; + assert((*nodea)->height>0); + assert((*nodeb)->height>0); + assert((*nodea)->u.n.n_children>0); + assert((*nodeb)->u.n.n_children>0); + assert((*nodea)->u.n.children[(*nodea)->u.n.n_children-1]!=0); + assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0); + assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); + assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); + } else { + *did_split=0; + assert(serialize_brtnode_size(node)<=node->nodesize); + } + return 0; +} + +static int push_some_kvpairs_down (BRT t, BRTNODE node, int childnum, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec *splitkey, ITEMLEN *splitkeylen, + int debug) { + void *childnode_v; + BRTNODE child; + int r; + assert(node->height>0); + diskoff targetchild = node->u.n.children[childnum]; + assert(targetchild>=0 && targetchildh->unused_memory); // This assertion could fail in a concurrent setting since 
another process might have bumped unused memory. + r = cachetable_get_and_pin(t->cf, targetchild, &childnode_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)t->h->nodesize); + if (r!=0) return r; + child=childnode_v; + verify_counts(child); + //printf("%s:%d height=%d n_bytes_in_hashtable = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_hashtable[0], child->n_bytes_in_hashtable[1], child->n_bytes_in_hashtable[2]); + if (child->height>0 && child->u.n.n_children>0) assert(child->u.n.children[child->u.n.n_children-1]!=0); + if (debug) printf("%s:%d %*spush_some_kvpairs_down to %lld\n", __FILE__, __LINE__, debug, "", child->thisnodename); + /* I am exposing the internals of the hash table here, mostly because I am not thinking of a really + * good way to do it otherwise. I want to loop over the elements of the hash table, deleting some as I + * go. The HASHTABLE_ITERATE macro will break if I delete something from the hash table. */ + + { + bytevec key,val; + ITEMLEN keylen, vallen; + while(0==hashtable_random_pick(node->u.n.htables[childnum], &key, &keylen, &val, &vallen)) { + int child_did_split=0; BRTNODE childa, childb; bytevec childsplitkey; ITEMLEN childsplitkeylen; + if (debug) printf("%s:%d %*spush down %s\n", __FILE__, __LINE__, debug, "", (char*)key); + r = push_a_kvpair_down (t, node, child, childnum, + key, keylen, val, vallen, + &child_did_split, &childa, &childb, &childsplitkey, &childsplitkeylen); + if (r!=0) return r; + if (child_did_split) { + // If the child splits, we don't push down any further. + if (debug) printf("%s:%d %*shandle split splitkey=%s\n", __FILE__, __LINE__, debug, "", (char*)childsplitkey); + r=handle_split_of_child (t, node, childnum, + childa, childb, childsplitkey, childsplitkeylen, + did_split, nodea, nodeb, splitkey, splitkeylen); + return r; /* Don't do any more pushing if the child splits. 
*/ + } + } + } + if (debug) printf("%s:%d %*sdone push_some_kvpairs_down, unpinning %lld\n", __FILE__, __LINE__, debug, "", targetchild); + r=cachetable_unpin(t->cf, targetchild, 1); + if (r!=0) return r; + *did_split=0; + return 0; +} + +int debugp1 (int debug) { + return debug ? debug+1 : 0; +} + +static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec *splitkey, ITEMLEN *splitkeylen, int debug) +/* If the buffer is too full, then push down. Possibly the child will split. That may make us split. */ +{ + assert(node->height>0); + if (debug) printf("%s:%d %*sIn maybe_push_down in_buffer=%d childkeylens=%d size=%d\n", __FILE__, __LINE__, debug, "", node->u.n.n_bytes_in_hashtables, node->u.n.totalchildkeylens, serialize_brtnode_size(node)); + if (serialize_brtnode_size(node) > node->nodesize ) { + if (debug) printf("%s:%d %*stoo full, height=%d\n", __FILE__, __LINE__, debug, "", node->height); + { + /* Push to a child. */ + /* Find the heaviest child, and push stuff to it. Keep pushing to the child until we run out. + * But if the child pushes something to its child and our buffer has gotten small enough, then we stop pushing. 
*/ + int childnum; + if (debug) printf("%s:%d %*sfind_heaviest_data\n", __FILE__, __LINE__, debug, ""); + find_heaviest_child(node, &childnum); + if (debug) printf("%s:%d %*spush some down from %lld into %lld\n", __FILE__, __LINE__, debug, "", node->thisnodename, node->u.n.children[childnum]); + assert(node->u.n.children[childnum]!=0); + int r = push_some_kvpairs_down(t, node, childnum, did_split, nodea, nodeb, splitkey, splitkeylen, debugp1(debug)); + if (r!=0) return r; + assert(*did_split==0 || *did_split==1); + if (debug) printf("%s:%d %*sdid push_some_kvpairs_down did_split=%d\n", __FILE__, __LINE__, debug, "", *did_split); + if (*did_split) { + assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); + assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); + assert((*nodea)->u.n.n_children>0); + assert((*nodeb)->u.n.n_children>0); + assert((*nodea)->u.n.children[(*nodea)->u.n.n_children-1]!=0); + assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0); + } else { + assert(serialize_brtnode_size(node)<=node->nodesize); + } + } + } else { + *did_split=0; + assert(serialize_brtnode_size(node)<=node->nodesize); + } + return 0; +} + +/* Insert key/val into leaf NODE, replacing any existing pair with the same key, and split the leaf if it no longer fits in nodesize. Sets *did_split; on a split the halves are returned through nodea/nodeb with the separating key in splitkey/splitkeylen. */ static int brt_leaf_insert (BRT t, BRTNODE node, bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen, + int debug) { + bytevec olddata; + ITEMLEN olddatalen; + enum pma_errors pma_status = pma_lookup(node->u.l.buffer, key, keylen, &olddata, &olddatalen); + if (pma_status==BRT_OK) { + pma_status = pma_delete(node->u.l.buffer, key, keylen); + assert(pma_status==BRT_OK); + node->u.l.n_bytes_in_buffer -= keylen + olddatalen + KEY_VALUE_OVERHEAD; + } + pma_status = pma_insert(node->u.l.buffer, key, keylen, val, vallen); assert(pma_status==BRT_OK); /* the byte count below is only correct if the pair really went in */ + node->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD; + // If it doesn't fit, then split the leaf. 
+ if (serialize_brtnode_size(node) > node->nodesize) { + int r = brtleaf_split (t, node, nodea, nodeb, splitkey, splitkeylen); + if (r!=0) return r; + //printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey); + split_count++; + *did_split = 1; + verify_counts(*nodea); verify_counts(*nodeb); + if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize); + assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); + assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); + } else { + *did_split = 0; + } + return 0; +} + +static int brt_nonleaf_insert (BRT t, BRTNODE node, bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen, + int debug) { + + bytevec olddata; + ITEMLEN olddatalen; + unsigned int childnum = brtnode_which_child(node, key, keylen); + int found = !hash_find(node->u.n.htables[childnum], key, keylen, &olddata, &olddatalen); + + if (0) { // It is faster to do this, except on yobiduck where things grind to a halt. + void *child_v; + if (node->height>0 && + 0 == cachetable_maybe_get_and_pin(t->cf, node->u.n.children[childnum], &child_v)) { + /* If the child is in memory, then go ahead and put it in the child. 
*/ + BRTNODE child = child_v; + if (found) { + int diff = keylen + olddatalen + KEY_VALUE_OVERHEAD; + int r = hash_delete(node->u.n.htables[childnum], key, keylen); + assert(r==0); + node->u.n.n_bytes_in_hashtables -= diff; + node->u.n.n_bytes_in_hashtable[childnum] -= diff; + } + { + int child_did_split; + BRTNODE childa, childb; + bytevec childsplitkey; + ITEMLEN childsplitkeylen; + int r = brtnode_insert(t, child, key, keylen, val, vallen, + &child_did_split, &childa, &childb, &childsplitkey, &childsplitkeylen, 0); + if (r!=0) return r; + if (child_did_split) { + r=handle_split_of_child(t, node, childnum, + childa, childb, childsplitkey, childsplitkeylen, + did_split, nodea, nodeb, splitkey, splitkeylen); + if (r!=0) return r; + } else { + cachetable_unpin(t->cf, child->thisnodename, 1); + *did_split = 0; + } + } + return 0; + } + } + + if (debug) printf("%s:%d %*sDoing hash_insert\n", __FILE__, __LINE__, debug, ""); + verify_counts(node); + if (found) { + int r = hash_delete(node->u.n.htables[childnum], key, keylen); + int diff = keylen + olddatalen + KEY_VALUE_OVERHEAD; + assert(r==0); + node->u.n.n_bytes_in_hashtables -= diff; + node->u.n.n_bytes_in_hashtable[childnum] -= diff; + //printf("%s:%d deleted %d bytes\n", __FILE__, __LINE__, diff); + } + { + int diff = keylen + vallen + KEY_VALUE_OVERHEAD; + int r=hash_insert(node->u.n.htables[childnum], key, keylen, val, vallen); + assert(r==0); + node->u.n.n_bytes_in_hashtables += diff; + node->u.n.n_bytes_in_hashtable[childnum] += diff; + + } + if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, ""); + int r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitkey, splitkeylen, debugp1(debug)); + if (r!=0) return r; + if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, ""); + if (*did_split) { + assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); + assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); + 
assert((*nodea)->u.n.n_children>0); + assert((*nodeb)->u.n.n_children>0); + assert((*nodea)->u.n.children[(*nodea)->u.n.n_children-1]!=0); + assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0); + verify_counts(*nodea); + verify_counts(*nodeb); + } else { + assert(serialize_brtnode_size(node)<=node->nodesize); + verify_counts(node); + } + return 0; +} + + +static int brtnode_insert (BRT t, BRTNODE node, bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, bytevec*splitkey, ITEMLEN *splitkeylen, + int debug) { + if (node->height==0) { + return brt_leaf_insert(t, node, key, keylen, val, vallen, + did_split, nodea, nodeb, splitkey, splitkeylen, + debug); + } else { + return brt_nonleaf_insert(t, node, key, keylen, val, vallen, + did_split, nodea, nodeb, splitkey, splitkeylen, + debug); + } +} + +enum {n_nodes_in_cache =64}; + +int brt_create_cachetable (CACHETABLE *ct, int cachelines) { + if (cachelines==0) cachelines=n_nodes_in_cache; + assert(cachelines>0); + return create_cachetable(ct, cachelines); +} + +static int setup_brt_root_node (BRT t, diskoff offset) { + int r; + BRTNODE MALLOC(node); + assert(node); + //printf("%s:%d\n", __FILE__, __LINE__); + initialize_brtnode(t, node, + offset, /* the location is one nodesize offset from 0. 
*/ + 0); + if (0) { + printf("%s:%d for tree %p node %p mdict_create--> %p\n", __FILE__, __LINE__, t, node, node->u.l.buffer); + printf("%s:%d put root at %lld\n", __FILE__, __LINE__, offset); + } + r=cachetable_put(t->cf, offset, node, + brtnode_flush_callback, brtnode_fetch_callback, (void*)t->h->nodesize); + if (r!=0) { + my_free(node); + return r; + } + //printf("%s:%d created %lld\n", __FILE__, __LINE__, node->thisnodename); + verify_counts(node); + r=cachetable_unpin(t->cf, node->thisnodename, 1); + if (r!=0) { + my_free(node); + return r; + } + return 0; +} + +#define BRT_TRACE +#ifdef BRT_TRACE +#define WHEN_BRTTRACE(x) x +#else +#define WHEN_BRTTRACE(x) ((void)0) +#endif + +int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, int nodesize, CACHETABLE cachetable) { + /* If dbname is NULL then we setup to hold a single tree. Otherwise we setup an array. */ + int r; + BRT t; + char *malloced_name=0; + //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); + WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE: open_brt(%s, \"%s\", %d, %p, %d, %p)\n", + fname, dbname, is_create, newbrt, nodesize, cachetable)); + if ((MALLOC(t))==0) { + assert(errno==ENOMEM); + r = ENOMEM; + if (0) { died0: my_free(t); } + return r; + } + if (dbname) { + malloced_name = mystrdup(dbname); + if (malloced_name==0) { + r = ENOMEM; + if (0) { died0a: if(malloced_name) my_free(malloced_name); } + goto died0; + } + } + t->database_name = malloced_name; + r=cachetable_openf(&t->cf, cachetable, fname, O_RDWR | (is_create ? O_CREAT : 0), 0777); + if (r!=0) { + if (0) { died1: cachefile_close(t->cf); } + goto died0a; + } + assert(nodesize>0); + //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); + if (is_create) { + r = read_and_pin_brt_header(t->cf, &t->h); + if (r==-1) { + /* construct a new header. 
*/ + if ((MALLOC(t->h))==0) { + assert(errno==ENOMEM); + r = ENOMEM; + if (0) { died2: my_free(t->h); } + goto died1; + } + t->h->nodesize=nodesize; + t->h->freelist=-1; + t->h->unused_memory=2*nodesize; + if (dbname) { + t->h->unnamed_root = -1; + t->h->n_named_roots = 1; + if ((MALLOC_N(1, t->h->names))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died3: my_free(t->h->names); } goto died2; } + if ((MALLOC_N(1, t->h->roots))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died4: my_free(t->h->roots); } goto died3; } + if ((t->h->names[0] = mystrdup(dbname))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died5: my_free(t->h->names[0]); } goto died4; } + t->h->roots[0] = nodesize; + } else { + t->h->unnamed_root = nodesize; + t->h->n_named_roots = -1; + t->h->names=0; + t->h->roots=0; + } + if ((r=setup_brt_root_node(t, nodesize))!=0) { if (dbname) goto died5; else goto died2; } + if ((r=cachetable_put(t->cf, 0, t->h, brtheader_flush_callback, brtheader_fetch_callback, 0))) { if (dbname) goto died5; else goto died2; } + } else { + int i; + assert(r==0); + assert(t->h->unnamed_root==-1); + assert(t->h->n_named_roots>=0); + for (i=0; ih->n_named_roots; i++) { + if (strcmp(t->h->names[i], dbname)==0) { + r = EEXIST; + goto died1; /* deallocate everything. 
*/ + } + } + if ((t->h->names = my_realloc(t->h->names, (1+t->h->n_named_roots)*sizeof(*t->h->names))) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; } + if ((t->h->roots = my_realloc(t->h->roots, (1+t->h->n_named_roots)*sizeof(*t->h->roots))) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; } + t->h->n_named_roots++; + if ((t->h->names[t->h->n_named_roots-1] = mystrdup(dbname)) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; } + printf("%s:%d t=%p\n", __FILE__, __LINE__, t); + t->h->roots[t->h->n_named_roots-1] = malloc_diskblock_header_is_in_memory(t, t->h->nodesize); + if ((r=setup_brt_root_node(t, t->h->roots[t->h->n_named_roots-1]))!=0) goto died1; + } + } else { + if ((r = read_and_pin_brt_header(t->cf, &t->h))!=0) goto died1; + if (!dbname) { + if (t->h->n_named_roots!=-1) { r = -2; /* invalid args??? */; goto died1; } + } else { + int i; + for (i=0; ih->n_named_roots; i++) { + if (strcmp(t->h->names[i], dbname)==0) { + goto found_it; + } + + } + r=ENOENT; /* the database doesn't exist */ + goto died1; + } + found_it: ; + } + assert(t->h); + if ((r = unpin_brt_header(t)) !=0) goto died1; + assert(t->h==0); + WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE -> %p\n", t)); + t->cursors_head = t->cursors_tail = 0; + *newbrt = t; + return 0; +} + +int close_brt (BRT brt) { + int r; + while (brt->cursors_head) { + BRT_CURSOR c = brt->cursors_head; + r=brt_cursor_close(c); + if (r!=0) return r; + } + assert(0==cachefile_assert_all_unpinned(brt->cf)); + //printf("%s:%d closing cachetable\n", __FILE__, __LINE__); + if ((r = cachefile_close(brt->cf))!=0) return r; + if (brt->database_name) my_free(brt->database_name); + my_free(brt); + return 0; +} + +int brt_debug_mode = 0;//strcmp(key,"hello387")==0; + +CACHEKEY* calculate_root_offset_pointer (BRT brt) { + if (brt->database_name==0) { + return &brt->h->unnamed_root; + } else { + int i; + for (i=0; ih->n_named_roots; i++) { + if (strcmp(brt->database_name, brt->h->names[i])==0) { + return &brt->h->roots[i]; + 
} + } + } + abort(); +} + +int brt_insert (BRT brt, bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen) { + void *node_v; + BRTNODE node; + CACHEKEY *rootp; + int r; + int did_split; BRTNODE nodea=0, nodeb=0; bytevec splitkey; ITEMLEN splitkeylen; + int debug = brt_debug_mode;//strcmp(key,"hello387")==0; + //assert(0==cachetable_assert_all_unpinned(brt->cachetable)); + if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { + if (0) { died0: unpin_brt_header(brt); } + return r; + } + rootp = calculate_root_offset_pointer(brt); + if (debug) printf("%s:%d Getting %lld\n", __FILE__, __LINE__, *rootp); + if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize))) { + goto died0; + } + node=node_v; + if (debug) printf("%s:%d node inserting\n", __FILE__, __LINE__); + r = brtnode_insert(brt, node, key, keylen, val, vallen, + &did_split, &nodea, &nodeb, &splitkey, &splitkeylen, + debug); + if (r!=0) return r; + if (debug) printf("%s:%d did_insert\n", __FILE__, __LINE__); + if (did_split) { + //printf("%s:%d did_split=%d nodeb=%p nodeb->thisnodename=%lld nodeb->nodesize=%d\n", __FILE__, __LINE__, did_split, nodeb, nodeb->thisnodename, nodeb->nodesize); + //printf("Did split, splitkey=%s\n", splitkey); + if (nodeb->height>0) assert(nodeb->u.n.children[nodeb->u.n.n_children-1]!=0); + assert(nodeb->nodesize>0); + } + if (did_split) { + /* We must cope. 
*/ + BRTNODE MALLOC(newroot); + diskoff newroot_diskoff=malloc_diskblock(brt, brt->h->nodesize); + assert(newroot); + *rootp=newroot_diskoff; + brt->h->dirty=1; + initialize_brtnode (brt, newroot, newroot_diskoff, nodea->height+1); + newroot->u.n.n_children=2; + //printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey); + newroot->u.n.childkeys[0] = splitkey; + newroot->u.n.childkeylens[0] = splitkeylen; + newroot->u.n.totalchildkeylens=splitkeylen; + newroot->u.n.children[0]=nodea->thisnodename; + newroot->u.n.children[1]=nodeb->thisnodename; + r=hashtable_create(&newroot->u.n.htables[0]); if (r!=0) return r; + r=hashtable_create(&newroot->u.n.htables[1]); if (r!=0) return r; + verify_counts(newroot); + r=cachetable_unpin(brt->cf, nodea->thisnodename, 1); if (r!=0) return r; + r=cachetable_unpin(brt->cf, nodeb->thisnodename, 1); if (r!=0) return r; + //printf("%s:%d put %lld\n", __FILE__, __LINE__, brt->root); + cachetable_put(brt->cf, newroot_diskoff, newroot, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize); + } else { + if (node->height>0) + assert(node->u.n.n_children<=TREE_FANOUT); + } + cachetable_unpin(brt->cf, *rootp, 1); + if ((r = unpin_brt_header(brt))!=0) return r; + //assert(0==cachetable_assert_all_unpinned(brt->cachetable)); + return 0; +} + +// This is pretty ugly. 
+static unsigned char lookup_result[1000000]; + +int brt_lookup_node (BRT brt, diskoff off, bytevec key, ITEMLEN keylen, bytevec *val, ITEMLEN *vallen) { + void *node_v; + int r = cachetable_get_and_pin(brt->cf, off, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize); + bytevec answer; + ITEMLEN answerlen; + BRTNODE node; + int childnum; + if (r!=0) { + int r2; + died0: + printf("%s:%d r=%d\n", __FILE__, __LINE__, r); + r2 = cachetable_unpin(brt->cf, off, 0); + return r; + } + node=node_v; + if (node->height==0) { + r = pma_lookup(node->u.l.buffer, key, keylen, &answer, &answerlen); + //printf("%s:%d looked up something, got answerlen=%d\n", __FILE__, __LINE__, answerlen); + if (r!=0) goto died0; + if (r==0) { + *val = answer; + *vallen = answerlen; + } + r = cachetable_unpin(brt->cf, off, 0); + return r; + } + + childnum = brtnode_which_child(node, key, keylen); + // Leaves have a single mdict, where the data is found. + if (hash_find (node->u.n.htables[childnum], key, keylen, &answer, vallen)==0) { + //printf("Found %d bytes\n", *vallen); + assert(*vallen<=(int)(sizeof(lookup_result))); + memcpy(lookup_result, answer, *vallen); + //printf("Returning %s\n", lookup_result); + *val = lookup_result; + r = cachetable_unpin(brt->cf, off, 0); + assert(r==0); + return 0; + } + if (node->height==0) { + r = cachetable_unpin(brt->cf, off, 0); + if (r==0) return DB_NOTFOUND; + else return r; + } + { + int result = brt_lookup_node(brt, node->u.n.children[childnum], key, keylen, val, vallen); + r = cachetable_unpin(brt->cf, off, 0); + if (r!=0) return r; + return result; + } +} + + +int brt_lookup (BRT brt, bytevec key, unsigned int keylen, bytevec*val, unsigned int *vallen) { + int r; + CACHEKEY *rootp; + assert(0==cachefile_assert_all_unpinned(brt->cf)); + if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { + printf("%s:%d\n", __FILE__, __LINE__); + if (0) { died0: unpin_brt_header(brt); } + printf("%s:%d returning %d\n", __FILE__, 
__LINE__, r); + assert(0==cachefile_assert_all_unpinned(brt->cf)); + return r; + } + rootp = calculate_root_offset_pointer(brt); + if ((r = brt_lookup_node(brt, *rootp, key, keylen, val, vallen))) { + printf("%s:%d\n", __FILE__, __LINE__); + goto died0; + } + //printf("%s:%d r=%d", __FILE__, __LINE__, r); if (r==0) printf(" vallen=%d", *vallen); printf("\n"); + if ((r = unpin_brt_header(brt))!=0) return r; + assert(0==cachefile_assert_all_unpinned(brt->cf)); + return 0; +} + +int verify_brtnode (BRT brt, diskoff off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse); + +int dump_brtnode (BRT brt, diskoff off, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen) { + int result=0; + BRTNODE node; + void *node_v; + int r = cachetable_get_and_pin(brt->cf, off, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize); + assert(r==0); + node=node_v; + result=verify_brtnode(brt, off, lorange, lolen, hirange, hilen, 0); + printf("%*sNode=%p\n", depth, "", node); + if (node->height>0) { + printf("%*sNode %lld nodesize=%d height=%d n_children=%d n_bytes_in_hashtables=%d keyrange=%s %s\n", + depth, "", off, node->nodesize, node->height, node->u.n.n_children, node->u.n.n_bytes_in_hashtables, (char*)lorange, (char*)hirange); + //printf("%s %s\n", lorange ? lorange : "NULL", hirange ? 
hirange : "NULL"); + { + int i; + for (i=0; i< node->u.n.n_children-1; i++) { + printf("%*schild %d buffered (%d entries):\n", depth+1, "", i, hashtable_n_entries(node->u.n.htables[i])); + HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, + ({ + printf("%*s %s %s\n", depth+2, "", (char*)key, (char*)data); + assert(strlen((char*)key)+1==keylen); + assert(strlen((char*)data)+1==datalen); + })); + } + for (i=0; iu.n.n_children; i++) { + printf("%*schild %d\n", depth, "", i); + if (i>0) { + printf("%*spivot %d=%s\n", depth+1, "", i-1, (char*)node->u.n.childkeys[i-1]); + } + dump_brtnode(brt, node->u.n.children[i], depth+4, + (i==0) ? lorange : node->u.n.childkeys[i-1], + (i==0) ? lolen : node->u.n.childkeylens[i-1], + (i==node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i], + (i==node->u.n.n_children-1) ? hilen : node->u.n.childkeylens[i] + ); + } + } + } else { + printf("%*sNode %lld nodesize=%d height=%d n_bytes_in_buffer=%d keyrange=%s %s\n", + depth, "", off, node->nodesize, node->height, node->u.l.n_bytes_in_buffer, (char*)lorange, (char*)hirange); + PMA_ITERATE(node->u.l.buffer, key, keylen, val, vallen, + ( keylen=keylen, vallen=vallen, printf(" %s:%s", (char*)key, (char*)val))); + printf("\n"); + } + r = cachetable_unpin(brt->cf, off, 0); + assert(r==0); + return result; +} + +int dump_brt (BRT brt) { + int r; + CACHEKEY *rootp; + if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { + if (0) { died0: unpin_brt_header(brt); } + return r; + } + rootp = calculate_root_offset_pointer(brt); + printf("split_count=%d\n", split_count); + if ((r = dump_brtnode(brt, *rootp, 0, 0, 0, 0, 0))) goto died0; + if ((r = unpin_brt_header(brt))!=0) return r; + return 0; +} + +int show_brtnode_blocknumbers (BRT brt, diskoff off) { + BRTNODE node; + void *node_v; + int i,r; + assert(off%brt->h->nodesize==0); + if ((r = cachetable_get_and_pin(brt->cf, off, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize))) { + if (0) { 
died0: cachetable_unpin(brt->cf, off, 0); } + return r; + } + node=node_v; + printf(" %lld", off/brt->h->nodesize); + if (node->height>0) { + for (i=0; iu.n.n_children; i++) { + if ((r=show_brtnode_blocknumbers(brt, node->u.n.children[i]))) goto died0; + } + } + r = cachetable_unpin(brt->cf, off, 0); + return r; +} + +int show_brt_blocknumbers (BRT brt) { + int r; + CACHEKEY *rootp; + if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { + if (0) { died0: unpin_brt_header(brt); } + return r; + } + rootp = calculate_root_offset_pointer(brt); + printf("BRT %p has blocks:", brt); + if ((r=show_brtnode_blocknumbers (brt, *rootp))) goto died0; + printf("\n"); + if ((r = unpin_brt_header(brt))!=0) return r; + return 0; +} + +int verify_brtnode (BRT brt, diskoff off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) { + int result=0; + BRTNODE node; + void *node_v; + int r; + if ((r = cachetable_get_and_pin(brt->cf, off, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize))) + return r; + node=node_v; + if (node->height>0) { + int i; + for (i=0; i< node->u.n.n_children-1; i++) { + bytevec thislorange,thishirange; + ITEMLEN thislolen, thishilen; + if (node->u.n.n_children==0 || i==0) { + thislorange=lorange; + thislolen =lolen; + } else { + thislorange=node->u.n.childkeys[i-1]; + thislolen =node->u.n.childkeylens[i-1]; + } + if (node->u.n.n_children==0 || i+1>=node->u.n.n_children) { + thishirange=hirange; + thishilen =hilen; + } else { + thishirange=node->u.n.childkeys[i]; + thishilen =node->u.n.childkeylens[i]; + } + { + void verify_pair (bytevec key, unsigned int keylen, + bytevec data __attribute__((__unused__)), unsigned int datalen __attribute__((__unused__)), + void *ignore __attribute__((__unused__))) { + if (thislorange) assert(keycompare(thislorange,thislolen,key,keylen)<0); + if (thishirange && keycompare(key,keylen,thishirange,thishilen)>0) { + printf("%s:%d in buffer %d key %s is bigger than %s\n", 
__FILE__, __LINE__, i, (char*)key, (char*)thishirange); + result=1; + } + } + hashtable_iterate(node->u.n.htables[i], verify_pair, 0); + } + } + for (i=0; iu.n.n_children; i++) { + if (i>0) { + if (lorange) assert(keycompare(lorange,lolen, node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1])<0); + if (hirange) assert(keycompare(node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1], hirange, hilen)<=0); + } + if (recurse) { + result|=verify_brtnode(brt, node->u.n.children[i], + (i==0) ? lorange : node->u.n.childkeys[i-1], + (i==0) ? lolen : node->u.n.childkeylens[i-1], + (i==node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i], + (i==node->u.n.n_children-1) ? hilen : node->u.n.childkeylens[i], + recurse); + } + } + } + if ((r = cachetable_unpin(brt->cf, off, 0))) return r; + return result; +} + +int verify_brt (BRT brt) { + int r; + CACHEKEY *rootp; + if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { + if (0) { died0: unpin_brt_header(brt); } + return r; + } + rootp = calculate_root_offset_pointer(brt); + if ((r=verify_brtnode(brt, *rootp, 0, 0, 0, 0, 1))) goto died0; + if ((r = unpin_brt_header(brt))!=0) return r; + return 0; +} + +#if 0 +void brt_fsync (BRT brt) { + int r = cachetable_fsync(brt->cachetable); + assert(r==0); + r = fsync(brt->fd); + assert(r==0); +} + +void brt_flush (BRT brt) { + int r = cachetable_flush(brt->cachetable, brt); + assert(r==0); +} +#endif + +int brtnode_flush_child (BRT brt, BRTNODE node, int cnum) { + brt=brt; node=node; cnum=cnum; + abort(); /* Algorithm: For each key in the cnum'th mdict, insert it to the childnode. It may cause a split. */ +} + +#define CURSOR_PATHLEN_LIMIT 256 +struct brt_cursor { + BRT brt; + int path_len; /* -1 if the cursor points nowhere. */ + BRTNODE path[CURSOR_PATHLEN_LIMIT]; /* Include the leaf (last). These are all pinned. */ + int pathcnum[CURSOR_PATHLEN_LIMIT]; /* which child did we descend to from here? */ + PMA_CURSOR pmacurs; /* The cursor into the leaf. 
NULL if the cursor doesn't exist. */ + BRT_CURSOR prev,next; +}; +static int unpin_cursor (BRT_CURSOR cursor); + +int brt_cursor (BRT brt, BRT_CURSOR*cursor) { + BRT_CURSOR MALLOC(result); + assert(result); + result->brt = brt; + result->path_len = 0; + result->pmacurs = 0; + + if (brt->cursors_head) { + brt->cursors_head->prev = result; + } else { + brt->cursors_tail = result; + } + result->next = brt->cursors_head; + result->prev = 0; + brt->cursors_head = result; + *cursor = result; + return 0; +} + +int brt_cursor_close (BRT_CURSOR curs) { + BRT brt = curs->brt; + int r=unpin_cursor(curs); + if (curs->prev==0) { + assert(brt->cursors_head==curs); + brt->cursors_head = curs->next; + } else { + curs->prev->next = curs->next; + } + if (curs->next==0) { + assert(brt->cursors_tail==curs); + brt->cursors_tail = curs->prev; + } else { + curs->next->prev = curs->prev; + } + if (curs->pmacurs) { + int r2=pma_cursor_free(&curs->pmacurs); + if (r==0) r=r2; + } + my_free(curs); + return r; +} + +int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off) { + BRT brt=cursor->brt; + void *node_v; + int r = cachetable_get_and_pin(brt->cf, off, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize); + if (r!=0) { + if (0) { died0: cachetable_unpin(brt->cf, off, 0); } + return r; + } + BRTNODE node = node_v; + assert(cursor->path_lenpath[cursor->path_len++] = node; + if (node->height>0) { + int childnum = node->u.n.n_children-1; + try_prev_child: + cursor->pathcnum[cursor->path_len-1] = childnum; + r=brtcurs_set_position_last (cursor, node->u.n.children[childnum]); + if (r==DB_NOTFOUND) { + if (childnum>0) { + childnum--; + goto try_prev_child; + } + } + if (r!=0) { + /* we ran out of children without finding anything, or had some other trouble. 
*/ + cursor->path_len--; + goto died0; + } + return 0; + } else { + r=pma_cursor(node->u.l.buffer, &cursor->pmacurs); + if (r!=0) { + if (0) { died10: pma_cursor_free(&cursor->pmacurs); } + cursor->path_len--; + goto died0; + } + r=pma_cursor_set_position_last(cursor->pmacurs); + if (r!=0) goto died10; /* we'll deallocate this cursor, and unpin this node, and go back up. */ + return 0; + } +} + +int brtcurs_set_position_first (BRT_CURSOR cursor, diskoff off) { + BRT brt=cursor->brt; + void *node_v; + int r = cachetable_get_and_pin(brt->cf, off, &node_v, + brtnode_flush_callback, brtnode_fetch_callback, (void*)brt->h->nodesize); + if (r!=0) { + if (0) { died0: cachetable_unpin(brt->cf, off, 0); } + return r; + } + BRTNODE node = node_v; + assert(cursor->path_lenpath[cursor->path_len++] = node; + if (node->height>0) { + int childnum = 0; + try_next_child: + cursor->pathcnum[cursor->path_len-1] = childnum; + r=brtcurs_set_position_first (cursor, node->u.n.children[childnum]); + if (r==DB_NOTFOUND) { + if (childnum+1u.n.n_children) { + childnum++; + goto try_next_child; + } + } + if (r!=0) { + /* we ran out of children without finding anything, or had some other trouble. */ + cursor->path_len--; + goto died0; + } + return 0; + } else { + r=pma_cursor(node->u.l.buffer, &cursor->pmacurs); + if (r!=0) { + if (0) { died10: pma_cursor_free(&cursor->pmacurs); } + cursor->path_len--; + goto died0; + } + r=pma_cursor_set_position_first(cursor->pmacurs); + if (r!=0) goto died10; /* we'll deallocate this cursor, and unpin this node, and go back up. 
*/ + return 0; + } +} + +static int unpin_cursor (BRT_CURSOR cursor) { + BRT brt=cursor->brt; + int i; + int r=0; + for (i=0; ipath_len; i++) { + int r2 = cachetable_unpin(brt->cf, cursor->path[i]->thisnodename, 0); + if (r==0) r=r2; + } + cursor->path_len=0; + return r; +} + +int brt_c_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags) { + int do_rmw=0; + int r; + CACHEKEY *rootp; + + dump_brt(cursor->brt); + assert(0==cachefile_assert_all_unpinned(cursor->brt->cf)); + if ((r = read_and_pin_brt_header(cursor->brt->cf, &cursor->brt->h))) { + if (0) { died0: unpin_brt_header(cursor->brt); } + return r; + } + rootp = calculate_root_offset_pointer(cursor->brt); + if (flags&DB_RMW) { + do_rmw=1; + flags &= ~DB_RMW; + } + switch (flags) { + case DB_LAST: + r=unpin_cursor(cursor); if (r!=0) goto died0; + r=brtcurs_set_position_last(cursor, *rootp); if (r!=0) goto died0; + r=pma_cget_current(cursor->pmacurs, kbt, vbt); + break; + case DB_FIRST: + r=unpin_cursor(cursor); if (r!=0) goto died0; + r=brtcurs_set_position_first(cursor, *rootp); if (r!=0) goto died0; + r=pma_cget_current(cursor->pmacurs, kbt, vbt); + break; + default: + fprintf(stderr, "%s:%d c_get(...,%d) not ready\n", __FILE__, __LINE__, flags); + abort(); + } + if ((r = unpin_brt_header(cursor->brt))!=0) return r; + return 0; +} diff --git a/newbrt/brt.h b/newbrt/brt.h new file mode 100644 index 00000000000..78a88b71d9b --- /dev/null +++ b/newbrt/brt.h @@ -0,0 +1,35 @@ +#ifndef BRT_H +#define BRT_H + +// This must be first to make the 64-bit file mode work right in Linux +#define _FILE_OFFSET_BITS 64 +#include "brttypes.h" + +#include "ybt.h" +#include "../include/ydb-constants.h" +#include "cachetable.h" +typedef struct brt *BRT; +int open_brt (const char *fname, const char *dbname, int is_create, BRT *, int nodesize, CACHETABLE); +//int brt_create (BRT **, int nodesize, int n_nodes_in_cache); /* the nodesize and n_nodes in cache really should be separately configured. 
*/ +//int brt_open (BRT *, char *fname, char *dbname); +int brt_insert (BRT brt, bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen); +int brt_lookup (BRT brt, bytevec key, ITEMLEN keylen, bytevec*val, ITEMLEN *vallen); +int close_brt (BRT); +int dump_brt (BRT brt); +void brt_fsync (BRT); /* fsync, but don't clear the caches. */ + +void brt_flush (BRT); /* fsync and clear the caches. */ + +int brt_create_cachetable (CACHETABLE *t, int n_cachlines /* Pass 0 if you want the default. */); + +extern int brt_debug_mode; +int verify_brt (BRT brt); + +int show_brt_blocknumbers(BRT); + +typedef struct brt_cursor *BRT_CURSOR; +int brt_cursor (BRT, BRT_CURSOR*); +int brt_c_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int brtc_flags); +int brt_cursor_close (BRT_CURSOR curs); + +#endif diff --git a/newbrt/brttypes.h b/newbrt/brttypes.h new file mode 100644 index 00000000000..95b5c0a2dda --- /dev/null +++ b/newbrt/brttypes.h @@ -0,0 +1,8 @@ +#ifndef BRTTYPES_H +#define BRTTYPES_H +#define _XOPEN_SOURCE 500 +#define _FILE_OFFSET_BITS 64 +typedef unsigned int ITEMLEN; +typedef const void *bytevec; +//typedef const void *bytevec; +#endif diff --git a/newbrt/cachetable-test.c b/newbrt/cachetable-test.c new file mode 100644 index 00000000000..eef6dfcfc36 --- /dev/null +++ b/newbrt/cachetable-test.c @@ -0,0 +1,277 @@ +#include "memory.h" +#include "cachetable.h" + +#include +#include +#include +#include +#include + +struct item { + CACHEKEY key; + char *something; +}; + +int expect_n_flushes=0; +CACHEKEY flushes[100]; + +static void expect1(CACHEKEY key) { + expect_n_flushes=1; + flushes[0]=key; +} +static void expectN(CACHEKEY key) { + flushes[expect_n_flushes++]=key; +} + +CACHEFILE expect_f; + +static void flush (CACHEFILE f, CACHEKEY key, void*value, int write_me __attribute__((__unused__)), int keep_mee __attribute__((__unused__))) { + struct item *it = value; + int i; + + printf("Flushing %lld (it=>key=%lld)\n", key, it->key); + + assert(expect_f==f); + 
assert(strcmp(it->something,"something")==0); + assert(it->key==key); + + /* Verify that we expected the flush. */ + for (i=0; ikey=key; + it->something="something"; + return it; +} + +CACHEKEY did_fetch=-1; +int fetch (CACHEFILE f, CACHEKEY key, void**value, void*extraargs) { + printf("Fetch %lld\n", key); + assert (expect_f==f); + assert((long)extraargs==23); + *value = make_item(key); + did_fetch=key; + return 0; +} + + +void test0 (void) { + void* t3=(void*)23; + CACHETABLE t; + CACHEFILE f; + int r; + char fname[] = "test.dat"; + r=create_cachetable(&t, 5); + assert(r==0); + unlink(fname); + r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777); + assert(r==0); + expect_f = f; + + expect_n_flushes=0; + r=cachetable_put(f, 1, make_item(1), flush, fetch, t3); /* 1P */ /* this is the lru list. 1 is pinned. */ + assert(r==0); + assert(expect_n_flushes==0); + + expect_n_flushes=0; + r=cachetable_put(f, 2, make_item(2), flush, fetch, t3); + assert(r==0); + r=cachetable_unpin(f, 2, 1); /* 2U 1P */ + assert(expect_n_flushes==0); + + expect_n_flushes=0; + r=cachetable_put(f, 3, make_item(3), flush, fetch, t3); + assert(r==0); + assert(expect_n_flushes==0); /* 3P 2U 1P */ /* 3 is most recently used (pinned), 2 is next (unpinned), 1 is least recent (pinned) */ + + expect_n_flushes=0; + r=cachetable_put(f, 4, make_item(4), flush, fetch, t3); + assert(r==0); + assert(expect_n_flushes==0); /* 4P 3P 2U 1P */ + + expect_n_flushes=0; + r=cachetable_put(f, 5, make_item(5), flush, fetch, t3); + assert(r==0); + r=cachetable_unpin(f, 5, 1); + assert(r==0); + r=cachetable_unpin(f, 3, 1); + assert(r==0); + assert(expect_n_flushes==0); /* 5U 4P 3U 2U 1P */ + + expect1(2); /* 2 is the oldest unpinned item. 
*/ + r=cachetable_put(f, 6, make_item(6), flush, fetch, t3); /* 6P 5U 4P 3U 1P */ + assert(r==0); + assert(expect_n_flushes==0); + + + expect1(3); + r=cachetable_put(f, 7, make_item(7), flush, fetch, t3); + assert(r==0); + assert(expect_n_flushes==0); + r=cachetable_unpin(f, 7, 1); /* 7U 6P 5U 4P 1P */ + assert(r==0); + + { + void *item_v=0; + expect_n_flushes=0; + r=cachetable_get_and_pin(f, 5, &item_v, flush, fetch, t3); /* 5P 7U 6P 4P 1P */ + assert(r==0); + assert(((struct item *)item_v)->key==5); + assert(strcmp(((struct item *)item_v)->something,"something")==0); + assert(expect_n_flushes==0); + } + + { + void *item_v=0; + r=cachetable_unpin(f, 4, 1); + assert(r==0); + expect1(4); + did_fetch=-1; + r=cachetable_get_and_pin(f, 2, &item_v, flush, fetch, t3); /* 2p 5P 7U 6P 1P */ + assert(r==0); + assert(did_fetch==2); /* Expect that 2 is fetched in. */ + assert(((struct item *)item_v)->key==2); + assert(strcmp(((struct item *)item_v)->something,"something")==0); + assert(expect_n_flushes==0); + } + + r=cachetable_unpin(f, 2, 1); + assert(r==0); + r=cachetable_unpin(f ,5, 1); + assert(r==0); + r=cachetable_unpin(f, 6, 1); + assert(r==0); + r=cachetable_unpin(f, 1, 1); + assert(r==0); + r=cachetable_assert_all_unpinned(t); + assert(r==0); + + printf("Closing\n"); + expect1(2); + expectN(5); + expectN(7); + expectN(6); + expectN(1); + r=cachefile_close(f); + assert(r==0); + r=cachetable_close(t); + assert(r==0); + assert(expect_n_flushes==0); + expect_f = 0; + memory_check_all_free(); +} + +static void flush_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), void *value, int write_me __attribute__((__unused__)), int keep_me __attribute__((__unused__))) { + int *v = value; + assert(*v==0); +} +static int fetch_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), void**value, void*extraargs) { + assert((long)extraargs==42); + *value=0; + return 0; +} + + +void test_nested_pin (void) { + void 
*f2=(void*)42; + CACHETABLE t; + CACHEFILE f; + int i0, i1; + int r; + void *vv; + char fname[] = "test.dat"; + r = create_cachetable(&t, 1); + assert(r==0); + unlink(fname); + r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777); + assert(r==0); + expect_f = f; + + i0=0; i1=0; + r = cachetable_put(f, 1, &i0, flush_n, fetch_n, f2); + assert(r==0); + r = cachetable_get_and_pin(f, 1, &vv, flush_n, fetch_n, f2); + assert(r==0); + assert(vv==&i0); + assert(i0==0); + r = cachetable_unpin(f, 1, 0); + assert(r==0); + r = cachetable_put(f, 2, &i1, flush_n, fetch_n, f2); + assert(r!=0); // previously pinned, we shouldn't be able to put. + r = cachetable_unpin(f, 1, 0); + assert(r==0); + r = cachetable_put(f, 2, &i1, flush_n, fetch_n, f2); + assert(r==0); // now it is unpinned, we can put it. + +} + + +void null_flush (CACHEFILE cf __attribute__((__unused__)), + CACHEKEY k __attribute__((__unused__)), + void *v __attribute__((__unused__)), + int write_me __attribute__((__unused__)), + int keep_me __attribute__((__unused__))) { +} +int add123_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, void*extraargs) { + assert((long)extraargs==123); + *value = (void*)((unsigned long)key+123L); + return 0; +} +int add222_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, void*extraargs) { + assert((long)extraargs==222); + *value = (void*)((unsigned long)key+222L); + return 0; +} + + +void test_multi_filehandles (void) { + CACHETABLE t; + CACHEFILE f1,f2,f3; + char fname1[]="test.dat"; + char fname2[]="test2.dat"; + char fname3[]="test3.dat"; + int r; + void *v; + unlink(fname1); + unlink(fname2); + + r = create_cachetable(&t, 4); assert(r==0); + r = cachetable_openf(&f1, t, fname1, O_RDWR|O_CREAT, 0777); assert(r==0); + r = link(fname1, fname2); assert(r==0); + r = cachetable_openf(&f2, t, fname2, O_RDWR|O_CREAT, 0777); assert(r==0); + r = cachetable_openf(&f3, t, fname3, O_RDWR|O_CREAT, 0777); assert(r==0); + + assert(f1==f2); + 
assert(f1!=f3); + + r = cachetable_put(f1, 1, (void*)124, null_flush, add123_fetch, (void*)123); assert(r==0); + r = cachetable_get_and_pin(f2, 1, &v, null_flush, add123_fetch, (void*)123); assert(r==0); + assert((unsigned long)v==124); + r = cachetable_get_and_pin(f2, 2, &v, null_flush, add123_fetch, (void*)123); assert(r==0); + assert((unsigned long)v==125); + r = cachetable_get_and_pin(f3, 2, &v, null_flush, add222_fetch, (void*)222); assert(r==0); + assert((unsigned long)v==224); + r = cachetable_maybe_get_and_pin(f1, 2, &v); assert(r==0); + assert((unsigned long)v==125); + +} + +int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { + test0(); + test_nested_pin(); + test_multi_filehandles (); + printf("ok\n"); + return 0; +} diff --git a/newbrt/cachetable.c b/newbrt/cachetable.c new file mode 100644 index 00000000000..c7acdcaeb62 --- /dev/null +++ b/newbrt/cachetable.c @@ -0,0 +1,453 @@ +#include "cachetable.h" +#include "memory.h" +#include "yerror.h" +#include +#include +#include +#include +#include +#include + +//#define TRACE_CACHETABLE +#ifdef TRACE_CACHETABLE +#define WHEN_TRACE_CT(x) x +#else +#define WHEN_TRACE_CT(x) ((void)0) +#endif + +typedef struct ctpair *PAIR; +struct ctpair { + long long pinned; + char dirty; + CACHEKEY key; + void *value; + PAIR next,prev; // In LRU list. + PAIR hash_chain; + CACHEFILE cachefile; + void (*flush_callback)(CACHEFILE,CACHEKEY,void*, int write_me, int keep_me); + int (*fetch_callback)(CACHEFILE,CACHEKEY,void**,void*extrargs); + void*extraargs; +}; + +struct cachetable { + enum typ_tag tag; + int n_in_table; + int table_size; + PAIR *table; + PAIR head,tail; // of LRU list. head is the most recently used. tail is least recently used. + CACHEFILE cachefiles; +}; + +struct fileid { + dev_t st_dev; /* device and inode are enough to uniquely identify a file in unix. */ + ino_t st_ino; +}; + +struct cachefile { + CACHEFILE next; + int refcount; /* CACHEFILEs are shared. 
Use a refcount to decide when to really close it. */ + int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */ + CACHETABLE cachetable; + struct fileid fileid; +}; + +int create_cachetable (CACHETABLE *result, int n_entries) { + CACHETABLE MALLOC(t); + int i; + t->n_in_table = 0; + t->table_size = n_entries; + t->table = my_calloc(t->table_size, sizeof(struct ctpair)); + assert(t->table); + t->head = t->tail = 0; + for (i=0; itable_size; i++) { + t->table[i]=0; + } + t->cachefiles = 0; + *result = t; + return 0; +} + +int cachetable_openf (CACHEFILE *cf, CACHETABLE t, const char *fname, int flags, mode_t mode) { + int r; + CACHEFILE extant; + struct stat statbuf; + struct fileid fileid; + int fd = open(fname, flags, mode); + if (fd<0) return errno; + memset(&fileid, 0, sizeof(fileid)); + r=fstat(fd, &statbuf); + assert(r==0); + fileid.st_dev = statbuf.st_dev; + fileid.st_ino = statbuf.st_ino; + for (extant = t->cachefiles; extant; extant=extant->next) { + if (memcmp(&extant->fileid, &fileid, sizeof(fileid))==0) { + close(fd); + extant->refcount++; + *cf = extant; + return 0; + } + } + { + CACHEFILE MALLOC(newcf); + newcf->next = t->cachefiles; + newcf->refcount = 1; + newcf->fd = fd; + newcf->cachetable = t; + newcf->fileid = fileid; + t->cachefiles = newcf; + *cf = newcf; + return 0; + } +} + +CACHEFILE remove_cf_from_list (CACHEFILE cf, CACHEFILE list) { + if (list==0) return 0; + else if (list==cf) { + return list->next; + } else { + list->next = remove_cf_from_list(cf, list->next); + return list; + } +} + +int cachefile_flush (CACHEFILE cf); + +int cachefile_close (CACHEFILE cf) { + assert(cf->refcount>0); + cf->refcount--; + if (cf->refcount==0) { + int r; + if ((r = cachefile_flush(cf))) return r; + r = close(cf->fd); + cf->cachetable->cachefiles = remove_cf_from_list(cf, cf->cachetable->cachefiles); + my_free(cf); + return r; + } else { + return 0; + } +} + +int 
cachetable_assert_all_unpinned (CACHETABLE t) { + int i; + int some_pinned=0; + for (i=0; itable_size; i++) { + PAIR p; + for (p=t->table[i]; p; p=p->hash_chain) { + assert(p->pinned>=0); + if (p->pinned) { + printf("%s:%d pinned: %lld (%p)\n", __FILE__, __LINE__, p->key, p->value); + some_pinned=1; + } + } + } + return some_pinned; +} + +int cachefile_assert_all_unpinned (CACHEFILE cf) { + int i; + int some_pinned=0; + CACHETABLE t = cf->cachetable; + for (i=0; itable_size; i++) { + PAIR p; + for (p=t->table[i]; p; p=p->hash_chain) { + assert(p->pinned>=0); + if (p->pinned && p->cachefile==cf) { + printf("%s:%d pinned: %lld (%p)\n", __FILE__, __LINE__, p->key, p->value); + some_pinned=1; + } + } + } + return some_pinned; +} + +static unsigned int hash_key (const char *key, int keylen) { + /* From Sedgewick. There are probably better hash functions. */ + unsigned int b = 378551; + unsigned int a = 63689; + unsigned int hash = 0; + int i; + for (i = 0; i < keylen; i++ ) { + hash = hash * a + key[i]; + a *= b; + } + return hash; +} + +static unsigned int hashit (CACHETABLE t, CACHEKEY key) { + return hash_key((char*)&key, sizeof(key))%t->table_size; +} + + +static void lru_remove (CACHETABLE t, PAIR p) { + if (p->next) { + p->next->prev = p->prev; + } else { + assert(t->tail==p); + t->tail = p->prev; + } + if (p->prev) { + p->prev->next = p->next; + } else { + assert(t->head==p); + t->head = p->next; + } + p->prev = p->next = 0; +} + +static void lru_add_to_list (CACHETABLE t, PAIR p) { + // requires that touch_me is not currently in the table. 
+ assert(p->prev==0); + p->prev = 0; + p->next = t->head; + if (t->head) { + t->head->prev = p; + } else { + assert(!t->tail); + t->tail = p; + } + t->head = p; +} + +static void lru_touch (CACHETABLE t, PAIR p) { + lru_remove(t,p); + lru_add_to_list(t,p); +} + +static PAIR remove_from_hash_chain (PAIR remove_me, PAIR list) { + if (remove_me==list) return list->hash_chain; + list->hash_chain = remove_from_hash_chain(remove_me, list->hash_chain); + return list; +} + +static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) { + unsigned int h = hashit(t, remove_me->key); + lru_remove(t, remove_me); + //printf("flush_callback(%lld,%p)\n", remove_me->key, remove_me->value); + WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, %p, dirty=%d, 0)\n", __FILE__, __LINE__, remove_me->key, remove_me->value, remove_me->extraargs, remove_me->dirty && write_me)); + remove_me->flush_callback(remove_me->cachefile, remove_me->key, remove_me->value, remove_me->dirty && write_me, 0); + t->n_in_table--; + // Remove it from the hash chain. + t->table[h] = remove_from_hash_chain (remove_me, t->table[h]); + my_free(remove_me); +} + +static void flush_and_keep (PAIR flush_me) { + if (flush_me->dirty) { + WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, %p, dirty=1, 0)\n", __FILE__, __LINE__, flush_me->key, flush_me->value, flush_me->extraargs)); + flush_me->flush_callback(flush_me->cachefile, flush_me->key, flush_me->value, 1, 1); + flush_me->dirty=0; + } +} + +static int maybe_flush_some (CACHETABLE t) { + again: + if (t->n_in_table>=t->table_size) { + /* Try to remove one. */ + PAIR remove_me; + for (remove_me = t->tail; remove_me; remove_me = remove_me->prev) { + if (!remove_me->pinned) { + flush_and_remove(t, remove_me, 1); + goto again; + } + } + /* All were pinned. 
*/ + printf("All are pinned\n"); + return 1; + } + return 0; +} + +int cachetable_put (CACHEFILE cachefile, CACHEKEY key, void*value, + void (*flush_callback)(CACHEFILE,CACHEKEY,void*, int /*write_me*/, int /*keep_me*/), + int (*fetch_callback)(CACHEFILE,CACHEKEY,void**,void*/*extraargs*/), + void*extraargs + ) { + int h = hashit(cachefile->cachetable, key); + PAIR p; + WHEN_TRACE_CT(printf("%s:%d CT cachetable_put(%lld)=%p\n", __FILE__, __LINE__, key, value)); + for (p=cachefile->cachetable->table[h]; p; p=p->hash_chain) { + if (p->key==key && p->cachefile==cachefile) { + // Semantically, these two asserts are not strictly right. After all, when are two functions eq? + // In practice, the functions better be the same. + assert(p->flush_callback==flush_callback); + assert(p->fetch_callback==fetch_callback); + return -1; /* Already present. */ + } + } + if (maybe_flush_some(cachefile->cachetable)) return -2; + + MALLOC(p); + p->pinned=1; + p->dirty =1; + p->key = key; + p->value = value; + p->next = p->prev = 0; + p->cachefile = cachefile; + p->flush_callback = flush_callback; + p->fetch_callback = fetch_callback; + p->extraargs = extraargs; + lru_add_to_list(cachefile->cachetable, p); + p->hash_chain = cachefile->cachetable->table[h]; + cachefile->cachetable->table[h] = p; + cachefile->cachetable->n_in_table++; + return 0; +} + +int cachetable_get_and_pin (CACHEFILE cachefile, CACHEKEY key, void**value, + void(*flush_callback)(CACHEFILE,CACHEKEY,void*,int write_me, int keep_me), + int(*fetch_callback)(CACHEFILE, CACHEKEY key, void**value,void*extraargs), /* If we are asked to fetch something, get it by calling this back. 
*/ + void*extraargs + ) { + CACHETABLE t = cachefile->cachetable; + int h = hashit(t,key); + PAIR p; + for (p=t->table[h]; p; p=p->hash_chain) { + if (p->key==key && p->cachefile==cachefile) { + *value = p->value; + p->pinned++; + lru_touch(t,p); + WHEN_TRACE_CT(printf("%s:%d cachtable_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value)); + return 0; + } + } + if (maybe_flush_some(t)) return -2; + { + void *my_value; + int r; + WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key)); + if ((r=fetch_callback(cachefile, key, &my_value,extraargs))) return r; + cachetable_put(cachefile, key, my_value, flush_callback, fetch_callback,extraargs); + *value = my_value; + } + WHEN_TRACE_CT(printf("%s:%d did fetch: cachtable_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value)); + return 0; +} + +int cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, void**value) { + CACHETABLE t = cachefile->cachetable; + int h = hashit(t,key); + PAIR p; + for (p=t->table[h]; p; p=p->hash_chain) { + if (p->key==key && p->cachefile==cachefile) { + *value = p->value; + p->pinned++; + lru_touch(t,p); + printf("%s:%d cachtable_maybe_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value); + return 0; + } + } + return -1; +} + + +int cachetable_unpin (CACHEFILE cachefile, CACHEKEY key, int dirty) { + CACHETABLE t = cachefile->cachetable; + int h = hashit(t,key); + PAIR p; + WHEN_TRACE_CT(printf("%s:%d unpin(%lld)\n", __FILE__, __LINE__, key)); + for (p=t->table[h]; p; p=p->hash_chain) { + if (p->key==key && p->cachefile==cachefile) { + assert(p->pinned>0); + p->pinned--; + p->dirty |= dirty; + return 0; + } + } + return 0; +} + +int cachetable_flush (CACHETABLE t) { + int i; + for (i=0; itable_size; i++) { + PAIR p; + while ((p = t->table[i])) + flush_and_remove(t, p, 1); // Must be careful, since flush_and_remove kills the linked list. 
+ } + return 0; +} + +int cachefile_flush (CACHEFILE cf) { + int i; + CACHETABLE t = cf->cachetable; + for (i=0; i<t->table_size; i++) { + PAIR p; + again: + p = t->table[i]; + while (p) { + if (p->cachefile==cf) { + flush_and_remove(t, p, 1); // Must be careful, since flush_and_remove kills the linked list. + goto again; + } else { + p=p->hash_chain; // A bucket is linked through hash_chain; next/prev belong to the LRU list. + } + } + } + return 0; +} + + +/* Require that it all be flushed. */ +int cachetable_close (CACHETABLE t) { + int i; + int r; + if ((r=cachetable_flush(t))) return r; + for (i=0; i<t->table_size; i++) { + if (t->table[i]) return -1; + } + my_free(t->table); + my_free(t); + return 0; +} + +int cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) { + /* Removing something already present is OK. */ + CACHETABLE t = cachefile->cachetable; + int h = hashit(t,key); + PAIR p; + for (p=t->table[h]; p; p=p->hash_chain) { + if (p->key==key && p->cachefile==cachefile) { + flush_and_remove(t, p, write_me); + return 0; + } + } + return 0; +} + +static int cachetable_fsync_pairs (CACHETABLE t, PAIR p) { + if (p) { + int r = cachetable_fsync_pairs(t, p->hash_chain); + if (r!=0) return r; + flush_and_keep(p); + } + return 0; +} + +int cachetable_fsync (CACHETABLE t) { + int i; + int r; + for (i=0; i<t->table_size; i++) { + r=cachetable_fsync_pairs(t, t->table[i]); + if (r!=0) return r; + } + return 0; +} + +#if 0 +int cachefile_pwrite (CACHEFILE cf, const void *buf, size_t count, off_t offset) { + ssize_t r = pwrite(cf->fd, buf, count, offset); + if (r==-1) return errno; + assert((size_t)r==count); + return 0; +} +int cachefile_pread (CACHEFILE cf, void *buf, size_t count, off_t offset) { + ssize_t r = pread(cf->fd, buf, count, offset); + if (r==-1) return errno; + if (r==0) return -1; /* No error for EOF ??? 
*/ + assert((size_t)r==count); + return 0; +} +#endif + +int cachefile_fd (CACHEFILE cf) { + return cf->fd; +} diff --git a/newbrt/cachetable.h b/newbrt/cachetable.h new file mode 100644 index 00000000000..a4474926b70 --- /dev/null +++ b/newbrt/cachetable.h @@ -0,0 +1,59 @@ +#ifndef CACHETABLE_H +#define CACHETABLE_H +#include + +/* Implement the cache table. */ + +typedef long long CACHEKEY; +typedef struct cachetable *CACHETABLE; +typedef struct cachefile *CACHEFILE; + +/* Maintain a cache mapping from cachekeys to values (void*) + * Some of the keys can be pinned. Don't pin too many or for too long. + * If the cachetable is too full, it will call the flush_callback() function with the key, the value, and the otherargs + and then remove the key-value pair from the cache. + * The callback won't be any of the currently pinned keys. + * Also when flushing an object, the cachetable drops all references to it, + * so you may need to free() it. + * Note: The cachetable should use a common pool of memory, flushing things across cachetables. + * (The first implementation doesn't) + * If you pin something twice, you must unpin it twice. + */ +int create_cachetable (CACHETABLE */*result*/, int /*n_entries*/); + +int cachetable_openf (CACHEFILE *,CACHETABLE, const char */*fname*/, int flags, mode_t mode); + +/* Error if already present. On success, pin the value. */ +int cachetable_put (CACHEFILE, CACHEKEY, void*/*value*/, + void(*flush_callback)(CACHEFILE, CACHEKEY key, void*value, int write_me, int keep_me), + int(*fetch_callback)(CACHEFILE, CACHEKEY key, void**value,void*extraargs), /* If we are asked to fetch something, get it by calling this back. */ + void*extraargs + ); + +int cachetable_get_and_pin (CACHEFILE, CACHEKEY, void**/*value*/, + void(*flush_callback)(CACHEFILE,CACHEKEY,void*,int write_me, int keep_me), + int(*fetch_callback)(CACHEFILE, CACHEKEY key, void**value,void*extraargs), /* If we are asked to fetch something, get it by calling this back. 
*/ + void*extraargs + ); + +/* If the the item is already in memory, then return 0 and store it in the void**. + * If the item is not in memory, then return nonzero. */ +int cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, void**); +int cachetable_unpin (CACHEFILE, CACHEKEY, int dirty); /* Note whether it is dirty when we unpin it. */ +int cachetable_remove (CACHEFILE, CACHEKEY, int /*write_me*/); /* Removing something already present is OK. */ +int cachetable_assert_all_unpinned (CACHETABLE); +int cachefile_assert_all_unpinned (CACHEFILE); + +//int cachetable_fsync_all (CACHETABLE); /* Flush everything to disk, but keep it in cache. */ +int cachetable_close (CACHETABLE); /* Flushes everything to disk, and destroys the cachetable. */ + +int cachefile_close (CACHEFILE); +//int cachefile_flush (CACHEFILE); /* Flush everything related to the VOID* to disk and free all memory. Don't destroy the cachetable. */ + +// Return on success (different from pread and pwrite) +//int cachefile_pwrite (CACHEFILE, const void *buf, size_t count, off_t offset); +//int cachefile_pread (CACHEFILE, void *buf, size_t count, off_t offset); + +int cachefile_fd (CACHEFILE); + +#endif diff --git a/newbrt/hashtable.c b/newbrt/hashtable.c new file mode 100644 index 00000000000..f6de079cf16 --- /dev/null +++ b/newbrt/hashtable.c @@ -0,0 +1,216 @@ +/* Hash table with chaining. */ +#include "hashtable.h" +#include "memory.h" +#include "../include/ydb-constants.h" +#include +#include +#include +#include + +#include "key.h" +#include "yerror.h" + +int hashtable_create (HASHTABLE *h) { + HASHTABLE MALLOC(tab); + int i; + if (tab==0) return -1; + tab->n_keys=0; + tab->arraysize=128; + assert(sizeof(*tab->array)==sizeof(void*)); + tab->array = my_calloc(tab->arraysize, sizeof(*tab->array)); + for (i=0; iarraysize; i++) tab->array[i]=0; + *h=tab; + return 0; +} + +static unsigned int hash_key (const char *key, ITEMLEN keylen) { + /* From Sedgewick. There are probably better hash functions. 
*/ + unsigned int b = 378551; + unsigned int a = 63689; + unsigned int hash = 0; + ITEMLEN i; + for (i = 0; i < keylen; i++ ) { + hash = hash * a + key[i]; + a *= b; + } + return hash; +} + +static void hash_find_internal (HASHTABLE tab, const char *key, ITEMLEN keylen, HASHELT *hashelt, HASHELT **prev_ptr) { + unsigned int h = hash_key (key, keylen) % tab->arraysize; + HASHELT he; + HASHELT *prev = &tab->array[h]; + for (he=*prev; he; prev=&he->next, he=*prev) { + if (keylen==he->keylen && memcmp(key, he->key, keylen)==0) { + *prev_ptr = prev; + *hashelt = he; + return; + } + } + *prev_ptr = prev; + *hashelt = 0; +} + +int hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen) { + HASHELT he, *prev_ptr; + hash_find_internal(tab, key, keylen, &he, &prev_ptr); + if (he==0) { + return -1; + } else { + *data = he->val; + *datalen = he->vallen; + return 0; + } +} + + +int hash_insert (HASHTABLE tab, const char *key, ITEMLEN keylen, const char *val, ITEMLEN vallen) +{ + unsigned int h = hash_key (key,keylen)%tab->arraysize; + { + HASHELT he,*prev_ptr; + hash_find_internal(tab, key, keylen, &he, &prev_ptr); + if (he!=0) { + return BRT_ALREADY_THERE; + } + } + { + /* Otherwise the key is not already present, so we need to add it. 
*/ + HASHELT MALLOC(he); + he->key = memdup(key, keylen); + he->keylen = keylen; + he->val = memdup(val, vallen); + he->vallen = vallen; + he->next = tab->array[h]; + tab->array[h]=he; + tab->n_keys++; + if (tab->n_keys > tab->arraysize) { + int newarraysize = tab->arraysize*2; + HASHELT *newarray = my_calloc(newarraysize, sizeof(*tab->array)); + int i; + assert(newarray!=0); + for (i=0; iarraysize; i++) { + while ((he=tab->array[i])!=0) { + h = hash_key(he->key, he->keylen)%newarraysize; + tab->array[i] = he->next; + he->next = newarray[h]; + newarray[h] = he; + } + } + my_free(tab->array); + // printf("Freed\n"); + tab->array=newarray; + tab->arraysize=newarraysize; + //printf("Done growing\n"); + } + return BRT_OK; + } +} + +int hash_delete (HASHTABLE tab, const char *key, ITEMLEN keylen) { + HASHELT he, *prev_ptr; + //printf("%s:%d deleting %s (bucket %d)\n", __FILE__, __LINE__, key, hash_key(key,keylen)%tab->arraysize); + hash_find_internal(tab, key, keylen, &he, &prev_ptr); + if (he==0) return DB_NOTFOUND; + else { + //printf("%s:%d deleting %s %s\n", __FILE__, __LINE__, he->key, he->val); + assert(*prev_ptr==he); + *prev_ptr = he->next; + //printf("Freeing %s %s\n", he->key, he->val); + my_free(he->key); + my_free(he->val); + my_free(he); + tab->n_keys--; + return BRT_OK; + } +} + + +int hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen) { + int i; + for (i=0; iarraysize; i++) { + HASHELT he=h->array[i]; + if (he) { + *key = he->key; + *keylen = he->keylen; + *data = he->val; + *datalen = he->vallen; + return 0; + } + } + return -1; +} + +#if 0 +int hashtable_find_last(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen) { + bytevec best_k=0, best_d; + ITEMLEN best_kl, best_dl; + HASHTABLE_ITERATE(h, this_k, this_kl, this_d, this_dl, + ({ + if (best_k==0 || keycompare(best_k, best_kl, this_k, this_kl)<0) { + best_k = this_k; + best_kl = this_kl; + best_d = this_d; + best_dl = this_dl; + 
+ } + })); + if (best_k) { + *key = best_k; + *keylen = best_kl; + *data = best_d; + *datalen = best_dl; + return 0; + } else { + return -1; + } +} +#endif + +void hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, void*args), void* args) { + /* + int i; + for (i=0; i<tab->arraysize; i++) { + HASHELT he; + for (he=tab->array[i]; he; he=he->next) { + f(he->key, he->keylen, he->val, he->vallen, args); + } + } + */ + HASHTABLE_ITERATE(tab, key, keylen, val, vallen, f(key,keylen,val,vallen,args)); +} + +int hashtable_n_entries(HASHTABLE tab) { + return tab->n_keys; +} + +/* Frees every element of the chain, including each element's key and value. */ +static void hasheltlist_free (HASHELT elt) { + if (elt==0) return; + else { + hasheltlist_free(elt->next); + my_free(elt->key); + my_free(elt->val); + my_free(elt); + } +} + +/* Clears the table (which frees every element together with its key and value), then frees the bucket array and the table itself, and sets *tab to 0. */ +void hashtable_free(HASHTABLE *tab) { + //printf("%s:%d free hashtable %p\n", __FILE__, __LINE__, tab); + hashtable_clear(*tab); + //printf("%s:%d free %p\n", __FILE__, __LINE__, tab);n + my_free((*tab)->array); + my_free(*tab); + *tab=0; +} + + +void hashtable_clear(HASHTABLE tab) { + int i; + for (i=0; i<tab->arraysize; i++) { + hasheltlist_free(tab->array[i]); + tab->array[i]=0; + } + tab->n_keys = 0; +} diff --git a/newbrt/hashtable.h b/newbrt/hashtable.h new file mode 100644 index 00000000000..89fcc58d6d4 --- /dev/null +++ b/newbrt/hashtable.h @@ -0,0 +1,58 @@ +#ifndef HASHTABLE_H +#define HASHTABLE_H + +#include "brttypes.h" +/* Hash table with chaining. */ +/* The keys and values are byte sequences. */ +/* The keys and values are malloc'd by the hashtable. */ + +typedef struct hashtable *HASHTABLE; + +int hashtable_create (HASHTABLE*); + +/* Return 0 if the key is found in the hashtable, -1 otherwise. */ +/* Warning: The data returned points to the internals of the hashtable. 
It is set to "const" to try to prevent you from messing it up. */ +int hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec*data, ITEMLEN *datalen); + +/* Insert copies of the key and data. If the key is already there, nothing is replaced and BRT_ALREADY_THERE is returned; otherwise BRT_OK is returned. */ +int hash_insert (HASHTABLE tab, const char *key, ITEMLEN keylen, const char *data, ITEMLEN datalen); + +/* Deleting something that isn't there changes nothing, but is reported by returning DB_NOTFOUND; a successful delete returns BRT_OK. */ +int hash_delete (HASHTABLE tab, const char *key, ITEMLEN keylen); +void hashtable_free(HASHTABLE *tab); +int hashtable_n_entries(HASHTABLE); + +void hashtable_clear(HASHTABLE); + +int hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen); +//int hashtable_find_last(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen); + +typedef struct hashelt *HASHELT; +struct hashelt { + char *key; ITEMLEN keylen; /* key is NULL for empty elements */ + char *val; ITEMLEN vallen; + HASHELT next; +}; + +struct hashtable { + int n_keys; + int arraysize; + HASHELT *array; +}; + +/* You cannot add or delete elements from the hashtable while iterating. */ +void hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,void*), void*); +// If you don't want to use something, do something like use "key __attribute__((__unused__))" for keyvar. 
+#define HASHTABLE_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,body) ({ \ + int hi_counter; \ + for (hi_counter=0; hi_counter<table->arraysize; hi_counter++) { \ + HASHELT hi_he; \ + for (hi_he=table->array[hi_counter]; hi_he; hi_he=hi_he->next) { \ + const char *keyvar = hi_he->key; \ + ITEMLEN keylenvar = hi_he->keylen; \ + const char *datavar = hi_he->val; \ + ITEMLEN datalenvar = hi_he->vallen; \ + body; \ + }}}) + +#endif diff --git a/newbrt/hashtest.c b/newbrt/hashtest.c new file mode 100644 index 00000000000..a92e1abf073 --- /dev/null +++ b/newbrt/hashtest.c @@ -0,0 +1,113 @@ +#include "key.h" +#include "hashtable.h" +#include +#include +#include +#include + +void verify_hash_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl, + int N, int *data, char *saw) { + char *kv = (char*)kv_v; + char *dv = (char*)dv_v; + int num, k; + assert(kv[0]=='k'); + assert(dv[0]=='d'); + assert(strcmp(kv+1, dv+1)==0); + assert(strlen(kv)+1==kl); + assert(strlen(dv)+1==dl); + num = atoi(kv+1); + for (k=0; k +#include +#include +#include + +int read_sint (int fd, int *result) { + unsigned char b[4]; + int r = read(fd, b, 4); + if (r!=4) return 1; + *result = (b[0]<<24) | (b[1]<<16) | (b[2]<<8) | (b[3]<<0); + return 0; +} +int read_uint (int fd, unsigned int *result) { + int sresult; + int r = read_sint(fd, &sresult); + if (r==0) { *result = (unsigned int)sresult; } /* Store the decoded value, not the status code. */ + return r; +} + +int write_int (int fd, unsigned int v) { + unsigned char b[4]; + int r; + b[0] = (v>>24)&0xff; + b[1] = (v>>16)&0xff; + b[2] = (v>>8)&0xff; + b[3] = (v>>0)&0xff; + r = write(fd, b, 4); + if (r!=4) return 1; + return 0; +} + +int read_diskoff (int fd, diskoff *result) { + unsigned int i0,i1; + int r; + r = read_uint(fd, &i0); if(r!=0) return r; + r = read_uint(fd, &i1); if(r!=0) return r; + *result = ((unsigned long long)i0)<<32 | ((unsigned long long)i1); + return 0; +} + +int write_diskoff (int fd, diskoff v) { + int r; + r = write_int(fd, (unsigned int)(v>>32)); if (r!=0) return r; + r = write_int(fd, 
(unsigned int)(v&0xffffffff)); if (r!=0) return r; + return 0; +} + +int read_bytes (int fd, int l, char *s) { + int r = read(fd, s, l); + if (r==l) return 0; + return -1; +} + +int write_bytes (int fd, int l, char *s) { + int r= write(fd, s, l); + if (r==l) return 0; + return -1; +} + +int read_brt_header (int fd, struct brt_header *header) { + { + off_t r = lseek(fd, 0, SEEK_SET); + assert(r==0); + } + /* Ignore magic for now. We'll need some magic at the beginning of the file. */ + { + int r; + r = read_uint(fd, &header->nodesize); + if (r!=0) return -1; + r = read_diskoff(fd, &header->freelist); assert(r==0); /* These asserts should do something smarter. */ + r = read_diskoff(fd, &header->unused_memory); assert(r==0); + r = read_sint(fd, &header->n_named_roots); assert(r==0); + if (header->n_named_roots>0) { + int i; + header->unnamed_root = -1; + MALLOC_N(header->n_named_roots, header->names); + MALLOC_N(header->n_named_roots, header->roots); + for (i=0; in_named_roots; i++) { + unsigned int l; + char *s; + r = read_diskoff(fd, &header->roots[i]); assert(r==0); + r = read_uint(fd, &l); assert(r==0); /* count includes the trailing null. 
*/ + MALLOC_N(l, s); + r = read_bytes(fd, l, s); assert(r==0); + assert(l>0 && s[l-1]==0); + header->names[i] = s; + } + } else { + r = read_diskoff(fd, &header->unnamed_root); assert(r==0); + header->names = 0; + header->roots = 0; + } + } + return 0; +} + +int read_brt_h_unused_memory (int fd, diskoff *unused_memory) { + off_t r = lseek(fd, 12, SEEK_SET); + assert(r==12); + r = read_diskoff(fd, unused_memory); + return r; +} + +int write_brt_h_unused_memory (int fd, diskoff unused_memory) { + off_t r = lseek(fd, 12, SEEK_SET); + assert(r==12); + r = write_diskoff(fd, unused_memory); + return r; +} diff --git a/newbrt/key.c b/newbrt/key.c new file mode 100644 index 00000000000..1d03012c11e --- /dev/null +++ b/newbrt/key.c @@ -0,0 +1,27 @@ +#include "brt-internal.h" +#include +#include + +int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len) { + if (key1len==key2len) { + return memcmp(key1,key2,key1len); + } else if (key1len0); + assert(keycompare("a",1, "aa",2)<0); + assert(keycompare("b",1, "aa",2)>0); + assert(keycompare("aa",2, "b",1)<0); + assert(keycompare("aaaba",5, "aaaba",5)==0); + assert(keycompare("aaaba",5, "aaaaa",5)>0); + assert(keycompare("aaaaa",5, "aaaba",5)<0); + assert(keycompare("aaaaa",3, "aaaba",3)==0); +} diff --git a/newbrt/key.h b/newbrt/key.h new file mode 100644 index 00000000000..2352430402a --- /dev/null +++ b/newbrt/key.h @@ -0,0 +1,5 @@ +#include "brttypes.h" + +int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); +void test_keycompare (void) ; + diff --git a/newbrt/mdict-test.c b/newbrt/mdict-test.c new file mode 100644 index 00000000000..4980394ccd4 --- /dev/null +++ b/newbrt/mdict-test.c @@ -0,0 +1,89 @@ +#include "mdict.h" +#include +#include +#include +#include + +void verify_mdict_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl, + int N, int *data, char *saw) { + char *kv = (char*)kv_v; + char *dv = (char*)dv_v; + int num, k; + assert(kv[0]=='k'); + 
assert(dv[0]=='d'); + assert(strcmp(kv+1, dv+1)==0); + assert(strlen(kv)+1==kl); + assert(strlen(dv)+1==dl); + num = atoi(kv+1); + for (k=0; kpma); + if (r==0) { + *mdict = result; + } + return r; +} + +void mdict_free (MDICT m) { + pma_free(m->pma); + my_free(m); +} + +int mdict_n_entries (MDICT m) { + return pma_n_entries(m->pma); +} + + +/* Returns an error if the key is already present. */ +/* The values returned should not be modified. */ +/* May damage the cursor. */ +int mdict_insert (MDICT m, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen) { + return pma_insert(m->pma, key, keylen, data, datalen); +} +/* This returns an error if the key is NOT present. */ +int mdict_replace (MDICT, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); +/* This returns an error if the key is NOT present. */ +int mdict_delete (MDICT m, bytevec key, ITEMLEN keylen) { + return pma_delete(m->pma, key, keylen); +} + +/* Exposes internals of the MDICT by returning a pointer to the guts. + * Don't modify the returned data. Don't free it. 
*/ +int mdict_lookup (MDICT m, bytevec key, ITEMLEN keylen, bytevec*data, ITEMLEN *datalen) { + return pma_lookup(m->pma, key, keylen, data, datalen); +} + + +int mdict_random_pick(MDICT m, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen) { + return pma_random_pick(m->pma, key, keylen, data, datalen); +} + +void mdict_iterate (MDICT m, void(*f)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*v) { + pma_iterate(m->pma, f, v); +} + + +#else +foo +#endif diff --git a/newbrt/mdict.h b/newbrt/mdict.h new file mode 100644 index 00000000000..e7824863cbf --- /dev/null +++ b/newbrt/mdict.h @@ -0,0 +1,83 @@ +#ifndef MDICT_H +#define MDICT_H + +#include "brttypes.h" + +//#define USEPMA +#define USEHASH + +int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); + +#ifdef USEPMA + +#include "pma.h" +#define MDICT PMA +#define MDICT_OK PMA_OK +#define MDICT_NOTFOUND PMA_NOTFOUND +#define mdict_free pma_free +#define mdict_n_entries pma_n_entries +#define MDICT_ITERATE PMA_ITERATE +#define mdict_insert pma_insert +#define mdict_create pma_create +#define mdict_delete pma_delete +#define mdict_lookup pma_lookup +#define mdict_random_pick pma_random_pick +#define mdict_iterate pma_iterate + +#elif defined(USEHASH) +#include "hashtable.h" +#define MDICT HASHTABLE +#define MDICT_OK 0 +#define MDICT_NOTFOUND -1 +#define MDICT_ALREADY_THERE -2 +#define mdict_free hashtable_free +#define mdict_n_entries hashtable_n_entries +#define MDICT_ITERATE HASHTABLE_ITERATE +#define mdict_insert hash_insert +#define mdict_create hashtable_create +#define mdict_delete hash_delete +#define mdict_lookup hash_find +#define mdict_random_pick hashtable_random_pick +#define mdict_iterate hashtable_iterate +#define mdict_find_last hashtable_find_last + +#else +/* In-memory dictionary. 
*/ + + +enum mdict_errors { MDICT_OK=0, MDICT_NOTFOUND = -1, MDICT_ALREADY_THERE = -2 }; + +typedef struct mdict *MDICT; + +int mdict_create (MDICT*); +void mdict_free (MDICT); + +int mdict_n_entries (MDICT); + +/* Returns an error if the key is already present. */ +/* The values returned should not be modified. */ +/* May damage the cursor. */ +int mdict_insert (MDICT, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); +/* This returns an error if the key is NOT present. */ +int mdict_replace (MDICT, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); +/* This returns an error if the key is NOT present. */ +int mdict_delete (MDICT, bytevec key, ITEMLEN keylen); + +/* Exposes internals of the MDICT by returning a pointer to the guts. + * Don't modify the returned data. Don't free it. */ +int mdict_lookup (MDICT, bytevec key, ITEMLEN keylen, bytevec*data, ITEMLEN *datalen); + + +int mdict_random_pick(MDICT, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen); + +void mdict_iterate (MDICT, void(*)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*); + +#define MDICT_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,body) ({ \ + void __do_iterate(bytevec keyvar, ITEMLEN keylenvar, bytevec datavar, ITEMLEN datalenvar, void *__ignore __attribute__((__unused__))) { \ + body; \ + } \ + mdict_iterate(table,__do_iterate, 0); \ +}) + +#endif +#endif diff --git a/newbrt/memory.c b/newbrt/memory.c new file mode 100644 index 00000000000..ec0dff6b56a --- /dev/null +++ b/newbrt/memory.c @@ -0,0 +1,191 @@ +#include "memory.h" +#include +#include +#include +#include +#include + +int memory_check=1; + +#define WHEN_MEM_DEBUG(x) ({if (memory_check) ({x});}) + + +long long n_items_malloced=0; + +/* Memory checking */ +enum { items_limit = 1000 }; +int overflowed=0; +static void *items[items_limit]; +static long sizes[items_limit]; + +void note_did_malloc (void *p, long size) { + WHEN_MEM_DEBUG( + if (n_items_malloced%p\n", __FILE__, __LINE__, nmemb, size, 
r); + note_did_malloc(r, nmemb*size); + //if ((long)r==0x80523f8) { printf("%s:%d %p\n", __FILE__, __LINE__, r); } + return r; +} +void *my_malloc(long size) { + void * r; + errno=0; + r=actual_malloc(size); + //printf("%s:%d malloc(%ld)->%p\n", __FILE__, __LINE__, size,r); + note_did_malloc(r, size); + //if ((long)r==0x80523f8) { printf("%s:%d %p size=%ld\n", __FILE__, __LINE__, r, size); } + return r; +} +void *tagmalloc(unsigned long size, int typtag) { + void *r = my_malloc(size); + assert(size>sizeof(int)); + ((int*)r)[0] = typtag; + return r; +} + +void *my_realloc(void *p, long size) { + void *newp; + note_did_free(p); + errno=0; + newp = actual_realloc(p, size); + //printf("%s:%d realloc(%p,%ld)-->%p\n", __FILE__, __LINE__, p, size, newp); + note_did_malloc(newp, size); + return newp; +} + +void my_free(void* p) { + //printf("%s:%d free(%p)\n", __FILE__, __LINE__, p); + note_did_free(p); + actual_free(p); +} + +void *memdup (const void *v, unsigned int len) { + void *r=my_malloc(len); + memcpy(r,v,len); + return r; +} +char *mystrdup (const char *s) { + return memdup(s, strlen(s)+1); +} + +void memory_check_all_free (void) { + if (n_items_malloced>0) { + printf("n_items_malloced=%lld\n", n_items_malloced); + if (memory_check) + printf(" one item is %p size=%ld\n", items[0], sizes[0]); + } + assert(n_items_malloced==0); +} + +int get_n_items_malloced (void) { return n_items_malloced; } +void print_malloced_items (void) { + int i; + for (i=0; i + +/* errno is set to 0 or a value to indicate problems. 
*/ +void *my_calloc(long nmemb, long size); +void *my_malloc(long size); +void *tagmalloc(unsigned long size, int typ); +void my_free(void*); +void *my_realloc(void *, long size); + +#define MALLOC(v) v = my_malloc(sizeof(*v)) +#define MALLOC_N(n,v) v = my_malloc((n)*sizeof(*v)) + +#define TAGMALLOC(t,v) t v = tagmalloc(sizeof(*v), TYP_ ## t); + +void *memdup (const void *v, unsigned int len); +char *mystrdup (const char *s); + +void memory_check_all_free (void); +void do_memory_check(void); + +extern int memory_check; // Set to nonzero to get a (much) slower version of malloc that does (much) more checking. + +int get_n_items_malloced(void); +void print_malloced_items(void); diff --git a/newbrt/myassert.c b/newbrt/myassert.c new file mode 100644 index 00000000000..678088f82a1 --- /dev/null +++ b/newbrt/myassert.c @@ -0,0 +1,9 @@ +#include "myassert.h" +#include +#include + +#ifdef TESTER +void my_assert(int a, const char *f, int l) { + if (!a) { fprintf(stderr, "Assertion failed at %s:%d\n", f, l); abort(); } +} +#endif diff --git a/newbrt/myassert.h b/newbrt/myassert.h new file mode 100644 index 00000000000..0ca88a8fe1a --- /dev/null +++ b/newbrt/myassert.h @@ -0,0 +1,11 @@ +#ifndef MYASSERT_H +#define MYASSERT_H + +#ifndef TESTER +#include +#else +extern void my_assert(int, const char *, int); +#define assert(x) my_assert(x, __FILE__, __LINE__) +#endif + +#endif diff --git a/newbrt/pma-internal.h b/newbrt/pma-internal.h new file mode 100644 index 00000000000..19574ed5e9e --- /dev/null +++ b/newbrt/pma-internal.h @@ -0,0 +1,38 @@ +#include "pma.h" + +struct pair { + bytevec key; /* NULL for empty slots */ + int keylen; + bytevec val; + int vallen; +}; + +struct pma_cursor { + PMA pma; + int position; /* -1 if the position is undefined. */ + PMA_CURSOR next,prev; + void *skey, *sval; /* used in dbts. */ +}; + +struct pma { + enum typ_tag tag; + int N; /* How long is the array? Always a power of two >= 4. 
*/ + int n_pairs_present; /* How many array elements are non-null. */ + struct pair *pairs; + int uplgN; /* The smallest power of two >= lg(N) */ + double densitystep; /* Each doubling decreases the density by densitystep. + * For example if array_len=256 and uplgN=8 then there are 5 doublings. + * Regions of size 8 are full. Regions of size 16 are 90% full. + * Regions of size 32 are 80% full. Regions of size 64 are 70% full. + * Regions of size 128 are 60% full. Regions of size 256 are 50% full. + * The densitystep is 0.10. */ + PMA_CURSOR cursors_head, cursors_tail; +}; + +int pmainternal_count_region (struct pair *pairs, int lo, int hi); +void pmainternal_calculate_parameters (PMA pma); +int pmainternal_smooth_region (struct pair *pairs, int n, int idx); +int pmainternal_printpairs (struct pair *pairs, int N); +int pmainternal_make_space_at (PMA pma, int idx); +int pmainternal_find (PMA pma, bytevec key, int keylen); +void print_pma (PMA pma); /* useful for debugging, so keep the name short. 
I.e., not pmainternal_print_pma() */ diff --git a/newbrt/pma-test.c b/newbrt/pma-test.c new file mode 100644 index 00000000000..51a2985937b --- /dev/null +++ b/newbrt/pma-test.c @@ -0,0 +1,444 @@ +#include "pma-internal.h" +#include "../include/ydb-constants.h" +#include "memory.h" +#include "key.h" +#include +#include +#include +#include + +static void test_make_space_at (void) { + PMA pma; + int r=pma_create(&pma); + assert(r==0); + assert(pma_n_entries(pma)==0); + r=pmainternal_make_space_at(pma, 2); + assert(pma_index_limit(pma)==4); + assert((unsigned long)pma->pairs[pma_index_limit(pma)].key==0xdeadbeefL); + print_pma(pma); + + pma->pairs[2].key="A"; + pma->n_pairs_present++; + r=pmainternal_make_space_at(pma,2); + printf("Requested space at 2, got space at %d\n", r); + print_pma(pma); + assert(pma->pairs[r].key==0); + assert((unsigned long)pma->pairs[pma_index_limit(pma)].key==0xdeadbeefL); + + assert(pma_index_limit(pma)==4); + pma->pairs[0].key="A"; + pma->pairs[1].key="B"; + pma->pairs[2].key=0; + pma->pairs[3].key=0; + pma->n_pairs_present=2; + print_pma(pma); + r=pmainternal_make_space_at(pma,0); + printf("Requested space at 0, got space at %d\n", r); + print_pma(pma); + assert((unsigned long)pma->pairs[pma_index_limit(pma)].key==0xdeadbeefL); // make sure it doesn't go off the end. 
+ + assert(pma_index_limit(pma)==8); + pma->pairs[0].key = "A"; + pma->pairs[1].key = 0; + pma->pairs[2].key = 0; + pma->pairs[3].key = 0; + pma->pairs[4].key = "B"; + pma->pairs[5].key = 0; + pma->pairs[6].key = 0; + pma->pairs[7].key = 0; + pma->n_pairs_present=2; + print_pma(pma); + r=pmainternal_make_space_at(pma,5); + print_pma(pma); + printf("r=%d\n", r); + { + int i; + for (i=0; ipairs[i].key) { + assert(ipairs[i].key=0; // zero it so that we don't mess things up on free + pma->pairs[i].val=0; + } + } + r=pma_free(&pma); assert(r==0); + assert(pma==0); +} + +static void test_pma_find (void) { + PMA pma; + int i; + int r; + const int N = 16; + MALLOC(pma); + MALLOC_N(N,pma->pairs); + // All that is needed to test pma_find is N and pairs. + pma->N = N; + for (i=0; ipairs[i].key=0; + assert(pma_index_limit(pma)==N); + r=pmainternal_find(pma, "hello", 5); + assert(r==0); + + pma->pairs[5].key="hello"; + pma->pairs[5].keylen=5; + assert(pma_index_limit(pma)==N); + r=pmainternal_find(pma, "hello", 5); + assert(pma_index_limit(pma)==N); + assert(r==5); + r=pmainternal_find(pma, "there", 5); + assert(r==6); + r=pmainternal_find(pma, "aaa", 3); + assert(r==0); + + pma->pairs[N-1].key="there"; + pma->pairs[N-1].keylen=5; + r=pmainternal_find(pma, "hello", 5); + assert(r==5); + r=pmainternal_find(pma, "there", 5); + assert(r==N-1); + r=pmainternal_find(pma, "aaa", 3); + assert(r==0); + r=pmainternal_find(pma, "hellob", 6); + assert(r==6); + r=pmainternal_find(pma, "zzz", 3); + assert(r==N); + my_free(pma->pairs); + my_free(pma); +} + +void test_smooth_region_N (int N) { + struct pair pairs[N]; + char *strings[100]; + char string[100]; + int i; + int len; + if (N<10) len=1; + else if (N<100) len=2; + else len=8; + for (i=0; ir); + } + } + assert(cleari==0); + } + } + } +} + + +void test_smooth_region6 (void) { + enum {N=7}; + struct pair pairs[N] = {{.key="A"},{.key="B"},{.key=0},{.key=0},{.key=0},{.key=0},{.key=0}}; + int r = pmainternal_smooth_region(pairs, N, 2); + 
printf("{%s %s %s %s %s %s %s} %d\n", + (char*)pairs[0].key, (char*)pairs[1].key, (char*)pairs[2].key, (char*)pairs[3].key, (char*)pairs[4].key, (char*)pairs[5].key, (char*)pairs[6].key, + r); +} + + +static void test_smooth_region (void) { + test_smooth_region_N(4); + test_smooth_region_N(5); + test_smooth_region6(); +} + +static void test_calculate_parameters (void) { + struct pma pma; + pma.N=4; pmainternal_calculate_parameters(&pma); assert(pma.uplgN==2); assert(pma.densitystep==0.5); + pma.N=8; pmainternal_calculate_parameters(&pma); assert(pma.uplgN==4); assert(pma.densitystep==0.5); + +} + +static void test_count_region (void) { + struct pair pairs[4]={{.key=0},{.key=0},{.key=0},{.key=0}}; + assert(pmainternal_count_region(pairs,0,4)==0); + assert(pmainternal_count_region(pairs,2,4)==0); + assert(pmainternal_count_region(pairs,0,2)==0); + pairs[2].key="A"; + assert(pmainternal_count_region(pairs,0,4)==1); + assert(pmainternal_count_region(pairs,2,4)==1); + assert(pmainternal_count_region(pairs,0,2)==0); + assert(pmainternal_count_region(pairs,2,2)==0); + assert(pmainternal_count_region(pairs,2,3)==1); + pairs[3].key="B"; + pairs[0].key="a"; + assert(pmainternal_count_region(pairs,0,4)==3); +} + +static void test_pma_random_pick (void) { + PMA pma; + int r = pma_create(&pma); + bytevec key,val; + ITEMLEN keylen,vallen; + assert(r==0); + r = pma_random_pick(pma, &key, &keylen, &val, &vallen); + assert(r==DB_NOTFOUND); + r = pma_insert(pma, "hello", 6, "there", 6); + assert(r==BRT_OK); + r = pma_random_pick(pma, &key, &keylen, &val, &vallen); + assert(r==0); + assert(keylen==6); assert(vallen==6); + assert(strcmp(key,"hello")==0); + assert(strcmp(val,"there")==0); + r = pma_delete(pma, "nothello", 9); + assert(r==DB_NOTFOUND); + r = pma_delete(pma, "hello", 6); + assert(r==BRT_OK); + + r = pma_random_pick(pma, &key, &keylen, &val, &vallen); + assert(r==DB_NOTFOUND); + + r = pma_insert(pma, "hello", 6, "there", 6); + assert(r==BRT_OK); + + + r = 
pma_random_pick(pma, &key, &keylen, &val, &vallen); + assert(r==0); + assert(keylen==6); assert(vallen==6); + assert(strcmp(key,"hello")==0); + assert(strcmp(val,"there")==0); + + r = pma_insert(pma, "aaa", 4, "athere", 7); assert(r==BRT_OK); + r = pma_insert(pma, "aab", 4, "bthere", 7); assert(r==BRT_OK); + r = pma_insert(pma, "aac", 4, "cthere", 7); assert(r==BRT_OK); + r = pma_insert(pma, "aad", 4, "dthere", 7); assert(r==BRT_OK); + r = pma_insert(pma, "aae", 4, "ethere", 7); assert(r==BRT_OK); + r = pma_insert(pma, "aaf", 4, "fthere", 7); assert(r==BRT_OK); + r = pma_insert(pma, "aag", 4, "gthere", 7); assert(r==BRT_OK); + r = pma_delete(pma, "aaa", 4); assert(r==BRT_OK); + r = pma_delete(pma, "aab", 4); assert(r==BRT_OK); + r = pma_delete(pma, "aac", 4); assert(r==BRT_OK); + r = pma_delete(pma, "aad", 4); assert(r==BRT_OK); + r = pma_delete(pma, "aae", 4); assert(r==BRT_OK); + r = pma_delete(pma, "aag", 4); assert(r==BRT_OK); + r = pma_delete(pma, "hello", 6); assert(r==BRT_OK); + + r = pma_random_pick(pma, &key, &keylen, &val, &vallen); + assert(r==0); + assert(keylen==4); assert(vallen==7); + assert(strcmp(key,"aaf")==0); + assert(strcmp(val,"fthere")==0); + r=pma_free(&pma); assert(r==0); + assert(pma==0); +} + +static void test_find_insert (void) { + PMA pma; + int r; + bytevec dv; + ITEMLEN dl; + pma_create(&pma); + r=pma_lookup(pma, "aaa", 3, &dv, &dl); + assert(r==DB_NOTFOUND); + + r=pma_insert(pma, "aaa", 3, "aaadata", 7); + assert(r==BRT_OK); + + dv=0; dl=0; + r=pma_lookup(pma, "aaa", 3, &dv, &dl); + assert(r==BRT_OK); + assert(keycompare(dv,dl,"aaadata", 7)==0); + + r=pma_insert(pma, "bbb", 4, "bbbdata", 8); + assert(r==BRT_OK); + + r=pma_lookup(pma, "aaa", 3, &dv, &dl); + assert(r==BRT_OK); + assert(keycompare(dv,dl,"aaadata", 7)==0); + + r=pma_lookup(pma, "bbb", 4, &dv, &dl); + assert(r==BRT_OK); + assert(keycompare(dv,dl,"bbbdata", 8)==0); + + assert((unsigned long)pma->pairs[pma_index_limit(pma)].key==0xdeadbeefL); + + r=pma_insert(pma, "00000", 
6, "d0", 3); + assert(r==BRT_OK); + + assert((unsigned long)pma->pairs[pma_index_limit(pma)].key==0xdeadbeefL); + + r=pma_free(&pma); assert(r==0); assert(pma==0); + pma_create(&pma); assert(pma!=0); + + { + int i; + for (i=0; i<100; i++) { + char string[10]; + char dstring[10]; + snprintf(string,10,"%05d",i); + snprintf(dstring,10,"d%d", i); + printf("Inserting %d: string=%s dstring=%s\n", i, string, dstring); + r=pma_insert(pma, string, strlen(string)+1, dstring, strlen(dstring)+1); + assert(r==BRT_OK); + } + } + r=pma_free(&pma); assert(r==0); assert(pma==0); +} + +static int tpi_k,tpi_v; +static void do_sum_em (bytevec key, ITEMLEN keylen, bytevec val, ITEMLEN vallen, void *v) { + assert((unsigned long)v==0xdeadbeefL); + assert(strlen(key)+1==keylen); + assert(strlen(val)+1==vallen); + tpi_k += atoi(key); + tpi_v += atoi(val); +} + +static void test_pma_iterate_internal (PMA pma, int expected_k, int expected_v) { + tpi_k=tpi_v=0; + pma_iterate(pma, do_sum_em, (void*)0xdeadbeefL); + assert(tpi_k==expected_k); + assert(tpi_v==expected_v); +} + +static void test_pma_iterate (void) { + PMA pma; + int r; + pma_create(&pma); + r=pma_insert(pma, "42", 3, "-19", 4); + assert(r==BRT_OK); + test_pma_iterate_internal(pma, 42, -19); + + r=pma_insert(pma, "12", 3, "-100", 5); + assert(r==BRT_OK); + test_pma_iterate_internal(pma, 42+12, -19-100); + r=pma_free(&pma); assert(r==0); assert(pma==0); +} + +static void test_pma_iterate2 (void) { + PMA pma0,pma1; + int r; + int sum=0; + int n_items=0; + r=pma_create(&pma0); assert(r==0); + r=pma_create(&pma1); assert(r==0); + pma_insert(pma0, "a", 2, "aval", 5); + pma_insert(pma0, "b", 2, "bval", 5); + pma_insert(pma1, "x", 2, "xval", 5); + PMA_ITERATE(pma0,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)),dl, (n_items++,sum+=kl+dl)); + PMA_ITERATE(pma1,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)), dl, (n_items++,sum+=kl+dl)); + assert(sum==21); + assert(n_items==3); + r=pma_free(&pma0); 
assert(r==0); assert(pma0==0); + r=pma_free(&pma1); assert(r==0); assert(pma1==0); +} + +/* Check to see if we can create and kill a cursor. */ +void test_pma_cursor_0 (void) { + PMA pma; + PMA_CURSOR c=0; + int r; + r=pma_create(&pma); assert(r==0); + r=pma_cursor(pma, &c); assert(r==0); assert(c!=0); + printf("%s:%d\n", __FILE__, __LINE__); + r=pma_free(&pma); assert(r!=0); /* didn't deallocate the cursor. */ + printf("%s:%d\n", __FILE__, __LINE__); + r=pma_cursor_free(&c); assert(r==0); + printf("%s:%d\n", __FILE__, __LINE__); + r=pma_free(&pma); assert(r==0); /* did deallocate the cursor. */ +} + +/* Make sure we can free the cursors in any order. There is a doubly linked list of cursors + * and if we free them in a different order, then different unlinking code is invoked. */ +void test_pma_cursor_1 (void) { + PMA pma; + PMA_CURSOR c0=0,c1=0,c2=0; + int r; + int order; + for (order=0; order<6; order++) { + r=pma_create(&pma); assert(r==0); + r=pma_cursor(pma, &c0); assert(r==0); assert(c0!=0); + r=pma_cursor(pma, &c1); assert(r==0); assert(c1!=0); + r=pma_cursor(pma, &c2); assert(r==0); assert(c2!=0); + + r=pma_free(&pma); assert(r!=0); + + if (order<2) { r=pma_cursor_free(&c0); assert(r==0); c0=c1; c1=c2; } + else if (order<4) { r=pma_cursor_free(&c1); assert(r==0); c1=c2; } + else { r=pma_cursor_free(&c2); assert(r==0); } + + r=pma_free(&pma); assert(r!=0); + + if (order%2==0) { r=pma_cursor_free(&c0); assert(r==0); c0=c1; } + else { r=pma_cursor_free(&c1); assert(r==0); } + + r=pma_free(&pma); assert(r!=0); + + r = pma_cursor_free(&c0); assert(r==0); + + r=pma_free(&pma); assert(r==0); + } +} + +void test_pma_cursor_2 (void) { + PMA pma; + PMA_CURSOR c=0; + int r; + DBT key,val; + ybt_init(&key); key.flags=DB_DBT_REALLOC; + ybt_init(&val); val.flags=DB_DBT_REALLOC; + r=pma_create(&pma); assert(r==0); + r=pma_cursor(pma, &c); assert(r==0); assert(c!=0); + r=pma_cursor_set_position_last(c); assert(r==DB_NOTFOUND); + r=pma_cursor_free(&c); assert(r==0); + 
r=pma_free(&pma); assert(r==0); +} + +void test_pma_cursor (void) { + test_pma_cursor_0(); + test_pma_cursor_1(); + test_pma_cursor_2(); +} + +void pma_tests (void) { + memory_check=1; + test_pma_iterate(); memory_check_all_free(); + test_pma_iterate2(); memory_check_all_free(); + test_make_space_at(); memory_check_all_free(); + test_smooth_region(); memory_check_all_free(); + test_find_insert(); memory_check_all_free(); + test_pma_find(); memory_check_all_free(); + test_calculate_parameters(); memory_check_all_free(); + test_count_region(); memory_check_all_free(); + test_keycompare(); memory_check_all_free(); + test_pma_random_pick(); memory_check_all_free(); + test_pma_cursor(); memory_check_all_free(); +} + +int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { + pma_tests(); + return 0; +} diff --git a/newbrt/pma.c b/newbrt/pma.c new file mode 100644 index 00000000000..f79539752a0 --- /dev/null +++ b/newbrt/pma.c @@ -0,0 +1,548 @@ +/* An in-memory Packed Memory Array dictionary. + The keys and values are arrays of bytes, but are not necessarily kept in scan order. + Only the pointers are kept. + */ + +#include "pma-internal.h" +#include "key.h" +#include "memory.h" +#include "myassert.h" +#include "../include/ydb-constants.h" +#include +#include + +/* Only needed for testing. */ +#include + + +int pma_n_entries (PMA pma) { + return pma->n_pairs_present; +} + +int pma_index_limit (PMA pma) { + return pma->N; +} +int pmanode_valid (PMA pma, int i) { + assert(0<=i); assert(ipairs[i].key!=0; +} +bytevec pmanode_key (PMA pma, int i) { + assert(0<=i); assert(ipairs[i].key; +} +ITEMLEN pmanode_keylen (PMA pma, int i) { + assert(0<=i); assert(ipairs[i].keylen; +} +bytevec pmanode_val (PMA pma, int i) { + assert(0<=i); assert(ipairs[i].val; +} +ITEMLEN pmanode_vallen (PMA pma, int i) { + assert(0<=i); assert(ipairs[i].vallen; +} + +/* Could pick the same one every time if we wanted. 
*/ +int pma_random_pick(PMA pma, bytevec *key, ITEMLEN *keylen, bytevec *val, ITEMLEN *vallen) { +#if 1 + int i; + /* For now a simple implementation where we simply start at the beginning and look. */ + for (i=0; ipairs[i].key) { + *key = pmanode_key(pma,i); + *keylen = pmanode_keylen(pma,i); + *val = pmanode_val(pma,i); + *vallen = pmanode_vallen(pma,i); + return 0; + } + } + return DB_NOTFOUND; +#else + /* Maybe we should pick a random item to remove in order to reduce the unbalancing. */ + int i; + int l = pma_index_limit(pma); + int r = random()%l; + /* For now a simple implementation where we simply start at the beginning and look. */ + for (i=0; ipairs[ir].key) { + *key = pmanode_key(pma,ir); + *keylen = pmanode_keylen(pma,ir); + *val = pmanode_val(pma,ir); + *vallen = pmanode_vallen(pma,ir); + return 0; + } + } + return DB_NOTFOUND; + +#endif +} + +static int pma_count_finds=0; +static int pma_count_divides=0; +static int pma_count_scans=0; +void pma_show_stats (void) { + printf("%d finds, %d divides, %d scans\n", pma_count_finds, pma_count_divides, pma_count_scans); +} + +// Return the smallest index such that no lower index contains a larger key. +// This will be in the range 0 (inclusive) to pma_index_limit(pma) (inclusive). +// Thus the returned index may not be a valid index into the array if it is == pma_index_limit(pma) +// For example: if the array is empty, that means we return 0. +// For example: if the array is full of small keys, that means we return pma_index_limit(pma), which is off the end of the array. +// For example: if the array is full of large keys, then we return 0. +int pmainternal_find (PMA pma, bytevec key, int keylen) { + int lo=0, hi=pma_index_limit(pma); + /* lo and hi are the minimum and maximum values (inclusive) that we could possibly return. */ + pma_count_finds++; + while (lopairs[mid].key!=0) { + // Found one. 
+ int cmp = keycompare(key,keylen, pma->pairs[mid].key, pma->pairs[mid].keylen); + if (cmp==0) return mid; + else if (cmp<0) { + /* key is smaller than the midpoint, so look in the low half. */ + hi = (lo+hi)/2; /* recalculate the midpoint, since mid is not necessarily the midpoint now. */ + pma_count_divides++; + goto next_range; + } else { + /* key is larger than the midpoint. So look in the high half. */ + lo = mid+1; /* The smallest value we could want to return is lo. */ + pma_count_divides++; + goto next_range; + } + /* Not reached */ + } + pma_count_scans++; + } + /* If we got here, all from mid to hi were null, so adjust hi to the midpoint. */ + /* If the whole array is null, we'll end up returning index 0, which is good. */ + hi = (lo+hi)/2; + pma_count_divides++; + next_range: ; /* We have adjusted lo and hi, so look again. */ + } + assert(0<=lo); + assert(lo==hi); + assert(hi <= pma_index_limit(pma)); + /* If lo points at something, the something should not be smaller than key. */ + if (lo>0 && lo < pma_index_limit(pma) && pma->pairs[lo].key) { + //printf("lo=%d\n", lo); + assert(0 >= keycompare(key, keylen, pma->pairs[lo].key, pma->pairs[lo].keylen)); + } + return lo; +} + +//int min (int i, int j) { if (in_pairs_present); + count=pmainternal_printpairs(pma->pairs, pma_index_limit(pma)); + printf("\n"); + assert(count==pma->n_pairs_present); +} + +/* Smooth the data, and return the location of the null. 
*/ +int distribute_data (struct pair *destpairs, int dcount, + struct pair *sourcepairs, int scount) { + assert(scount<=dcount); + if (scount==0) { + return -1; + } + if (scount==1) { + *destpairs=*sourcepairs; + if (destpairs->key==0) return 0; + else return -1; + } else { + int r1 = distribute_data(destpairs, dcount/2, + sourcepairs, scount/2); + int r2 = distribute_data(destpairs +dcount/2, dcount-dcount/2, + sourcepairs+scount/2, scount-scount/2); + assert(r1==-1 || r2==-1); + if (r1!=-1) return r1; + else if (r2!=-1) return r2+dcount/2; + else return -1; + } +} + +/* spread the non-empty pairs around. There are n of them. Create an empty slot just before the IDXth + element, and return that slot's index in the smoothed array. */ +int pmainternal_smooth_region (struct pair *pairs, int n, int idx) { + int i; + int n_present=0; + for (i=0; i=lgN) { + n_divisions++; + N/=2; + } + pma->uplgN=N; + //printf("uplgN = %d n_divisions=%d\n", pma->uplgN, n_divisions); + assert(n_divisions>0); + pma->densitystep = 0.5/n_divisions; +} + +int pmainternal_count_region (struct pair *pairs, int lo, int hi) { + int n=0; + while (loN = 4; + result->n_pairs_present = 0; + MALLOC_N((1+result->N),result->pairs); + result->pairs[result->N].key = (void*)0xdeadbeef; + //printf("pairs=%p (size=%d)\n", result->pairs,result->N*sizeof(*result->pairs)); + if (result->pairs==0) { + my_free(result); + return -1; + } + for (i=0; iN; i++) { + result->pairs[i].key = 0; + result->pairs[i].keylen = 0; + result->pairs[i].val = 0; + result->pairs[i].vallen = 0; + } + pmainternal_calculate_parameters(result); + result->cursors_head = result->cursors_tail = 0; + *pma = result; + assert((unsigned long)result->pairs[result->N].key==0xdeadbeefL); + return 0; +} + + +int pma_cursor (PMA pma, PMA_CURSOR *cursp) { + PMA_CURSOR MALLOC(curs); + if (errno!=0) return errno; + assert(curs!=0); + curs->position=-1; /* undefined */ + if (pma->cursors_head) { + pma->cursors_head->prev = curs; + } else { + 
pma->cursors_tail = curs; + } + curs->next = pma->cursors_head; + curs->prev = 0; + curs->pma = pma; + curs->skey = 0; + curs->sval=0; + pma->cursors_head = curs; + *cursp=curs; + return 0; +} + +int pma_cursor_set_position_last (PMA_CURSOR c) +{ + PMA pma = c->pma; + c->position=pma->N-1; + while (c->pma->pairs[c->position].key==0) { + if (c->position>0) c->position--; + else return DB_NOTFOUND; + } + return 0; +} + +int pma_cursor_set_position_first (PMA_CURSOR c) +{ + PMA pma = c->pma; + c->position=0; + while (c->pma->pairs[c->position].key==0) { + if (c->position+1N) c->position++; + else return DB_NOTFOUND; + } + return 0; +} + +int pma_cget_current (PMA_CURSOR c, DBT *key, DBT *val) { + PMA pma = c->pma; + if (pma->pairs[c->position].key==0) return BRT_KEYEMPTY; + ybt_set_value(key, pma->pairs[c->position].key, pma->pairs[c->position].keylen, &c->skey); + ybt_set_value(val, pma->pairs[c->position].val, pma->pairs[c->position].vallen, &c->sval); + return 0; +} + + +#if 0 +int pma_cget_first (PMA_CURSOR c, YBT *key, YBT *val) { + PMA pma=c->pma; + c->position=0; + if (pma->n_pairs_present==0) return DB_NOTFOUND; + while (pma->pairs[c->position].key==0 && c->positionN) { + c->position++; + } + assert(c->positionN && pma->pairs[c->position].key!=0); + ybt_set_value(key, pma->pairs[c->position].key, pma->pairs[c->position].keylen, &c->skey); + ybt_set_value(val, pma->pairs[c->position].val, pma->pairs[c->position].vallen, &c->sval); + return 0; +} +#endif + +int pma_cursor_free (PMA_CURSOR *cursp) { + PMA_CURSOR curs=*cursp; + PMA pma = curs->pma; + if (curs->prev==0) { + assert(pma->cursors_head==curs); + pma->cursors_head = curs->next; + } else { + curs->prev->next = curs->next; + } + if (curs->next==0) { + assert(pma->cursors_tail==curs); + pma->cursors_tail = curs->prev; + } else { + curs->next->prev = curs->prev; + } + if (curs->skey) my_free(curs->skey); + if (curs->sval) my_free(curs->sval); + my_free(curs); + *cursp=0; + return 0; +} + +/* Make some space 
for a key to go at idx (the thing currently at idx should end up to the right.) */ +/* Return the new index. (Making space may involve moving things around, including the hole at index.) */ +int pmainternal_make_space_at (PMA pma, int idx) { + /* Within a range LO to HI we have a limit of how much packing we will tolerate. + * We allow the entire array to be 50% full. + * We allow a region of size lgN to be full. + * At sizes in between, we interpolate. + */ + int size=pma->uplgN; + int lo=idx; + int hi=idx; + double density=1.0; + while (1) { + /* set hi-lo equal to size, make sure it is a superset of (lo,hi). */ + lo=idx-size/2; + hi=idx+size/2; + //printf("lo=%d hi=%d\n", lo, hi); + if (lo<0) { hi-=lo; lo=0; } + else if (hi>pma_index_limit(pma)) { lo-=(hi-pma_index_limit(pma)); hi=pma_index_limit(pma); } + else { ; /* nothing */ } + + //printf("lo=%d hi=%d\n", lo, hi); + assert(0<=lo); assert(lo0.499); assert(density<=1); + if (density<0.5001) { assert(lo==0); assert(hi==pma_index_limit(pma)); } + { + int count = (1+ /* Don't forget space for the new guy. */ + pmainternal_count_region(pma->pairs, lo, hi)); + if (count/(double)(hi-lo) <= density) break; + if (lo==0 && hi==pma_index_limit(pma)) { + /* The array needs to be doubled in size. */ + int i; + assert(size==pma_index_limit(pma)); + size*=2; + //printf("realloc %p to %d\n", pma->pairs, size*sizeof(*pma->pairs)); + pma->pairs = my_realloc(pma->pairs, (1+size)*sizeof(*pma->pairs)); + for (i=hi; ipairs[i].key=0; + pma->pairs[size].key = (void*)0xdeadbeefL; + pma->N=size; + pmainternal_calculate_parameters(pma); + hi=size; + //printf("doubled N\n"); + break; + } + } + density-=pma->densitystep; + size*=2; + } + //printf("%s:%d Smoothing from %d to %d to density %f\n", __FILE__, __LINE__, lo, hi, density); + { + int new_index = pmainternal_smooth_region(pma->pairs+lo, hi-lo, idx-lo); + return new_index+lo; + } +} + + +/* Exposes internals of the PMA by returning a pointer to the guts. 
+ * Don't modify the returned data. Don't free it. */ +enum pma_errors pma_lookup (PMA pma, bytevec key, ITEMLEN keylen, bytevec*val, ITEMLEN *vallen) { + int l = pmainternal_find(pma, key, keylen); + assert(0<=l ); assert(l<=pma_index_limit(pma)); + if (l==pma_index_limit(pma)) return DB_NOTFOUND; + if (keycompare(key,keylen,pma->pairs[l].key,pma->pairs[l].keylen)==0) { + *val = pma->pairs[l].val; + *vallen = pma->pairs[l].vallen; + return BRT_OK; + } else { + return DB_NOTFOUND; + } +} + +void maybe_free (const void *p) { + if (p) my_free((void*)p); +} + +/* returns 0 if OK. + * You must have freed all the cursors, otherwise returns nonzero and does nothing. */ +int pma_free (PMA *pmap) { + int i; + PMA pma=*pmap; + if (pma->cursors_head) return -1; + for (i=0; ipairs[i].key) { + maybe_free(pma->pairs[i].key); + maybe_free(pma->pairs[i].val); + pma->pairs[i].key=0; + pma->pairs[i].val=0; + } + } + my_free(pma->pairs); + my_free(pma); + *pmap=0; + return 0; +} + +/* Copies keylen and datalen */ +int pma_insert (PMA pma, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen) { + int idx = pmainternal_find(pma, key, keylen); + if (idx < pma_index_limit(pma) && pma->pairs[idx].key) { + if (0==keycompare(key, keylen, pma->pairs[idx].key, pma->pairs[idx].keylen)) { + return BRT_ALREADY_THERE; /* It is already here. Return an error. */ + } + } + if (pma->pairs[idx].key) { + idx = pmainternal_make_space_at (pma, idx); /* returns the new idx. 
*/ + } + assert(!pma->pairs[idx].key); + pma->pairs[idx].key = memdup(key, keylen); + pma->pairs[idx].keylen = keylen; + pma->pairs[idx].val = memdup(data, datalen); + pma->pairs[idx].vallen = datalen; + pma->n_pairs_present++; + return BRT_OK; +} + +#if 0 +void smooth_after_delete (PMA pma, int idx) { + int size=pma->uplgN; + int lo=idx; + int hi=idx; + double density=0.1; + while (1) { + lo=idx-size/2; + hi=idx+size/2; + if (lo<0) { hi-=lo; lo=0; } + else if (hi>pma_index_limit(pma)) { lo-=(hi-pma_index_limit(pma)); hi=pma_index_limit(pma); } + else { ; /* nothing */ } + + assert(density<0.25); + { + int count=pmainternal_count_region(pma->pairs, lo, hi); + if (count/(double)(hi-lo) >= density) break; + if (lo==0 && hi==pma_index_limit(pma)) { + /* The array needs to be shrunk */ + +} +#endif + + +/* Remove KEY from the PMA and free the stored copies of its key and value. + * Returns BRT_OK if KEY was present and DB_NOTFOUND otherwise. */ +int pma_delete (PMA pma, bytevec key, ITEMLEN keylen) { + int l = pmainternal_find(pma, key, keylen); + /* pmainternal_find() returns where KEY would go, which is not necessarily a match: + * it can be pma_index_limit(pma) (the slot holding the 0xdeadbeef sentinel), an empty slot, + * or the slot of a larger key. Deleting in any of those cases would free the wrong pointers. */ + if (l==pma_index_limit(pma) || pma->pairs[l].key==0 || 0!=keycompare(key, keylen, pma->pairs[l].key, pma->pairs[l].keylen)) { + printf("%s:%d l=%d r=%d\n", __FILE__, __LINE__, l, DB_NOTFOUND); + return DB_NOTFOUND; + } + assert(pma->pairs[l].val!=0); + my_free((void*)pma->pairs[l].key); + my_free((void*)pma->pairs[l].val); + pma->pairs[l].key = 0; + pma->pairs[l].val = 0; + pma->pairs[l].keylen = 0; + pma->pairs[l].vallen = 0; + pma->n_pairs_present--; + // Need to rebalance +// smooth_after_delete(pma,l); + return BRT_OK; +} + +void pma_iterate (PMA pma, void(*f)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*v) { + int i; + for (i=0; ipairs[i].key) { + f(pma->pairs[i].key, pma->pairs[i].keylen, + pma->pairs[i].val, pma->pairs[i].vallen, + v); + } + } +} + + diff --git a/newbrt/pma.h b/newbrt/pma.h new file mode 100644 index 00000000000..76e3a1e540e --- /dev/null +++ b/newbrt/pma.h @@ -0,0 +1,80 @@ +#ifndef PMA_H +#define PMA_H + +#include "brttypes.h" +#include "ybt.h" +#include "yerror.h" + +/* An in-memory Packed Memory Array dictionary. */ +/* There is a built-in-cursor. 
*/ + +typedef struct pma *PMA; +typedef struct pma_cursor *PMA_CURSOR; + +/* All functions return 0 on success. */ +int pma_create (PMA *); + +/* returns 0 if OK. + * You must have freed all the cursors, otherwise returns nonzero and does nothing. */ +int pma_free (PMA *); + +int pma_n_entries (PMA); + +/* Returns an error if the key is already present. */ +/* The values returned should not be modified by the caller. */ +/* Any cursors should be updated. */ +/* The key bytes and the data bytes are copied (memdup), so the caller keeps ownership of its own buffers. */ +enum pma_errors pma_insert (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); +/* This returns an error if the key is NOT present. */ +int pma_replace (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); +/* This returns an error if the key is NOT present. */ +int pma_delete (PMA, bytevec key, ITEMLEN keylen); + +/* Exposes internals of the PMA by returning a pointer to the guts. + * Don't modify the returned data. Don't free it. */ +enum pma_errors pma_lookup (PMA, bytevec key, ITEMLEN keylen, bytevec*data, ITEMLEN *datalen); + +/* Create a cursor on a PMA (its position starts out undefined), and free a cursor. + * pma_free() refuses to free a PMA that still has cursors. */ +int pma_cursor (PMA, PMA_CURSOR *); +int pma_cursor_free (PMA_CURSOR*); + +/* Move the cursor to the last or the first non-empty slot. Return DB_NOTFOUND if every slot is empty. */ +int pma_cursor_set_position_last (PMA_CURSOR c); +int pma_cursor_set_position_first (PMA_CURSOR c); +int pma_cget_current (PMA_CURSOR c, DBT *key, DBT *val); + +/* Return PMA_NOTFOUND if the pma is empty. 
*/ +#if 0 +int pma_cget_first (PMA_CURSOR, YBT */*key*/, YBT */*val*/); +int pma_cursor_first (PMA); +int pma_cursor_last (PMA); +int pma_cursor_set (PMA, bytevec key, int keylen); +int pma_cursor_next (PMA); +int pma_cursor_prev (PMA); +int pma_cursor_get (PMA, bytevec *key, int *keylen, bytevec *data, int *datalen); +#endif + +int pma_random_pick(PMA, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen); + +int pma_index_limit(PMA); +int pmanode_valid(PMA,int); +bytevec pmanode_key(PMA,int); +ITEMLEN pmanode_keylen(PMA,int); +bytevec pmanode_val(PMA,int); +ITEMLEN pmanode_vallen(PMA,int); + +void pma_iterate (PMA, void(*)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*); + +#define PMA_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,body) ({ \ + int __i; \ + for (__i=0; __i +#include +#include +#include +#include +#include +#include + +enum { MAX_PATHNAME_LEN = 100 }; +const char dir[]="db4dir"; + +DB_ENV *env=0; +DB *db=0; + +#if DB_VERSION_MINOR == 0 +#define IF40(x,y) x +#else +#define IF40(x,y) y +#endif + +void create_directory (void) { + char command[MAX_PATHNAME_LEN]; + int r; + r=snprintf(command, MAX_PATHNAME_LEN, "rm -rf %s", dir); + assert(rset_cachesize(env, 0, 512*(1<<20), 0); + assert(r==0); + + IF40((void)0, + ({ + unsigned int gbytes,bytes; + int ncaches; + r=env->get_cachesize(env, &gbytes, &bytes, &ncaches); + assert(r==0); + printf("Using %.2fMiB Berkeley DB Cache Size\n", gbytes*1024 + ((double)bytes/(1<<20))); + })); + + + r= env->open(env, dir, DB_CREATE|DB_INIT_MPOOL,0777); // No logging. 
+ assert(r==0); + r=db_create(&db, env, 0); + assert(r==0); + IF40( + r=db->open(db, "files", 0, DB_BTREE, DB_CREATE, 0777), + r=db->open(db, 0, "files", 0, DB_BTREE, DB_CREATE, 0777)); + assert(r==0); + +} + +int write_one (long int n1, long int n2) { + char keystring[100],valstring[100]; + int keysize; + int datasize; + DB_TXN *txn=0; + DBT key,data; + int r; + keysize = snprintf(keystring, 100, "%08lx%08lx", n1, n2); + datasize = snprintf(valstring, 100, "%ld %ld %ld %ld %ld %ld", n1, n2, (long)(random()), (long)(random()), (long)(random()), (long)(random())); + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + key.data = keystring; + key.size = keysize; + data.data = valstring; + data.size = datasize; + r = db->put(db, txn, &key, &data, 0); + assert(r==0); + return keysize+datasize; +} + +/* Write a sequence evenly spaced. */ +long long write_sequence (int n_inserts) { + unsigned int step = UINT_MAX/n_inserts; + int i,j; + long long n_bytes=0; + printf("%d inserts, step %d\n", n_inserts, step); + for (i=0,j=0; itv_sec-t0->tv_sec)+1e-6*(t1->tv_usec-t0->tv_usec); +} + +int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { + int n_s_inserts=200000000; + int n_inserts=50000; + struct timeval t0,t1,t00; + long long n_bytes; + int r; + create_directory(); + gettimeofday(&t0, 0); + n_bytes=write_sequence(n_s_inserts); + gettimeofday(&t00, 0); + r=db->sync(db, 0); assert(r==0); + gettimeofday(&t1, 0); + { + double t = tdiff(&t1, &t0); + printf("%9d sequential inserts in %.3fs (%.3fs in sync), %.1f inserts/s. %lld bytes, %.1f bytes/s\n", n_s_inserts, t, tdiff(&t1,&t00), n_s_inserts/t, n_bytes, n_bytes/t); + } + + gettimeofday(&t0, 0); + n_bytes=write_random(n_inserts); + gettimeofday(&t00, 0); + r=db->sync(db, 0); assert(r==0); + gettimeofday(&t1, 0); + { + double t = tdiff(&t1, &t0); + printf("%9d random inserts in %.3fs (%.3fs in sync), %.1f inserts/s. 
%lld bytes, %.1f bytes/s\n", n_inserts, t, tdiff(&t1, &t00), n_inserts/t, n_bytes, n_bytes/t); + } + gettimeofday(&t0, 0); + r=db->close(db,0); assert(r==0); + r=env->close(env,0); assert(r==0); + gettimeofday(&t1, 0); + printf("Time to close %.3fs\n", tdiff(&t1,&t0)); + return 0; +} diff --git a/newbrt/ybt-test.c b/newbrt/ybt-test.c new file mode 100644 index 00000000000..ec3564cee61 --- /dev/null +++ b/newbrt/ybt-test.c @@ -0,0 +1,55 @@ +#define _FILE_OFFSET_BITS 64 + +#include "ybt.h" +#include "memory.h" +#include +#include + +void ybt_test0 (void) { + void *v0=0,*v1=0; + DBT t0,t1; + ybt_init(&t0); + ybt_init(&t1); + ybt_set_value(&t0, "hello", 6, &v0); + ybt_set_value(&t1, "foo", 4, &v1); + assert(t0.size==6); + assert(strcmp(t0.data, "hello")==0); + assert(t1.size==4); + assert(strcmp(t1.data, "foo")==0); + + ybt_set_value(&t1, "byebye", 7, &v0); /* Use v0, not v1 */ + assert(strcmp(t0.data, "byebye")==0); /* t0's data should be changed too, since it used v0 */ + assert(strcmp(t1.data, "byebye")==0); + + my_free(v0); my_free(v1); + memory_check_all_free(); + + /* See if we can probe to find out how big something is by setting ulen=0 with YBT_USERMEM */ + ybt_init(&t0); + t0.flags = DB_DBT_USERMEM; + t0.ulen = 0; + ybt_set_value(&t0, "hello", 6, 0); + assert(t0.data==0); + assert(t0.size==6); + + /* Check realloc. 
*/ + ybt_init(&t0); + t0.flags = DB_DBT_REALLOC; + v0 = 0; + ybt_set_value(&t0, "internationalization", 21, &v0); + assert(v0==0); /* Didn't change v0 */ + assert(t0.size==21); + assert(strcmp(t0.data, "internationalization")==0); + + ybt_set_value(&t0, "provincial", 11, &v0); + assert(t0.size==11); + assert(strcmp(t0.data, "provincial")==0); + + my_free(t0.data); + memory_check_all_free(); +} + +int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { + ybt_test0(); + return 0; +} diff --git a/newbrt/ybt.c b/newbrt/ybt.c new file mode 100644 index 00000000000..8a557289843 --- /dev/null +++ b/newbrt/ybt.c @@ -0,0 +1,44 @@ +#define _FILE_OFFSET_BITS 64 +#include "ybt.h" +#include "memory.h" +#include +#include + +int ybt_init (DBT *ybt) { + memset(ybt, 0, sizeof(*ybt)); + return 0; +} + +int ybt_set_value (DBT *ybt, bytevec val, ITEMLEN vallen, void **staticptrp) { + if (ybt->flags==DB_DBT_MALLOC) { + domalloc: + ybt->data = my_malloc(vallen); + if (errno!=0) return errno; + ybt->ulen = vallen; + } else if (ybt->flags==DB_DBT_REALLOC) { + if (ybt->data==0) goto domalloc; + ybt->data = my_realloc(ybt->data, vallen); + if (errno!=0) return errno; + ybt->ulen = vallen; + + } else if (ybt->flags==DB_DBT_USERMEM) { + /*nothing*/ + } else { + if (staticptrp==0) return -1; + void *staticptr=*staticptrp; + if (staticptr==0) + staticptr = my_malloc(vallen); + else + staticptr = my_realloc(staticptr, vallen); + if (errno!=0) return errno; + *staticptrp = staticptr; + ybt->data = staticptr; + ybt->ulen = vallen; + } + ybt->size = vallen; + if (ybt->ulen>0) { + if (ybt->ulenulen; + memcpy(ybt->data, val, vallen); + } + return 0; +} diff --git a/newbrt/ybt.h b/newbrt/ybt.h new file mode 100644 index 00000000000..f8a0480d053 --- /dev/null +++ b/newbrt/ybt.h @@ -0,0 +1,12 @@ +#ifndef YBT_H +#define YBT_H + +// brttypes.h must be first to make 64-bit file mode work right in linux. 
+#include "brttypes.h" +#include "../include/db.h" + + +int ybt_init (DBT *); +int ybt_set_value (DBT *, bytevec val, ITEMLEN vallen, void **staticptrp); + +#endif diff --git a/newbrt/yerror.h b/newbrt/yerror.h new file mode 100644 index 00000000000..b73b7a04ea4 --- /dev/null +++ b/newbrt/yerror.h @@ -0,0 +1,3 @@ +enum pma_errors { BRT_OK=0, BRT_ALREADY_THERE = -2, BRT_KEYEMPTY=-3 }; + +enum typ_tag { TYP_BRTNODE = 0xdead0001, TYP_CACHETABLE, TYP_PMA }; diff --git a/pma/Makefile b/pma/Makefile new file mode 100644 index 00000000000..5ce240cc3a6 --- /dev/null +++ b/pma/Makefile @@ -0,0 +1,5 @@ +CFLAGS = -Wall -W -Werror -g +pma: LDFLAGS=-lm +pma: +pma.o: + diff --git a/src-bdbwrap/Makefile b/src-bdbwrap/Makefile new file mode 100644 index 00000000000..9640f98c0bd --- /dev/null +++ b/src-bdbwrap/Makefile @@ -0,0 +1,46 @@ +CFLAGS = -g -W -Wall -Wno-unused +CPPFLAGS = -I../include + +C_OBJS= mut_pthread.lo \ + bt_compare.lo bt_conv.lo bt_curadj.lo bt_cursor.lo bt_delete.lo \ + bt_method.lo bt_open.lo bt_put.lo bt_rec.lo bt_reclaim.lo \ + bt_recno.lo bt_rsearch.lo bt_search.lo bt_split.lo bt_stat.lo \ + bt_upgrade.lo bt_verify.lo btree_auto.lo crdel_auto.lo \ + crdel_rec.lo db.lo db_am.lo db_auto.lo db_byteorder.lo db_cam.lo \ + db_conv.lo db_dispatch.lo db_dup.lo db_err.lo db_getlong.lo \ + db_idspace.lo db_iface.lo db_join.lo db_log2.lo db_meta.lo \ + db_method.lo db_open.lo db_overflow.lo db_pr.lo db_rec.lo \ + db_reclaim.lo db_rename.lo db_remove.lo db_ret.lo db_salloc.lo \ + db_shash.lo db_truncate.lo db_upg.lo db_upg_opd.lo db_vrfy.lo \ + db_vrfyutil.lo dbm.lo dbreg.lo dbreg_auto.lo dbreg_rec.lo \ + dbreg_util.lo env_file.lo env_method.lo env_open.lo env_recover.lo \ + env_region.lo fileops_auto.lo fop_basic.lo fop_rec.lo \ + fop_util.lo hash.lo hash_auto.lo hash_conv.lo hash_dup.lo \ + hash_func.lo hash_meta.lo hash_method.lo hash_open.lo \ + hash_page.lo hash_rec.lo hash_reclaim.lo hash_stat.lo \ + hash_upgrade.lo hash_verify.lo hmac.lo hsearch.lo lock.lo \ + 
lock_deadlock.lo lock_method.lo lock_region.lo lock_stat.lo \ + lock_util.lo log.lo log_archive.lo log_compare.lo log_get.lo \ + log_method.lo log_put.lo mp_alloc.lo mp_bh.lo mp_fget.lo \ + mp_fopen.lo mp_fput.lo mp_fset.lo mp_method.lo mp_region.lo \ + mp_register.lo mp_stat.lo mp_sync.lo mp_trickle.lo mutex.lo \ + os_abs.lo os_alloc.lo os_clock.lo os_config.lo os_dir.lo \ + os_errno.lo os_fid.lo os_fsync.lo os_handle.lo os_id.lo \ + os_map.lo os_method.lo os_oflags.lo os_open.lo os_region.lo \ + os_rename.lo os_root.lo os_rpath.lo os_rw.lo os_seek.lo \ + os_sleep.lo os_spin.lo os_stat.lo os_tmpdir.lo os_unlink.lo \ + qam.lo qam_auto.lo qam_conv.lo qam_files.lo qam_method.lo \ + qam_open.lo qam_rec.lo qam_stat.lo qam_upgrade.lo qam_verify.lo \ + rep_method.lo rep_record.lo rep_region.lo rep_util.lo sha1.lo \ + txn.lo txn_auto.lo txn_method.lo txn_rec.lo txn_recover.lo \ + txn_region.lo txn_stat.lo txn_util.lo xa.lo xa_db.lo xa_map.lo + +install: libdb.so + cp libdb.so ../src/ +libdb.so: ydb.lo bdbw.lo + echo cc ydb.lo bdbw.lo BDB-OBJS -shared -fPIC -o libdb.so $(CFLAGS) + @cc ydb.lo bdbw.lo $(patsubst %,/home/bradley/mysql/build-bdb-with-uniquename/bdb/build_unix/%,$(C_OBJS)) -shared -fPIC -o libdb.so $(CFLAGS) +ydb.lo: bdbw.h +bdbw.lo: CPPFLAGS=-I/home/bradley/mysql/build-bdb-with-uniquename/bdb/build_unix +%.lo: %.c + cc $(CPPFLAGS) $< -c -fPIC -o $@ $(CFLAGS) diff --git a/src-bdbwrap/README b/src-bdbwrap/README new file mode 100644 index 00000000000..eccf88609b4 --- /dev/null +++ b/src-bdbwrap/README @@ -0,0 +1,121 @@ +cd ~/yobiduck/ydb/src +make + + +cd ~/mysql/bdbi/mysql-5.0.27/ +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/ydb/src --prefix=/home/bradley/usr + +make + +make install + +#This one may not be needed +~/mysql/bdbi/usr/bin/mysql_install_db + +# +pushd /home/bradley/mysql/bdbi/usr/ ; 
/home/bradley/mysql/bdbi/usr//bin/mysqld_safe & +popd + +~/mysql/bdbi/usr/bin/mysql -u root +mysql> show databases; +mysql> create database yobitest; +mysql> use yobitest; +mysql> create table t1 (i int) engine=bdb; + +Look for the error in /home/bradley/mysql/bdbi/usr/var/yobert.err + + + +---- +This links right: + LD_LIBRARY_PATH=/home/bradley/mysql/bdbi/usr/lib/mysql/ ldd sql/mysqld +---- +This works, + LD_LIBRARY_PATH=/home/bradley/mysql/bdbi/usr/lib/mysql/ /home/bradley/mysql/bdbi/usr//bin/mysqld +producing the following in the log + +061208 16:11:35 InnoDB: Started; log sequence number 0 43655 +ydb.c:78 db_env_create flags=0 + +---- + the LD_RUN_PATH thing above works. + + +--- on laptop I did this instead: + +cd ~/mysql/mysql-5.0.27/ +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/src --prefix=/home/bradley/usr + +make +make install +/home/bradley/usr/bin/mysql_install_db +/home/bradley/usr/bin/mysqld_safe & +/home/bradley/usr/bin/mysql -u root +~/mysql/bdbi/usr/bin/mysql -u root +mysql> show databases; +mysql> create database yobitest; +mysql> use yobitest; +mysql> create table t1 (i int) engine=bdb; + +Look for the error in ~/usr/var/localhost.localdomain.err + +--- +To clean up after a total screwup: + +rm -rf ~/usr/var/ + +Didn't manage to clean it up very well. + +--- +Goal: compile mysql with debugging + +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure CFLAGS="-g -O2" --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/src --prefix=/home/bradley/usr + +If you want to debug, you might need to start mysqld without using mysqld_safe. 
+Here is one way to do it: + +gdb ~/usr/libexec/mysqld +(gdb) run --basedir=/home/bradley/usr --datadir=/home/bradley/usr/var --pid-file=/home/bradley/usr/var/localhost.localdomain.pid --skip-external-locking + + +That was screwed up (the configure args were wrong.) Try again: + +--- +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure CXXFLAGS="-g -O2" CFLAGS="-g -O2" --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/ydb/src --prefix=/home/bradley/usr + +gdb ~/usr/libexec/mysqld +(gdb) run --basedir=/home/bradley/usr --datadir=/home/bradley/usr/var --pid-file=/home/bradley/usr/var/localhost.localdomain.pid --skip-external-locking + +and + ~/usr/bin/mysql -u root + +Note: Had to change mysql to declare berkeley_cmp_hidden_key and berkeley_cmp_packed_key to be extern, not static. + +---- +To start the mysql clean: + rm -r /home/bradley/usr/var/ +/home/bradley/usr/bin/mysql_install_db +then run mysqld and mysql do +$ ~/usr/bin/mysql -u root +mysql> create database yobitest; +mysql> use yobitest; +mysql> create table t1 (i int) engine=bdb; +mysql> insert t1 values (3); +mysql> quit + +~/usr/bin/mysqladmin -u root shutdown + +Then create the trace1 from ydbtrace.c +(copy the output onto traces/trace1.c +cd traces +make + +rm ~/usr/var/log.0000000001 +rm ~/usr/var/yobitest/t1.db +./runtrace1 + + diff --git a/src-bdbwrap/bdbw.c b/src-bdbwrap/bdbw.c new file mode 100644 index 00000000000..44773ab5c8d --- /dev/null +++ b/src-bdbwrap/bdbw.c @@ -0,0 +1,750 @@ +/* Wrapper for bdb.c. */ + +#include +/* This includes the ydb db.h, but with unique names for everything. */ +#include "ydb-uniq.h" +/* This include is to the berkeley-db compiled with --with-uniquename */ +#include +/* This include is to the interface between ydb and bdb. 
*/ +#include "bdbw.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define barf() ({ fprintf(stderr, "YDB: BARF %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define barff(fmt,...) ({ fprintf(stderr, "YDB: BARF %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) +#define note() ({ fprintf(stderr, "YDB: Note %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define notef(fmt,...) ({ fprintf(stderr, "YDB: Note %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) + +static char *tracefname = "/home/bradley/ydbtrace.c"; +static FILE *traceout=0; +unsigned long long objnum=1; +void tracef (const char *fmt, ...) __attribute__((format (printf, 1, 2))); +void tracef (const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + if (traceout==0) { + struct timeval tv; + char *ctimes; + gettimeofday(&tv, 0); + ctimes = ctime(&tv.tv_sec); + ctimes[strlen(ctimes)-1]=0; + traceout=fopen(tracefname, "a"); + assert(traceout); + fprintf(stderr, "traceout created\n"); + fprintf(traceout, "/* bdbw trace captured %s (%ld.%06ld) */\n", + ctimes, tv.tv_sec, tv.tv_usec); + } + vfprintf(traceout, fmt, ap); + fflush(traceout); + va_end(ap); +} + +struct db_env_ydb_internal { + unsigned long long objnum; + DB_ENV *env; + void (*noticecall)(DB_ENV_ydb*, db_notices_ydb); +}; + +struct yobi_db_txn_internal { + long long objnum; + DB_TXN *txn; +}; + +static void ydb_env_err (const DB_ENV_ydb *env, int error, const char *fmt, ...) 
{ + va_list ap; + va_start(ap, fmt); + fprintf(stderr, "YDB Error %d:", error); + vfprintf(stderr, fmt, ap); + va_end(ap); +} + +#define doit(flag) ({ if (flag ## _ydb & flags) { gotit|=flag; flags&=~flag ## _ydb; } }) + +void doits_internal (u_int32_t flag_ydb, u_int32_t flag_bdb, char *flagname, u_int32_t *flags_ydb, u_int32_t *flags_bdb, char **flagstring, int *flagstringlen) { + if (flag_ydb & *flags_ydb) { + int len = strlen(flagname); + *flags_bdb |= flag_bdb; + *flags_ydb &= ~flag_ydb; + assert(len + 2 < *flagstringlen); + snprintf(*flagstring, *flagstringlen, "|%s", flagname); + *flagstring += len+1; + *flagstringlen -= len+1; + } +} + +#define doits(flag) doits_internal(flag ## _ydb, flag, #flag, &flags, &gotit, &flagstring, &flagstringlen) + +static u_int32_t convert_envopen_flags(u_int32_t flags, char *flagstring, int flagstringlen) { + u_int32_t gotit=0; + snprintf(flagstring, flagstringlen, "0"); flagstringlen--; flagstring++; + doits(DB_INIT_LOCK); + doits(DB_INIT_LOG); + doits(DB_INIT_MPOOL); + doits(DB_INIT_TXN); + doits(DB_CREATE); + doits(DB_THREAD); + doits(DB_RECOVER); + doits(DB_PRIVATE); + assert(flags==0); + return gotit; +} + +static u_int32_t open_flags_ydb_2_bdb (u_int32_t flags, char *flagstring, int flagstringlen) { + u_int32_t gotit=0; + snprintf(flagstring, flagstringlen, "0"); flagstringlen--; flagstring++; + doits(DB_CREATE); + doits(DB_RDONLY); + doits(DB_RECOVER); + doits(DB_THREAD); + assert(flags==0); + return gotit; +} + + +u_int32_t convert_db_create_flags(u_int32_t flags) { + if (flags==0) return 0; + abort(); +} + +#define retit(flag) ({ if (flag ## _ydb == flags) { strncpy(flagstring, #flag ,flagstringlen); return flag; } }) + +u_int32_t convert_c_get_flags(u_int32_t flags, char *flagstring, int flagstringlen) { + retit(DB_FIRST); + retit(DB_LAST); + retit(DB_NEXT); + abort(); +} + +int ydb_env_open (DB_ENV_ydb *env, const char *home, u_int32_t flags, int mode) { + int r; + char flagstring[1000]; + u_int32_t bdb_flags = 
convert_envopen_flags(flags, flagstring, sizeof(flagstring)); + //note(); + r = env->i->env->open(env->i->env, home, bdb_flags, mode); + tracef("r = envobj(%lld)->open(envobj(%lld), \"%s\", %s, 0%o); assert(r==%d);\n", + env->i->objnum, env->i->objnum, home, flagstring, mode, r); + return r; +} + +int bdbw_env_close (DB_ENV_ydb * env, u_int32_t flags) { + int r; + notef("flags=%d\n", flags); + assert(flags==0); + r = env->i->env->close(env->i->env, 0); + env->i->env=0; + // free(env); + return r; +} + +u_int32_t convert_log_archive_flags (u_int32_t flags, char *flagstring, int flagstringlen) { + retit(DB_ARCH_ABS); + retit(DB_ARCH_LOG); + abort(); +} + +int ydb_env_log_archive (DB_ENV_ydb *env, char **list[], u_int32_t flags) { + int r; + char flagstring[1000]; + int bdbflags = convert_log_archive_flags(flags, flagstring, sizeof(flagstring)); + r = env->i->env->log_archive(env->i->env, list, bdbflags); + assert(r==0); + tracef("{ char **list; r = envobj(%lld)->log_archive(envobj(%lld), &list, %s); assert(r==%d); }\n", + env->i->objnum, env->i->objnum, flagstring, r); + return r; +} +int ydb_env_log_flush (DB_ENV_ydb * env, const DB_LSN_ydb * lsn) { + barf(); + return 1; +} +int ydb_env_set_cachesize (DB_ENV_ydb * env, u_int32_t gbytes, u_int32_t bytes, int ncache) { + return env->i->env->set_cachesize(env->i->env, gbytes, bytes, ncache); +} +int ydb_env_set_data_dir (DB_ENV_ydb * env, const char *dir) { + return env->i->env->set_data_dir(env->i->env, dir); +} +void ydb_env_set_errcall (DB_ENV_ydb *env, void (*errcall)(const char *, char *)) { + env->i->env->set_errcall(env->i->env, errcall); +} +void ydb_env_set_errpfx (DB_ENV_ydb * env, const char *errpfx) { + env->i->env->set_errpfx(env->i->env, errpfx); +} +int ydb_env_set_flags (DB_ENV_ydb *env, u_int32_t flags, int onoff) { + assert(flags==0); + return env->i->env->set_flags(env->i->env, flags, onoff); +} +int ydb_env_set_lg_bsize (DB_ENV_ydb * env, u_int32_t bsize) { + return 
env->i->env->set_lg_bsize(env->i->env, bsize); +} +int ydb_env_set_lg_dir (DB_ENV_ydb *env, const char * dir) { + barf(); + return 1; +} +int ydb_env_set_lg_max (DB_ENV_ydb *env, u_int32_t lg_max) { + return env->i->env->set_lg_max(env->i->env, lg_max); +} +int ydb_env_set_lk_detect (DB_ENV_ydb *env, u_int32_t detect) { + return env->i->env->set_lk_detect(env->i->env, detect); +} +int ydb_env_set_lk_max (DB_ENV_ydb *env, u_int32_t lk_max) { + return env->i->env->set_lk_max(env->i->env, lk_max); +} +void ydbenv_bdb_noticecall (DB_ENV *bdb_env, db_notices notices) { + DB_ENV_ydb *ydb_env = bdb_env->app_private; + tracef("/* Doing noticecall */\n"); + assert(notices==0 || notices==DB_NOTICE_LOGFILE_CHANGED); + ydb_env->i->noticecall(ydb_env, notices==0 ? 0 : DB_NOTICE_LOGFILE_CHANGED_ydb); +} +extern void berkeley_noticecall (DB_ENV_ydb *, db_notices_ydb); +void ydb_env_set_noticecall (DB_ENV_ydb *env, void (*noticecall)(DB_ENV_ydb *, db_notices_ydb)) { + env->i->env->set_noticecall(env->i->env, ydbenv_bdb_noticecall); + env->i->noticecall = noticecall; + { + const char *fun_name; + if (noticecall==berkeley_noticecall) { + fun_name = "berkeley_noticecall"; + } else { + fun_name = "Unknown_function"; + } + tracef("envobj(%lld)->set_noticecall(envobj(%lld), %s);\n", + env->i->objnum, env->i->objnum, fun_name); + } +} +int ydb_env_set_tmp_dir (DB_ENV_ydb * env, const char *tmp_dir) { + int r = env->i->env->set_tmp_dir(env->i->env, tmp_dir); + tracef("r = envobj(%lld)->set_tmp_dir(envobj(%lld), \"%s\"); assert(r==%d);\n", + env->i->objnum, env->i->objnum, tmp_dir, r); + return r; +} +int ydb_env_set_verbose (DB_ENV_ydb *env, u_int32_t which, int onoff) { + barf(); + return 1; +} +int ydb_env_txn_checkpoint (DB_ENV_ydb *env, u_int32_t kbyte, u_int32_t min, u_int32_t flags) { + int r; + assert(flags==0); + r=env->i->env->txn_checkpoint(env->i->env, kbyte, min, 0); + assert(r==0); + tracef("r=envobj(%lld)->txn_checkpoint(envobj(%lld), %u, %u, %u); assert(r==0);\n", + 
env->i->objnum, env->i->objnum, kbyte, min, flags); + return r; +} + +int ydb_env_txn_stat (DB_ENV_ydb *env, DB_TXN_STAT_ydb **statp, u_int32_t flags) { + barf(); + return 1; +} + +int db_env_create_bdbw (struct yobi_db_env **envp, u_int32_t flags) { + struct yobi_db_env *result = malloc(sizeof(*result)); + int r; + //note(); + result->i = malloc(sizeof(*result->i)); + result->i->objnum = objnum++; + + result->err = ydb_env_err; + result->open = ydb_env_open; + result->close = bdbw_env_close; + result->txn_checkpoint = ydb_env_txn_checkpoint; + result->log_flush = ydb_env_log_flush; + result->set_errcall = ydb_env_set_errcall; + result->set_errpfx = ydb_env_set_errpfx; + result->set_noticecall = ydb_env_set_noticecall; + result->set_flags = ydb_env_set_flags; + result->set_data_dir = ydb_env_set_data_dir; + result->set_tmp_dir = ydb_env_set_tmp_dir; + result->set_verbose = ydb_env_set_verbose; + result->set_lg_bsize = ydb_env_set_lg_bsize; + result->set_lg_dir = ydb_env_set_lg_dir; + result->set_lg_max = ydb_env_set_lg_max; + result->set_cachesize = ydb_env_set_cachesize; + result->set_lk_detect = ydb_env_set_lk_detect; + result->set_lk_max = ydb_env_set_lk_max; + result->log_archive = ydb_env_log_archive; + result->txn_stat = ydb_env_txn_stat; + result->txn_begin = txn_begin_bdbw; + + r = db_env_create_4001(&result->i->env, flags); + result->i->env->app_private = result; + *envp = result; + + tracef("r=db_env_create(new_envobj(%lld), %u); assert(r==%d);\n", + result->i->objnum, flags, r); + + return r; +} + +int yobi_db_txn_commit (DB_TXN_ydb *txn, u_int32_t flags) { + int r; + //notef("flags=%d\n", flags); + assert(flags==0); + r = txn->i->txn->commit(txn->i->txn, 0); + txn->i->txn = 0; + assert(flags==0); // need to convert otherwise. 
+ tracef("r=txnobj(%lld)->commit(txnobj(%lld), %d); assert(r==%d);\n", + txn->i->objnum, txn->i->objnum, flags, r); + // free(txn); + return r; +} + +u_int32_t yobi_db_txn_id (DB_TXN_ydb *txn) { + barf(); + abort(); +} + +// There is no txn_begin when generated with --with-uniquename. +int txn_begin_bdbw (struct yobi_db_env *env, struct yobi_db_txn *stxn, struct yobi_db_txn **txn, u_int32_t flags) { + int r; + struct yobi_db_txn *result = malloc(sizeof(*result)); + result->commit = yobi_db_txn_commit; + result->id = yobi_db_txn_id; + result->i = malloc(sizeof(*result->i)); + result->i->objnum = objnum++; + //note(); + r = env->i->env->txn_begin(env->i->env, + stxn ? stxn->i->txn : 0, + &result->i->txn, flags); + *txn = result; + tracef("r = envobj(%lld)->txn_begin(envobj(%lld), ", env->i->objnum , env->i->objnum); + if (!stxn) tracef("0, "); else tracef(" txnobj(%lld), ", stxn->i->objnum); + tracef("new_txnobj(%lld), 0x%x); ", result->i->objnum, flags); + tracef(" assert(r==%d);\n", r); + return r; +} + +int txn_abort_bdbw (DB_TXN_ydb *txn) { + barf(); + abort(); +} + +int txn_commit_bdbw (DB_TXN_ydb *txn, u_int32_t flags) { + int r; + u_int32_t bdbflags = 0; + char *bdbflagsstring = "0"; + assert(flags==0); + r = txn->i->txn->commit(txn->i->txn, bdbflags); + assert(r==0); + tracef("r=txnobj(%lld)->commit(txnobj(%lld), %s); assert(r==%d);\n", + txn->i->objnum, txn->i->objnum, bdbflagsstring, r); + return r; +} + +struct ydb_db_internal { + long long objnum; + DB *db; + int (*bt_compare)(DB_ydb *, const DBT_ydb *, const DBT_ydb *); +}; + +static int bdbw_db_close (DB_ydb *db, u_int32_t flags) { + int r; + //notef("flags=%d\n", flags); + assert(flags==0); + r = db->i->db->close(db->i->db, 0); + tracef("r=dbobj(%lld)->close(dbobj(%lld), 0); assert(r==0);\n", + db->i->objnum, db->i->objnum); + db->i->db = 0; + // free(db); + return r; +} + +struct yobi_dbc_internal { + DBC *dbc; + long long objnum; +}; + +void dbt_bdb2ydb (DBT *da, DBT_ydb *a, const char *varname) { + 
u_int32_t aflags = a->flags; + memset(da, 0, sizeof(*da)); + tracef(" memset(&%s,0,sizeof(a));\n", varname); + da->data = a->data; + if (aflags==DB_DBT_USERMEM_ydb) { + aflags &= ~DB_DBT_USERMEM_ydb; + da->flags |= DB_DBT_USERMEM; + tracef(" %s.flags |= DB_DBT_USERMEM;\n", varname); + if (a->ulen>0) { + tracef(" %s.data = malloc(%d);\n", varname, a->ulen); + } else { + tracef(" %s.data = 0;\n", varname); + } + da->ulen = a->ulen; + tracef(" %s.ulen = %d;\n", varname, a->ulen); + } + assert(aflags==0); +} + +int yobi_dbc_c_get (DBC_ydb *dbc, DBT_ydb *a, DBT_ydb *b, u_int32_t flags) { + int r; + DBT da; + DBT db; + const int flagstringlen=100; + char flagstring[flagstringlen]; + int bdb_flags = convert_c_get_flags(flags, flagstring, flagstringlen); + tracef("{ DBT a,b; \n"); + dbt_bdb2ydb(&da, a, "a"); + dbt_bdb2ydb(&db, b, "b"); + assert(flags==DB_LAST_ydb || flags==DB_FIRST_ydb || flags==DB_NEXT_ydb); + r = dbc->i->dbc->c_get(dbc->i->dbc, &da, &db, bdb_flags); + tracef(" r = dbcobj(%lld)->c_get(dbcobj(%lld), ", + dbc->i->objnum, dbc->i->objnum); + tracef(" &a, &b, "); + tracef(" %s);\n", flagstring); + if (r==0) { + tracef(" assert(r==%d);\n", r); + tracef(" assert(a.size==%d);\n", da.size); + //tracef(" assert(memcmp(a.address, "); + tracef(" assert(b.size==%d);\n", db.size); + a->size = da.size; + a->data = da.data; + b->size = db.size; + b->data = db.data; + assert(r==0); + } else if (r==DB_PAGE_NOTFOUND) { + tracef(" assert(r==DB_PAGE_NOTFOUND);\n"); + } else if (r==DB_NOTFOUND) { + tracef(" assert(r==DB_NOTFOUND);\n"); + } else { + printf("DB Error r=%d: %s\n", r, db_strerror(r)); + abort(); + } + tracef("}\n"); + return r; +} + +int yobi_dbc_c_close (DBC_ydb *dbc) { + int r; + r = dbc->i->dbc->c_close(dbc->i->dbc); + assert(r==0); + tracef("r=dbcobj(%lld)->c_close(dbcobj(%lld)); assert(r==%d);\n", + dbc->i->objnum, dbc->i->objnum, r); + dbc->i->dbc = 0; + // free(dbc->i); free(dbc); + return r; +} + +int yobi_dbc_c_del (DBC_ydb *dbc, u_int32_t flags) { + 
barf(); + abort(); +} + +static int bdbw_db_cursor (DB_ydb *db, DB_TXN_ydb *txn, DBC_ydb **c, u_int32_t flags) { + struct yobi_dbc *dbc = malloc(sizeof(*dbc)); + int r; + dbc->c_get = yobi_dbc_c_get; + dbc->c_close = yobi_dbc_c_close; + dbc->c_del = yobi_dbc_c_del; + dbc->i = malloc(sizeof(*dbc->i)); + assert(dbc->i); + assert(flags==0); + dbc->i->objnum = objnum++; + r=db->i->db->cursor(db->i->db, txn ? txn->i->txn : 0, &dbc->i->dbc, flags); + assert(r==0); + //note(); + *c = dbc; + tracef("r=dbobj(%lld)->cursor(dbobj(%lld), txnobj(%lld), new_dbcobj(%lld), %d); assert(r==%d);\n", + db->i->objnum, db->i->objnum, txn ? txn->i->objnum : -1, dbc->i->objnum, flags, r); + return r; +} + +static int bdbw_db_del (DB_ydb *db, DB_TXN_ydb *txn, DBT_ydb *dbt, u_int32_t flags) { + barf(); + abort(); +} + +static int bdbw_db_get (DB_ydb *db, DB_TXN_ydb *txn, DBT_ydb *dbta, DBT_ydb *dbtb, u_int32_t flags) { + barf(); + abort(); +} + +static int bdbw_db_key_range (DB_ydb *db, DB_TXN_ydb *txn, DBT_ydb *dbt, DB_KEY_RANGE_ydb *kr, u_int32_t flags) { + barf(); + abort(); +} + +static int bdbw_db_open (DB_ydb *db, DB_TXN_ydb *txn, const char *fname, const char *dbname, DBTYPE_ydb dbtype, u_int32_t flags, int mode) { + int r; + char flagstring[1000]; + u_int32_t bdb_flags = open_flags_ydb_2_bdb(flags, flagstring, sizeof(flagstring)); + //notef("txn=%p fname=%s dbname=%s dbtype=%d flags=0x%x (bdb=0x%x) %mode=0%o\n", txn, fname, dbname, dbtype, flags, bdb_flags, mode); + assert(dbtype == DB_BTREE_ydb); + r = db->i->db->open(db->i->db, + txn ? txn->i->txn : 0, + fname, dbname, DB_BTREE, bdb_flags, mode); + assert(db->i->db->app_private == db); + tracef("r=dbobj(%lld)->open(dbobj(%lld), txnobj(%lld), \"%s\", \"%s\",", + db->i->objnum, db->i->objnum, txn ? 
txn->i->objnum : -1, fname, dbname); + if (dbtype==DB_BTREE_ydb) tracef(" DB_BTREE,"); + else abort(); + tracef(" %s, 0%o);", flagstring, mode); + assert(r==0); + tracef(" assert(r==%d);\n", r); + return r; +} + +static int bdbw_bt_compare (DB *db, const DBT *a, const DBT *b) { + DB_ydb *ydb = db->app_private; + DBT_ydb a_y, b_y; + note(); + assert(ydb); + a_y.data = a->data; + a_y.size = a->size; + b_y.data = b->data; + b_y.size = b->size; + return ydb->i->bt_compare(ydb, &a_y, &b_y); +} + +u_int32_t convert_put_flags(u_int32_t flags, char *flagstring, int flagstringlen) { + if (flags==0) { + snprintf(flagstring, flagstringlen, "0"); + return 0; + } + retit(DB_NOOVERWRITE); + abort(); +} + +int bdbw_db_put (DB_ydb *db, DB_TXN_ydb *txn, DBT_ydb *dbta, DBT_ydb *dbtb, u_int32_t flags) { + int r; + unsigned int i; + DBT a,b; + char flagstring[1000]; + u_int32_t bdbflags = convert_put_flags(flags, flagstring, sizeof(flagstring)); + assert(dbta->flags==0); assert(dbtb->flags==0); + assert(dbta->ulen==0); assert(dbtb->ulen==0); + tracef("{ DBT a,b;\n"); + tracef(" unsigned char adata[%d] = {", dbta->size); + for (i=0; isize; i++) { + if (i>0) tracef(", "); + tracef("%u", ((unsigned char*)(dbta->data))[i]); + } + tracef("};\n unsigned char bdata[%d] = {", dbtb->size); + for (i=0; isize; i++) { + if (i>0) tracef(", "); + tracef("%u", ((unsigned char*)(dbtb->data))[i]); + } + tracef("};\n memset(&a,0,sizeof(a)); memset(&b,0,sizeof(b));\n"); + tracef(" a.data = adata; b.data=bdata;\n"); + tracef(" a.flags= 0; b.flags=0;\n"); + tracef(" a.ulen=0; b.ulen=0;\n"); + tracef(" a.size=%d; b.size=%d;\n", dbta->size, dbtb->size); + memset(&a, 0, sizeof(a)); + memset(&b, 0, sizeof(b)); + a.data = dbta->data; b.data = dbtb->data; + a.flags = 0; b.flags = 0; + a.ulen = 0; b.ulen = 0; + a.size = dbta->size; b.size = dbtb->size; + r=db->i->db->put(db->i->db, txn ? 
txn->i->txn : 0, &a, &b, flags); + assert(r==0); + tracef(" r=dbobj(%lld)->put(dbobj(%lld), txnobj(%lld), &a, &b, %s); assert(r==%d);\n}\n", + db->i->objnum, db->i->objnum, txn ? txn->i->objnum : -1, flagstring, r); + return r; +} +int bdbw_db_remove (DB_ydb *db, const char *fname, const char *dbname, u_int32_t flags) { + barf(); + abort(); +} +int bdbw_db_rename (DB_ydb *db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) { + barf(); + abort(); +} + +extern int berkeley_cmp_hidden_key(DB_ydb *, const DBT_ydb *, const DBT_ydb *); + +static int bdbw_db_set_bt_compare (DB_ydb *db, int (*bt_compare)(DB_ydb *, const DBT_ydb *, const DBT_ydb *)) { + int r; + r = db->i->db->set_bt_compare(db->i->db, bdbw_bt_compare); + db->i->bt_compare = bt_compare; + { + const char *fun_name; + if (bt_compare==berkeley_cmp_hidden_key) { + fun_name = "berkeley_cmp_hidden_key"; + } else { + fun_name = "Unknown_function"; + } + tracef("r = dbobj(%lld)->set_bt_compare(dbobj(%lld), %s); assert(r==%d);\n", + db->i->objnum, db->i->objnum, fun_name, r); + } + return r; +} + +int bdbw_db_set_flags (DB_ydb *db, u_int32_t flags) { + int r; + assert(flags==0); + r = db->i->db->set_flags(db->i->db, 0); + assert(r==0); + tracef("r=dbobj(%lld)->set_flags(dbobj(%lld), 0); assert(r==0);\n", + db->i->objnum, db->i->objnum); + return r; +} +int bdbw_db_stat (DB_ydb *db, void *v, u_int32_t flags) { + barf(); + abort(); +} + +int db_create_bdbw (DB_ydb **db, DB_ENV_ydb *env, u_int32_t flags) { + DB_ydb *result=malloc(sizeof(*result)); + int r; + result->app_private = 0; + result->close = bdbw_db_close; + result->cursor = bdbw_db_cursor; + result->del = bdbw_db_del; + result->get = bdbw_db_get; + result->key_range = bdbw_db_key_range; + result->open = bdbw_db_open; + result->put = bdbw_db_put; + result->remove = bdbw_db_remove; + result->rename = bdbw_db_rename; + result->set_bt_compare = bdbw_db_set_bt_compare; + result->set_flags = bdbw_db_set_flags; + result->stat = 
bdbw_db_stat; + result->i = malloc(sizeof(*result->i)); + r=db_create(&result->i->db, env->i->env, convert_db_create_flags(flags)); + result->i->objnum = objnum++; + result->i->db->app_private = result; + result->i->bt_compare = 0; + *db = result; + tracef("r=db_create(new_dbobj(%lld), envobj(%lld), %d); assert(r==%d);\n", + result->i->objnum, env->i->objnum, flags, r); + return r; +} + +#if 0 + +void bdbw_db_env_err (const DB_ENV_ydb *env, int error, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + fprintf(stderr, "YDB Error %d:", error); + vfprintf(stderr, fmt, ap); + va_end(ap); +} + +#define barf() ({ fprintf(stderr, "YDB: BARF %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define barff(fmt,...) ({ fprintf(stderr, "YDB: BARF %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) +#define note() ({ fprintf(stderr, "YDB: Note %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define notef(fmt,...) ({ fprintf(stderr, "YDB: Note %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) + +void print_flags (u_int32_t flags) { + u_int32_t gotit=0; + int doneone=0; +#define doit(flag) if (flag & flags) { if (doneone) printf(" | "); printf("%s", #flag); doneone=1; gotit|=flag; } + printf(" flags="); + doit(DB_INIT_LOCK_ydb); + doit(DB_INIT_LOG_ydb); + doit(DB_INIT_MPOOL_ydb); + doit(DB_INIT_TXN_ydb); + doit(DB_CREATE_ydb); + doit(DB_THREAD_ydb); + doit(DB_RECOVER_ydb); + doit(DB_PRIVATE_ydb); + if (gotit!=flags) printf(" flags 0x%x not accounted for", flags&~gotit); + printf("\n"); +} + +int yobi_db_env_open (DB_ENV_ydb *env, const char *home, u_int32_t flags, int mode) { + notef("(%p, \"%s\", 0x%x, 0%o)\n", env, home, flags, mode); + env->dir = strdup(home); + env->open_flags = flags; + env->open_mode = mode; + print_flags(flags); + assert(DB_PRIVATE & flags); // This means that we don't have to do anything with shared memory. And that's good enough for mysql. 
+ return 0; +} +int yobi_db_env_close (DB_ENV_ydb * env, u_int32_t flags) { + barf(); + return 1; +} +int yobi_db_env_log_archive (DB_ENV_ydb *env, char **list[], u_int32_t flags) { + barf(); + return 1; +} +int yobi_db_env_log_flush (DB_ENV_ydb * env, const DB_LSN_ydb * lsn) { + barf(); + return 1; +} +int yobi_db_env_set_cachesize (DB_ENV_ydb * env, u_int32_t gbytes, u_int32_t bytes, int ncache) { + barf(); + return 1; +} +int yobi_db_env_set_data_dir (DB_ENV_ydb * env, const char *dir) { + barf(); + return 1; +} +void yobi_db_env_set_errcall (DB_ENV_ydb *env, void (*errcall)(const char *, char *)) { + note(); + env->errcall=errcall; +} +void yobi_db_env_set_errpfx (DB_ENV_ydb * env, const char *errpfx) { + notef("(%p, %s)\n", env, errpfx); + env->errpfx = errpfx; +} +int yobi_db_env_set_flags (DB_ENV_ydb *env, u_int32_t flags, int onoff) { + barf(); + return 1; +} +int yobi_db_env_set_lg_bsize (DB_ENV_ydb * env, u_int32_t bsize) { + barf(); + return 1; +} +int yobi_db_env_set_lg_dir (DB_ENV_ydb * env, const char * dir) { + barf(); + return 1; +} +int yobi_db_env_set_lg_max (DB_ENV_ydb *env, u_int32_t lg_max) { + barf(); + return 1; +} +int yobi_db_env_set_lk_detect (DB_ENV_ydb *env, u_int32_t detect) { + barf(); + return 1; +} +int yobi_db_env_set_lk_max (DB_ENV_ydb *env, u_int32_t lk_max) { + barf(); + return 1; +} +void yobi_db_env_set_noticecall (DB_ENV_ydb *env, void (*noticeall)(DB_ENV_ydb *, db_notices_ydb)) { + barf(); +} +int yobi_db_env_set_tmp_dir (DB_ENV_ydb * env, const char *tmp_dir) { + barf(); + return 1; +} +int yobi_db_env_set_verbose (DB_ENV_ydb *env, u_int32_t which, int onoff) { + barf(); + return 1; +} +int yobi_db_env_txn_checkpoint (DB_ENV_ydb *env, u_int32_t kbyte, u_int32_t min, u_int32_t flags) { + barf(); + return 1; +} + +int yobi_db_env_txn_stat (DB_ENV_ydb *env, DB_TXN_STAT_ydb **statp, u_int32_t flags) { + barf(); + return 1; +} + +void yobi_default_errcall(const char *errpfx, char *msg) { + fprintf(stderr, "YDB: %s: %s", errpfx, 
msg); +} + + + + +int yobi_db_txn_commit (DB_TXN_ydb *txn, u_int32_t flags) { + notef("flags=%d\n", flags); + return 0; +} + +u_int32_t yobi_db_txn_id (DB_TXN_ydb *txn) { + barf(); + abort(); +} + +int log_compare_ydb (const DB_LSN_ydb *a, const DB_LSN_ydb *b) { + fprintf(stderr, "%s:%d log_compare(%p,%p)\n", __FILE__, __LINE__, a, b); + abort(); +} + +#endif diff --git a/src-bdbwrap/bdbw.h b/src-bdbwrap/bdbw.h new file mode 100644 index 00000000000..1c216cbd060 --- /dev/null +++ b/src-bdbwrap/bdbw.h @@ -0,0 +1,213 @@ +#ifndef _BDBW_H +#define _BDBW_H + +#if defined(__cplusplus) +extern "C" { +#if 0 +} +#endif +#endif + +int db_env_create_bdbw (struct yobi_db_env **, u_int32_t); +int txn_abort_bdbw (struct yobi_db_txn *); +int txn_begin_bdbw (struct yobi_db_env *env, struct yobi_db_txn *stxn, struct yobi_db_txn **txn, u_int32_t flags); +int txn_commit_bdbw (struct yobi_db_txn *, u_int32_t); +int db_create_bdbw (struct yobi_db **, struct yobi_db_env *, u_int32_t); + + +#if 0 +typedef enum { + DB_BTREE=1, + // DB_HASH=2, + // DB_RECNO=3, + // DB_QUEUE=4, + // DB_UNKNOWN=5 /* Figure it out on open. 
*/ +} DBTYPE; + +typedef enum { + DB_NOTICE_LOGFILE_CHANGED +} db_notices; + +enum { + DB_VERB_CHKPOINT = 0x0001, + DB_VERB_DEADLOCK = 0x0002, + DB_VERB_RECOVERY = 0x0004 + +}; + +typedef struct yobi_db DB; +typedef struct yobi_db_btree_stat DB_BTREE_STAT; +typedef struct yobi_db_env DB_ENV; +typedef struct yobi_db_key_range DB_KEY_RANGE; +typedef struct yobi_db_lsn DB_LSN; +typedef struct yobi_db_txn DB_TXN; +typedef struct yobi_db_txn_active DB_TXN_ACTIVE; +typedef struct yobi_db_txn_stat DB_TXN_STAT; +typedef struct yobi_dbc DBC; +typedef struct yobi_dbt DBT; + +struct yobi_db { + void *app_private; + int (*close) (DB *, u_int32_t); + int (*cursor) (DB *, DB_TXN *, DBC **, u_int32_t); + int (*del) (DB *, DB_TXN *, DBT *, u_int32_t); + int (*get) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); + int (*key_range) (DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t); + int (*open) (DB *, DB_TXN *, + const char *, const char *, DBTYPE, u_int32_t, int); + int (*put) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); + int (*remove) (DB *, const char *, const char *, u_int32_t); + int (*rename) (DB *, const char *, const char *, const char *, u_int32_t); + int (*set_bt_compare) (DB *, + int (*)(DB *, const DBT *, const DBT *)); + int (*set_flags) (DB *, u_int32_t); + int (*stat) (DB *, void *, u_int32_t); + + struct ydb_db_internal *i; +}; +enum { + DB_DBT_MALLOC = 0x002, + DB_DBT_REALLOC = 0x010, + DB_DBT_USERMEM = 0x020, + DB_DBT_DUPOK = 0x040 +}; +struct yobi_dbt { + void *app_private; + void *data; + u_int32_t flags; + u_int32_t size; + u_int32_t ulen; +}; +struct yobi_db_txn { + int (*commit) (DB_TXN*, u_int32_t); + u_int32_t (*id) (DB_TXN *); +}; +struct yobi_dbc { + int (*c_get) (DBC *, DBT *, DBT *, u_int32_t); + int (*c_close) (DBC *); + int (*c_del) (DBC *, u_int32_t); +}; +struct yobi_db_env { + // Methods used by MYSQL + void (*err) (const DB_ENV *, int, const char *, ...); + int (*open) (DB_ENV *, const char *, u_int32_t, int); + int (*close) (DB_ENV *, u_int32_t); + 
int (*txn_checkpoint) (DB_ENV *, u_int32_t, u_int32_t, u_int32_t); + int (*log_flush) (DB_ENV *, const DB_LSN *); + void (*set_errcall) (DB_ENV *, void (*)(const char *, char *)); + void (*set_errpfx) (DB_ENV *, const char *); + void (*set_noticecall) (DB_ENV *, void (*)(DB_ENV *, db_notices)); + int (*set_flags) (DB_ENV *, u_int32_t, int); + int (*set_data_dir) (DB_ENV *, const char *); + int (*set_tmp_dir) (DB_ENV *, const char *); + int (*set_verbose) (DB_ENV *, u_int32_t, int); + int (*set_lg_bsize) (DB_ENV *, u_int32_t); + int (*set_lg_dir) (DB_ENV *, const char *); + int (*set_lg_max) (DB_ENV *, u_int32_t); + int (*set_cachesize) (DB_ENV *, u_int32_t, u_int32_t, int); + int (*set_lk_detect) (DB_ENV *, u_int32_t); + int (*set_lk_max) (DB_ENV *, u_int32_t); + int (*log_archive) (DB_ENV *, char **[], u_int32_t); + int (*txn_stat) (DB_ENV *, DB_TXN_STAT **, u_int32_t); + // Internal state + void (*errcall)(const char *, char *); + const char *errpfx; + char *dir; /* A malloc'd copy of the directory. 
*/ + u_int32_t open_flags; + int open_mode; +}; +struct yobi_db_key_range { + double less,equal,grater; +}; +struct yobi_db_btree_stat { + u_int32_t bt_ndata; + u_int32_t bt_nkeys; +}; +struct yobi_db_txn_stat { + u_int32_t st_nactive; + DB_TXN_ACTIVE *st_txnarray; +}; +struct yobi_db_lsn { + int hello; +}; +struct yobi_db_txn_active { + DB_LSN lsn; + u_int32_t txnid; +}; + +#ifndef _YDB_WRAP_H +#define DB_VERSION_STRING "Yobiduck: Fractal DB (November 19, 2006)" +#else +#define DB_VERSION_STRING_ydb "Yobiduck: Fractal DB (November 19, 2006) (wrapped bdb)" +#endif + +enum { + DB_ARCH_ABS = 0x001, + DB_ARCH_LOG = 0x004 +}; + +enum { + //DB_AFTER = 1, + DB_FIRST = 10, + DB_GET_BOTH = 11, + DB_LAST = 18, + DB_NEXT = 19, + DB_NEXT_DUP = 20, + DB_PREV = 27, + DB_SET = 30, + DB_SET_RANGE = 32, + DB_RMW = 0x40000000 +}; + +enum { + DB_KEYEMPTY = -30998, + DB_KEYEXIST = -30997, + DB_LOCK_DEADLOCK = -30996, + DB_NOTFOUND = -30991 +}; + + +enum { + DB_CREATE = 0x0000001, + DB_RDONLY = 0x0000010, + DB_RECOVER = 0x0000020, + DB_THREAD = 0x0000040, + DB_TXN_NOSYNC = 0x0000100, + + DB_PRIVATE = 0x0100000 +}; + +enum { + DB_LOCK_DEFAULT = 1, + DB_LOCK_OLDEST = 7, + DB_LOCK_RANDOM = 8 +}; + +enum { + DB_DUP = 0x000002 +}; + +enum { + DB_NOOVERWRITE = 23 +}; + +enum { + DB_INIT_LOCK = 0x001000, + DB_INIT_LOG = 0x002000, + DB_INIT_MPOOL = 0x004000, + DB_INIT_TXN = 0x008000 +}; + +int db_env_create (DB_ENV **, u_int32_t); + +int txn_begin (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t); +int txn_abort (DB_TXN *); + +int log_compare (const DB_LSN *, const DB_LSN *); + +#endif + +#if defined(__cplusplus) +} +#endif +#endif diff --git a/src-bdbwrap/wrapper-design.text b/src-bdbwrap/wrapper-design.text new file mode 100644 index 00000000000..200c519565a --- /dev/null +++ b/src-bdbwrap/wrapper-design.text @@ -0,0 +1,25 @@ +This directory provides a wrapper using the ydb db.h header file but +it calls the bdb internals. 
To get this to work requires a little +magic, since both BDB and YDB use the same type names. + +BDB helps with a ./configure option (--with-uniquename) that makes it +so that all the link-time symbols have different names. But the type +names and enums collide. Hence we cannot include both the +../include/db.h and the BDB db.h. + +To fix that we have a header ydb-uniq.h, which uses completely +different names for everything in the ydb interface. But those names +are compatible with the standard names: that is, all the structs have +the same layout. + +Thus we have the following three headers: + bdb_db.h which defines things like db_env_create_4001(DBENV **, ...) + + ydb_db.h which defines things like db_env_create(DBENV **, ...) (but the DBENV is a different type from BDB's) + bdbw.h which defines things like db_env_create_ydb(DBENV_ydb **, ...) + +bdbw.h can be included with bdb_db.h or ydb_db.h. +bdb_db.h and ydb_db.h cannot both be included in the same file. + +An application, such as mysql, includes db.h from ../include. +That is the ydb header file. 
diff --git a/src-bdbwrap/ydb-uniq.h b/src-bdbwrap/ydb-uniq.h new file mode 100644 index 00000000000..c60e7621098 --- /dev/null +++ b/src-bdbwrap/ydb-uniq.h @@ -0,0 +1,129 @@ +#ifndef _YDB_WRAP_H +#define _YDB_WRAP_H /* Rename table: each public ydb identifier X is mapped to X_ydb while ../include/db.h is included. */ + +#define DB_BTREE DB_BTREE_ydb +#define DB_NOTICE_LOGFILE_CHANGED DB_NOTICE_LOGFILE_CHANGED_ydb +#define DBTYPE DBTYPE_ydb +#define db_notices db_notices_ydb +#define txn_abort txn_abort_ydb +#define txn_begin txn_begin_ydb +#define txn_commit txn_commit_ydb +#define DB_VERB_CHKPOINT DB_VERB_CHKPOINT_ydb +#define DB_VERB_DEADLOCK DB_VERB_DEADLOCK_ydb +#define DB_VERB_RECOVERY DB_VERB_RECOVERY_ydb +#define DB DB_ydb +#define DB_BTREE_STAT DB_BTREE_STAT_ydb +#define DB_ENV DB_ENV_ydb +#define DB_KEY_RANGE DB_KEY_RANGE_ydb +#define DB_LSN DB_LSN_ydb +#define DB_TXN DB_TXN_ydb +#define DB_TXN_ACTIVE DB_TXN_ACTIVE_ydb +#define DB_TXN_STAT DB_TXN_STAT_ydb +#define DBC DBC_ydb +#define DBT DBT_ydb +#define DB_DBT_MALLOC DB_DBT_MALLOC_ydb +#define DB_DBT_REALLOC DB_DBT_REALLOC_ydb +#define DB_DBT_USERMEM DB_DBT_USERMEM_ydb +#define DB_DBT_DUPOK DB_DBT_DUPOK_ydb +#define DB_VERSION_STRING DB_VERSION_STRING_ydb +#define DB_ARCH_ABS DB_ARCH_ABS_ydb +#define DB_ARCH_LOG DB_ARCH_LOG_ydb +#define DB_FIRST DB_FIRST_ydb +#define DB_GET_BOTH DB_GET_BOTH_ydb +#define DB_LAST DB_LAST_ydb +#define DB_NEXT DB_NEXT_ydb +#define DB_NEXT_DUP DB_NEXT_DUP_ydb +#define DB_PREV DB_PREV_ydb +#define DB_SET DB_SET_ydb +#define DB_SET_RANGE DB_SET_RANGE_ydb +#define DB_RMW DB_RMW_ydb +#define DB_KEYEMPTY DB_KEYEMPTY_ydb +#define DB_KEYEXIST DB_KEYEXIST_ydb +#define DB_LOCK_DEADLOCK DB_LOCK_DEADLOCK_ydb +#define DB_NOTFOUND DB_NOTFOUND_ydb +#define DB_CREATE DB_CREATE_ydb +#define DB_RDONLY DB_RDONLY_ydb +#define DB_RECOVER DB_RECOVER_ydb +#define DB_THREAD DB_THREAD_ydb +#define DB_TXN_NOSYNC DB_TXN_NOSYNC_ydb +#define DB_PRIVATE DB_PRIVATE_ydb +#define DB_LOCK_DEFAULT DB_LOCK_DEFAULT_ydb +#define DB_LOCK_OLDEST DB_LOCK_OLDEST_ydb +#define DB_LOCK_RANDOM DB_LOCK_RANDOM_ydb +#define 
DB_DUP DB_DUP_ydb +#define DB_NOOVERWRITE DB_NOOVERWRITE_ydb +#define DB_INIT_LOCK DB_INIT_LOCK_ydb +#define DB_INIT_LOG DB_INIT_LOG_ydb +#define DB_INIT_MPOOL DB_INIT_MPOOL_ydb +#define DB_INIT_TXN DB_INIT_TXN_ydb +#define db_create db_create_ydb +#define db_env_create db_env_create_ydb +#define txn_begin txn_begin_ydb +#define txn_commit txn_commit_ydb +#define txn_abort txn_abort_ydb +#define log_compare log_compare_ydb + +#include "../include/db.h" +#undef DB_BTREE +#undef DB_NOTICE_LOGFILE_CHANGED +#undef DBTYPE +#undef db_notices +#undef txn_abort +#undef txn_begin +#undef txn_commit +#undef DB_VERB_CHKPOINT +#undef DB_VERB_DEADLOCK +#undef DB_VERB_RECOVERY +#undef DB +#undef DB_BTREE_STAT +#undef DB_ENV +#undef DB_KEY_RANGE +#undef DB_LSN +#undef DB_TXN +#undef DB_TXN_ACTIVE +#undef DB_TXN_STAT +#undef DBC +#undef DBT +#undef DB_DBT_MALLOC +#undef DB_DBT_REALLOC +#undef DB_DBT_USERMEM +#undef DB_DBT_DUPOK +#undef DB_VERSION_STRING +#undef DB_ARCH_ABS +#undef DB_ARCH_LOG +#undef DB_FIRST +#undef DB_GET_BOTH +#undef DB_LAST +#undef DB_NEXT +#undef DB_NEXT_DUP +#undef DB_PREV +#undef DB_SET +#undef DB_SET_RANGE +#undef DB_RMW +#undef DB_KEYEMPTY +#undef DB_KEYEXIST +#undef DB_LOCK_DEADLOCK +#undef DB_NOTFOUND +#undef DB_CREATE +#undef DB_RDONLY +#undef DB_RECOVER +#undef DB_THREAD +#undef DB_TXN_NOSYNC +#undef DB_PRIVATE +#undef DB_LOCK_DEFAULT +#undef DB_LOCK_OLDEST +#undef DB_LOCK_RANDOM +#undef DB_DUP +#undef DB_NOOVERWRITE +#undef DB_INIT_LOCK +#undef DB_INIT_LOG +#undef DB_INIT_MPOOL +#undef DB_INIT_TXN +#undef db_create +#undef db_env_create +#undef txn_begin +#undef txn_commit +#undef txn_abort +#undef log_compare + +#endif diff --git a/src-bdbwrap/ydb.c b/src-bdbwrap/ydb.c new file mode 100644 index 00000000000..fe4872027cd --- /dev/null +++ b/src-bdbwrap/ydb.c @@ -0,0 +1,117 @@ +/* This version is what Mysql calls. + * It invokes the version in bdbw. + * The version in bdbw then converts to Berkeley DB Calls. 
*/ +#include +/* This include is to the ydb include, which is what mysql sees. */ +#include +#include +#include +#include +#include +#include +#include "bdbw.h" + +#define barf() ({ fprintf(stderr, "YDB: BARF %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define barff(fmt,...) ({ fprintf(stderr, "YDB: BARF %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) +#define note() ({ fprintf(stderr, "YDB: Note %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define notef(fmt,...) ({ fprintf(stderr, "YDB: Note %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) + +int db_env_create (DB_ENV **envp, u_int32_t flags) { + return db_env_create_bdbw(envp, flags); +} + +int txn_abort (DB_TXN *txn) { + return txn_abort_bdbw(txn); +} + +int txn_begin (DB_ENV *env, DB_TXN *stxn, DB_TXN **txn, u_int32_t flags) { + return txn_begin_bdbw(env, stxn, txn, flags); +} + + +int txn_commit (DB_TXN *txn, u_int32_t flags) { + return txn_commit_bdbw(txn, flags); +} + + + +struct ydb_db_internal { + int foo; +}; + +void print_flags (u_int32_t flags) { + u_int32_t gotit=0; + int doneone=0; +#define doit(flag) if (flag & flags) { if (doneone) printf(" | "); printf("%s", #flag); doneone=1; gotit|=flag; } + printf(" flags="); + doit(DB_INIT_LOCK); + doit(DB_INIT_LOG); + doit(DB_INIT_MPOOL); + doit(DB_INIT_TXN); + doit(DB_CREATE); + doit(DB_THREAD); + doit(DB_RECOVER); + doit(DB_PRIVATE); + if (gotit!=flags) printf(" flags 0x%x not accounted for", flags&~gotit); + printf("\n"); +} + +int log_compare (const DB_LSN *a, const DB_LSN *b) { + fprintf(stderr, "%s:%d log_compare(%p,%p)\n", __FILE__, __LINE__, a, b); + abort(); +} + +static int yobi_db_close (DB *db, u_int32_t flags) { + barf(); + abort(); +} + +int yobi_db_cursor (DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) { + barf(); + abort(); +} + +int yobi_db_del (DB *db, DB_TXN *txn, DBT *dbt, u_int32_t flags) { + barf(); + abort(); +} + +int yobi_db_get (DB *db, DB_TXN *txn, 
DBT *dbta, DBT *dbtb, u_int32_t flags) { + barf(); + abort(); +} + +int yobi_db_key_range (DB *db, DB_TXN *txn, DBT *dbt, DB_KEY_RANGE *kr, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_open (DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { + notef("txn=%p fname=%s dbname=%s dbtype=%d flags=0x%x mode=0%o\n", txn, fname, dbname, dbtype, flags, mode); + print_flags(flags); + return 0; +} +int yobi_db_put (DB *db, DB_TXN *txn, DBT *dbta, DBT *dbtb, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_remove (DB *db, const char *fname, const char *dbname, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_rename (DB *db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_set_flags (DB *db, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_stat (DB *db, void *v, u_int32_t flags) { + barf(); + abort(); +} + +int db_create (DB **db, DB_ENV *env, u_int32_t flags) { + return db_create_bdbw(db, env, flags); +} + diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 00000000000..bbee2eaece8 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,6 @@ +CFLAGS = -W -Wall -Wno-unused -g +CPPFLAGS = -I../include -I../newbrt +ydb.o: ../include/db.h ../newbrt/cachetable.h +libdb.so: ydb.c + cc $(CPPFLAGS) ydb.c -shared -fPIC -o libdb.so $(CFLAGS) +libdb.a(ydb.o): ydb.o diff --git a/src/README b/src/README new file mode 100644 index 00000000000..7bc305e1b7a --- /dev/null +++ b/src/README @@ -0,0 +1,91 @@ +cd ~/yobiduck/ydb/src +make + + +cd ~/mysql/bdbi/mysql-5.0.27/ +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/ydb/src --prefix=/home/bradley/usr + +make + +make install + +#This one may not be needed +~/mysql/bdbi/usr/bin/mysql_install_db + +# +pushd /home/bradley/mysql/bdbi/usr/ ; 
/home/bradley/mysql/bdbi/usr//bin/mysqld_safe & +popd + +~/mysql/bdbi/usr/bin/mysql -u root +mysql> show databases; +mysql> create database yobitest; +mysql> use yobitest; +mysql> create table t1 (i int) engine=bdb; + +Look for the error in /home/bradley/mysql/bdbi/usr/var/yobert.err + + + +---- +This links right: + LD_LIBRARY_PATH=/home/bradley/mysql/bdbi/usr/lib/mysql/ ldd sql/mysqld +---- +This works, + LD_LIBRARY_PATH=/home/bradley/mysql/bdbi/usr/lib/mysql/ /home/bradley/mysql/bdbi/usr//bin/mysqld +producing the following in the log + +061208 16:11:35 InnoDB: Started; log sequence number 0 43655 +ydb.c:78 db_env_create flags=0 + +---- + the LD_RUN_PATH thing above works. + + +--- on laptop I did this instead: + +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/src --prefix=/home/bradley/usr + +make +make install +/home/bradley/usr/bin/mysql_install_db +/home/bradley/usr/bin/mysqld_safe & +/home/bradley/usr/bin/mysql -u root +~/mysql/bdbi/usr/bin/mysql -u root +mysql> show databases; +mysql> create database yobitest; +mysql> use yobitest; +mysql> create table t1 (i int) engine=bdb; + +Look for the error in ~/usr/var/localhost.localdomain.err + +--- +To clean up after a total screwup: + +rm -rf ~/usr/var/ + +Didn't manage to clean it up very well. + +--- +Goal: compile mysql with debugging + +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure CFLAGS="-g -O2" --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/src --prefix=/home/bradley/usr + +If you want to debug, you might need to start mysqld without using mysqld_safe. 
+Here is one way to do it: + +gdb ~/usr/libexec/mysqld +(gdb) run --basedir=/home/bradley/usr --datadir=/home/bradley/usr/var --pid-file=/home/bradley/usr/var/localhost.localdomain.pid --skip-external-locking + + +That was screwed up (the configure args were wrong.) Try again: + +--- +export LD_RUN_PATH=/home/bradley/yobiduck/ydb/src +./configure CXXFLAGS="-g -O2" CFLAGS="-g -O2" --with-berkeley-db-includes=/home/bradley/yobiduck/ydb/include --with-berkeley-db --with-berkeley-db-libs=/home/bradley/yobiduck/ydb/src --prefix=/home/bradley/usr + +gdb ~/usr/libexec/mysqld +(gdb) run --basedir=/home/bradley/usr --datadir=/home/bradley/usr/var --pid-file=/home/bradley/usr/var/localhost.localdomain.pid --skip-external-locking + diff --git a/src/ydb.c b/src/ydb.c new file mode 100644 index 00000000000..24cf6a4a310 --- /dev/null +++ b/src/ydb.c @@ -0,0 +1,420 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cachetable.h" + + +struct db_header { + int n_databases; // Or there can be >=1 named databases. This is the count. + char *database_names; // These are the names + BRT *database_brts; // These +}; + +struct ydb_db_internal { + int freed; + int (*bt_compare)(DB *, const DBT *, const DBT *); + struct db_header *header; + int database_number; // -1 if it is the single unnamed database. Nonnengative number otherwise. + DB_ENV *env; + char *full_fname; + char *database_name; + //int fd; + u_int32_t open_flags; + int open_mode; + BRT brt; +}; + +void yobi_db_env_err (const DB_ENV *env __attribute__((__unused__)), int error, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + fprintf(stderr, "YDB Error %d:", error); + vfprintf(stderr, fmt, ap); + va_end(ap); +} + +#define barf() ({ fprintf(stderr, "YDB: BARF %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define barff(fmt,...) 
({ fprintf(stderr, "YDB: BARF %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) +#define note() ({ fprintf(stderr, "YDB: Note %s:%d in %s\n", __FILE__, __LINE__, __func__); }) +#define notef(fmt,...) ({ fprintf(stderr, "YDB: Note %s:%d in %s, ", __FILE__, __LINE__, __func__); fprintf(stderr, fmt, __VA_ARGS__); }) + +void print_flags (u_int32_t flags) { + u_int32_t gotit=0; + int doneone=0; +#define doit(flag) if (flag & flags) { if (doneone) printf(" | "); printf("%s", #flag); doneone=1; gotit|=flag; } + printf(" flags="); + doit(DB_INIT_LOCK); + doit(DB_INIT_LOG); + doit(DB_INIT_MPOOL); + doit(DB_INIT_TXN); + doit(DB_CREATE); + doit(DB_THREAD); + doit(DB_RECOVER); + doit(DB_PRIVATE); + if (gotit!=flags) printf(" flags 0x%x not accounted for", flags&~gotit); + printf("\n"); +} + +struct db_env_ydb_internal { + u_int32_t open_flags; + int open_mode; + void (*errcall)(const char *, char *); + const char *errpfx; + char *dir; /* A malloc'd copy of the directory. */ + char *tmp_dir; + void (*noticecall)(DB_ENV *, db_notices); + int n_files; + int files_array_limit; // How big is *files ? + struct ydb_file **files; + CACHETABLE cachetable; +}; + +int yobi_db_env_open (DB_ENV *env, const char *home, u_int32_t flags, int mode) { + int r; + notef("(%p, \"%s\", 0x%x, 0%o)\n", env, home, flags, mode); + env->i->dir = strdup(home); + env->i->open_flags = flags; + env->i->open_mode = mode; + + print_flags(flags); + assert(DB_PRIVATE & flags); // This means that we don't have to do anything with shared memory. And that's good enough for mysql. 
+ + r = brt_create_cachetable(&env->i->cachetable, 32); + assert(r==0); + return 0; +} +int yobi_db_env_close (DB_ENV * env, u_int32_t flags) { + barf(); + return 1; +} +int yobi_db_env_log_archive (DB_ENV *env, char **list[], u_int32_t flags) { + *list = NULL; + return 0; +} +int yobi_db_env_log_flush (DB_ENV * env, const DB_LSN * lsn) { + barf(); + return 1; +} +int yobi_db_env_set_cachesize (DB_ENV * env, u_int32_t gbytes, u_int32_t bytes, int ncache) { + barf(); + return 1; +} +int yobi_db_env_set_data_dir (DB_ENV * env, const char *dir) { + barf(); + return 1; +} +void yobi_db_env_set_errcall (DB_ENV *env, void (*errcall)(const char *, char *)) { + env->i->errcall=errcall; +} +void yobi_db_env_set_errpfx (DB_ENV * env, const char *errpfx) { + env->i->errpfx = strdup(errpfx); +} +int yobi_db_env_set_flags (DB_ENV *env, u_int32_t flags, int onoff) { + barf(); + return 1; +} +int yobi_db_env_set_lg_bsize (DB_ENV * env, u_int32_t bsize) { + barf(); + return 1; +} +int yobi_db_env_set_lg_dir (DB_ENV * env, const char * dir) { + barf(); + return 1; +} +int yobi_db_env_set_lg_max (DB_ENV *env, u_int32_t lg_max) { + barf(); + return 1; +} +int yobi_db_env_set_lk_detect (DB_ENV *env, u_int32_t detect) { + barf(); + return 1; +} +int yobi_db_env_set_lk_max (DB_ENV *env, u_int32_t lk_max) { + barf(); + return 1; +} +void yobi_db_env_set_noticecall (DB_ENV *env, void (*noticecall)(DB_ENV *, db_notices)) { + env->i->noticecall = noticecall; +} +int yobi_db_env_set_tmp_dir (DB_ENV * env, const char *tmp_dir) { + env->i->tmp_dir = strdup(tmp_dir); + return 0; +} +int yobi_db_env_set_verbose (DB_ENV *env, u_int32_t which, int onoff) { + barf(); + return 1; +} +int yobi_db_env_txn_checkpoint (DB_ENV *env, u_int32_t kbyte, u_int32_t min, u_int32_t flags) { + return 0; +} + +int yobi_db_env_txn_stat (DB_ENV *env, DB_TXN_STAT **statp, u_int32_t flags) { + barf(); + return 1; +} + +void yobi_default_errcall(const char *errpfx, char *msg) { + fprintf(stderr, "YDB: %s: %s", errpfx, 
msg); +} + +int db_env_create (DB_ENV **envp, u_int32_t flags) { + DB_ENV *result=malloc(sizeof(*result)); + fprintf(stderr, "%s:%d db_env_create flags=%d, returning %p\n", __FILE__, __LINE__, flags, result); + result->err = yobi_db_env_err; + result->open = yobi_db_env_open; + result->close = yobi_db_env_close; + result->txn_checkpoint = yobi_db_env_txn_checkpoint; + result->log_flush = yobi_db_env_log_flush; + result->set_errcall = yobi_db_env_set_errcall; + result->set_errpfx = yobi_db_env_set_errpfx; + result->set_noticecall = yobi_db_env_set_noticecall; + result->set_flags = yobi_db_env_set_flags; + result->set_data_dir = yobi_db_env_set_data_dir; + result->set_tmp_dir = yobi_db_env_set_tmp_dir; + result->set_verbose = yobi_db_env_set_verbose; + result->set_lg_bsize = yobi_db_env_set_lg_bsize; + result->set_lg_dir = yobi_db_env_set_lg_dir; + result->set_lg_max = yobi_db_env_set_lg_max; + result->set_cachesize = yobi_db_env_set_cachesize; + result->set_lk_detect = yobi_db_env_set_lk_detect; + result->set_lk_max = yobi_db_env_set_lk_max; + result->log_archive = yobi_db_env_log_archive; + result->txn_stat = yobi_db_env_txn_stat; + result->txn_begin = txn_begin; + + result->i = malloc(sizeof(*result->i)); + result->i->dir = 0; + result->i->noticecall = 0; + result->i->tmp_dir = 0; + + result->i->errcall = yobi_default_errcall; + result->i->errpfx = ""; + + result->i->n_files = 0; + result->i->files_array_limit = 4; + result->i->files = malloc(result->i->files_array_limit*sizeof(*result->i->files)); + + *envp = result; + return 0; +} + + +int yobi_db_txn_commit (DB_TXN *txn, u_int32_t flags) { + notef("flags=%d\n", flags); + return 0; +} + +u_int32_t yobi_db_txn_id (DB_TXN *txn) { + barf(); + abort(); +} + +int txn_begin (DB_ENV *env, DB_TXN *stxn, DB_TXN **txn, u_int32_t flags) { + DB_TXN *result = malloc(sizeof(*result)); + notef("parent=%p flags=0x%x\n", stxn, flags); + result->commit = yobi_db_txn_commit; + result->id = yobi_db_txn_id; + *txn = result; + return 
0; +} + +int txn_abort (DB_TXN *txn) { + fprintf(stderr, "txn_abort(%p)\n", txn); + abort(); +} + +int txn_commit (DB_TXN *txn, u_int32_t flags) { + fprintf(stderr, "%s:%d txn_commit(%p,%ud)\n", __FILE__, __LINE__, txn, flags); + abort(); +} + +int log_compare (const DB_LSN *a, const DB_LSN *b) { + fprintf(stderr, "%s:%d log_compare(%p,%p)\n", __FILE__, __LINE__, a, b); + abort(); +} + +int yobi_db_close (DB *db, u_int32_t flags) { + int r = close_brt(db->i->brt); + printf("%s:%d %d=yobi_db_close(%p)\n", __FILE__, __LINE__, r, db); + db->i->freed = 1; + return r; +} + +struct yobi_dbc_internal { + BRT_CURSOR c; + DB *db; +}; + +int yobi_c_get (DBC *c, DBT *key, DBT *data, u_int32_t flag) { + return brt_c_get(c->i->c, key, data, flag); +} + +int yobi_c_close (DBC *c) { + int r = brt_cursor_close(c->i->c); + printf("%s:%d %d=yobi_c_close(%p)\n", __FILE__, __LINE__, r, c); + return r; +} + +int yobi_c_del (DBC *c, u_int32_t flags) { + barf(); + return 0; +} + +int yobi_db_cursor (DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) { + DBC *result=malloc(sizeof(*result)); + int r; + assert(result); + result->c_get = yobi_c_get; + result->c_close = yobi_c_close; + result->c_del = yobi_c_del; + result->i = malloc(sizeof(*result->i)); + result->i->db = db; + r = brt_cursor(db->i->brt, &result->i->c); + assert(r==0); + *c = result; + return 0; +} + +int yobi_db_del (DB *db, DB_TXN *txn, DBT *dbt, u_int32_t flags) { + barf(); + abort(); +} + +int yobi_db_get (DB *db, DB_TXN *txn, DBT *dbta, DBT *dbtb, u_int32_t flags) { + barf(); + abort(); +} + +int yobi_db_key_range (DB *db, DB_TXN *txn, DBT *dbt, DB_KEY_RANGE *kr, u_int32_t flags) { + barf(); + abort(); +} + +char *construct_full_name (const char *dir, const char *fname) { + if (fname[0]=='/') + dir = ""; + { + int dirlen = strlen(dir); + int fnamelen = strlen(fname); + int len = dirlen+fnamelen+2; // One for the / between (which may not be there). One for the trailing null. 
+ char *result = malloc(len); + int l; + printf("%s:%d len(%d)=%d+%d+2\n", __FILE__, __LINE__, len, dirlen, fnamelen); + assert(result); + l=snprintf(result, len, "%s", dir); + if (l==0 || result[l-1]!='/') { + /* Didn't put a slash down. */ + if (fname[0]!='/') { + result[l++]='/'; + result[l]=0; + } + } + l+=snprintf(result+l, len-l, "%s", fname); + return result; + } +} + +// The decision to embedded subdatabases in files is a little bit painful. +// My original design was to simply create another file, but it turns out that we +// have to inherit mode bits and so forth from the first file that was created. +// Other problems may ensue (who is responsible for deleting the file? That's not so bad actually.) +// This suggests that we really need to put the multiple databases into one file. +int yobi_db_open (DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { + // Warning. Should check arguments. Should check return codes on malloc and open and so forth. + + int openflags=0; + int r; + notef("txn=%p fname=%s dbname=%s dbtype=%d flags=0x%x mode=0%o\n", txn, fname, dbname, dbtype, flags, mode); + print_flags(flags); + if (db->i->full_fname) return -1; /* It was already open. */ + db->i->full_fname = construct_full_name(db->i->env->i->dir, fname); + printf("Full name = %s\n", db->i->full_fname); + db->i->database_name = strdup(dbname); + + if (flags&DB_RDONLY) openflags |= O_RDONLY; + else openflags |= O_RDWR; + + if (flags&DB_CREATE) openflags |= O_CREAT; + + { + struct stat statbuf; + if (stat(db->i->full_fname, &statbuf)==0) { + /* If the database exists at the file level, and we specified no db_name, then complain here. */ + if (dbname==0 && (flags&DB_CREATE)) return EEXIST; + } else { + if (!(flags&DB_CREATE)) return ENOENT; + } + } + + db->i->open_flags = flags; + db->i->open_mode = mode; + // Warning: new_brt has deficienceis: + // Each tree has its own cache, instead of a big shared cache. 
+ // It doesn't do error checking on insert. + // It's tough to do cursors. + r=open_brt(db->i->full_fname, dbname, (flags&DB_CREATE), &db->i->brt, 1<<20, db->i->env->i->cachetable); + assert(r==0); + return 0; +} +int yobi_db_put (DB *db, DB_TXN *txn, DBT *dbta, DBT *dbtb, u_int32_t flags) { + int r = brt_insert(db->i->brt, dbta->data, dbta->size, dbtb->data, dbtb->size); + printf("%s:%d %d=yobi_db_put(...)\n", __FILE__, __LINE__, r); + return r; +} +int yobi_db_remove (DB *db, const char *fname, const char *dbname, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_rename (DB *db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) { + barf(); + abort(); +} +int yobi_db_set_bt_compare (DB *db, int (*bt_compare)(DB *, const DBT *, const DBT *)) { + note(); + db->i->bt_compare = bt_compare; + return 0; +} +int yobi_db_set_flags (DB *db, u_int32_t flags) { + assert(flags==0); + return 0; +} +int yobi_db_stat (DB *db, void *v, u_int32_t flags) { + barf(); + abort(); +} + +int db_create (DB **db, DB_ENV *env, u_int32_t flags) { + DB *result=malloc(sizeof(*result)); + fprintf(stderr, "%s:%d db_create(%p, %p, 0x%x)\n", __FILE__, __LINE__, db, env, flags); + print_flags(flags); + result->app_private = 0; + result->close = yobi_db_close; + result->cursor = yobi_db_cursor; + result->del = yobi_db_del; + result->get = yobi_db_get; + result->key_range = yobi_db_key_range; + result->open = yobi_db_open; + result->put = yobi_db_put; + result->remove = yobi_db_remove; + result->rename = yobi_db_rename; + result->set_bt_compare = yobi_db_set_bt_compare; + result->set_flags = yobi_db_set_flags; + result->stat = yobi_db_stat; + result->i = malloc(sizeof(*result->i)); + result->i->env = env; + result->i->bt_compare = 0; + result->i->freed = 0; + result->i->full_fname = 0; + *db = result; + return 0; +} diff --git a/utils/ydb_dump.c b/utils/ydb_dump.c new file mode 100644 index 00000000000..39e6800b684 --- /dev/null +++ b/utils/ydb_dump.c @@ -0,0 +1,10 @@ 
+#include "../include/db.h" +#include +#include + +int main (int argc, char *argv[]) { /* Skeleton of the dump utility: it only validates and captures its single argument. */ + char *fname; + assert(argc==2); /* program name and exactly one user-supplied argument */ + fname = argv[1]; /* presumably the database file to dump (TODO confirm); it is stored but nothing is opened or printed yet */ + +}