From 629898864254401523333f34be11d69ded4c9bdf Mon Sep 17 00:00:00 2001 From: "Bradley C. Kuszmaul" Date: Mon, 7 Apr 2008 01:30:25 +0000 Subject: [PATCH] Merge the tokudb.558 branch back to to the main branch with: {{{ svn merge -r3272:3320 https://svn.tokutek.com/tokudb/tokudb.558 }}} No conflicts. git-svn-id: file:///svn/tokudb@3322 c7de825b-a66e-492c-adef-691d508d4ae1 --- newbrt/Makefile | 13 +- newbrt/brt-internal.h | 32 +- newbrt/brt-serialize.c | 57 ++- newbrt/brt-test-helpers.c | 29 +- newbrt/brt-test.c | 49 -- newbrt/brt-test2.c | 11 +- newbrt/brt-test3.c | 1 + newbrt/brt-test4.c | 4 + newbrt/brt-verify.c | 54 ++- newbrt/brt.c | 861 ++++++++++++++++++++++++----------- newbrt/brtdump.c | 17 +- newbrt/brttypes.h | 11 +- newbrt/cachetable.c | 7 +- newbrt/fingerprint.c | 18 +- newbrt/gpma.c | 69 ++- newbrt/gpma.h | 12 +- newbrt/leafentry.c | 583 ++++++++++++++++++++---- newbrt/leafentry.h | 101 +++- newbrt/log.c | 59 ++- newbrt/log.h | 6 +- newbrt/logformat.c | 74 ++- newbrt/memory.h | 5 +- newbrt/mempool.c | 6 +- newbrt/rbuf.h | 8 +- newbrt/recover.c | 107 ++++- newbrt/roll.c | 109 ++--- newbrt/test-del-inorder.c | 2 +- src/tests/Makefile | 4 +- src/tests/test_abort1.c | 14 +- src/tests/test_abort2.c | 12 +- src/tests/test_dup_delete.c | 2 +- src/tests/test_log6a_abort.c | 1 + src/ydb.c | 3 + 33 files changed, 1670 insertions(+), 671 deletions(-) diff --git a/newbrt/Makefile b/newbrt/Makefile index 1340f7aa5a9..68f1a276401 100644 --- a/newbrt/Makefile +++ b/newbrt/Makefile @@ -58,6 +58,7 @@ REGRESSION_TESTS = \ brt-test2 \ brt-test3 \ brt-test4 \ + brt-test5 \ cachetable-test \ cachetable-test2 \ fifo-test \ @@ -75,6 +76,7 @@ REGRESSION_TESTS = \ test-gpma-blackbox \ test-gpma-glassbox \ test-gpma-glassbox \ + test-gpma-leftmost-dup \ test-inc-split \ test-primes \ test_oexcl \ @@ -177,7 +179,7 @@ check-fanout: let BRT_FANOUT=BRT_FANOUT+1; \ done -log-test log-test2 log-test3 log-test4 log-test5 log-test6 benchmark-test brt-test brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz +log-test log-test2 log-test3 log-test4 log-test5 log-test6 benchmark-test brt-test brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz # pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h gpma.h brt.h brt-search.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h mempool.h leafentry.h @@ -193,18 +195,19 @@ pma.o: gpma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../i test-gpma-glassbox.o: test-gpma-glassbox.c gpma.h gpma-internal.h toku_assert.h memory.h test-gpma-glassbox: test-gpma-glassbox.o toku_assert.o memory-debug.o gpma.o test-gpma-blackbox: test-gpma-blackbox.o toku_assert.o memory.o gpma.o -test-gpma-blackbox.o: test-gpma-blackbox.c gpma.h memory.h toku_assert.h test-gpma-worstinsert: test-gpma-worstinsert.o toku_assert.o memory.o gpma.o -test-gpma-worstinsert.o test-gpma-blackbox.o: gpma.h memory.h toku_assert.h +test-gpma-leftmost-dup: test-gpma-leftmost-dup.o toku_assert.o memory.o gpma.o +test-gpma-worstinsert.o test-gpma-blackbox.o test-gpma-leftmost-dup.o: gpma.h memory.h toku_assert.h +: gpma.h memory.h toku_assert.h gpma.o: gpma.c gpma.h ybt.o: ybt.h brttypes.h ../include/db.h ybt-test: ybt-test.o ybt.o memory.o toku_assert.o ybt-test.o: ybt.h ../include/db.h cachetable.o: cachetable.h hashfun.h memory.h -brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 brt-test: ybt.o brt.o fifo.o gpma.o leafentry.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o +brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 brt-test: ybt.o brt.o fifo.o gpma.o leafentry.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES) logformat: logformat.o toku_assert.o -brt-test0.o brt-test1.o brt-test2.o brt-test3.o brt-test4.o test-brt-overflow.h brt-test-named-db.o brt-test-cursor.o brt-test-cursor-2.o brt-test.o brt.o: brt.h brt-search.h ../include/db.h fifo.h gpma.h brttypes.h cachetable.h memory.h $(BRT_INTERNAL_H_INCLUDES) +brt-test0.o brt-test1.o brt-test2.o brt-test3.o brt-test4.o brt-test5.o test-brt-overflow.h brt-test-named-db.o brt-test-cursor.o brt-test-cursor-2.o brt-test.o brt.o: brt.h brt-search.h ../include/db.h fifo.h gpma.h brttypes.h cachetable.h memory.h $(BRT_INTERNAL_H_INCLUDES) brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES) brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h fifo.o: fifo.h brttypes.h diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index e8ef508d2a2..182b5eeb191 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -160,8 +160,8 @@ extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt); static const BRTNODE null_brtnode=0; -extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen); -extern u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp); +//extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen); +//extern u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp); extern u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen); extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD cmd); @@ -193,25 +193,17 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, DISKOFF diskoff, enum brt_cmd_typ int toku_set_func_fsync (int (*fsync_function)(int)); -/* allocate a kv pair from a kv memory pool */ -//static inline struct kv_pair *kv_pair_malloc_mempool(const void *key, int keylen, const void *val, int vallen, struct mempool *mp) { -// struct kv_pair *kv = toku_mempool_malloc(mp, sizeof (struct kv_pair) + keylen + vallen, 4); -// if (kv) -// kv_pair_init(kv, key, keylen, val, vallen); -// return kv; -//} - -static inline struct kv_pair *brtnode_malloc_kv_pair (GPMA pma, struct mempool *mp, const void *key, unsigned int keylen, const void *val, unsigned int vallen) { - struct kv_pair *kv = mempool_malloc_from_gpma(pma, mp, sizeof (struct kv_pair) + keylen + vallen); - kv_pair_init(kv, key, keylen, val, vallen); - return kv; -} - -// used for the leaf compare fun -struct lc_pair { +// These two go together to do lookups in a brtnode using the keys in a command. +struct cmd_leafval_bessel_extra { BRT t; - int compare_both; // compare_both is set if it is a DUPSORT database and both keys are needed (e.g, for DB_DELETE_ANY) + BRT_CMD cmd; + int compare_both_keys; // Set to 1 for DUPSORT databases that are not doing a DELETE_BOTH }; -int toku_brtleaf_compare_fun (u_int32_t alen __attribute__((__unused__)), void *aval, u_int32_t blen __attribute__((__unused__)), void *bval, void *lc /*this is (struct lc_pair *) cast to (void*). */) ; +int toku_cmd_leafval_bessel (u_int32_t dlen, void *leafentry, void *extra); + +int toku_brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger); + +int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp); +void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size); #endif diff --git a/newbrt/brt-serialize.c b/newbrt/brt-serialize.c index 11fbccae3ae..f6fafec5f0c 100644 --- a/newbrt/brt-serialize.c +++ b/newbrt/brt-serialize.c @@ -62,9 +62,8 @@ static unsigned int toku_serialize_brtnode_size_slow(BRTNODE node) { GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, ({ - struct kv_pair *p=vdata; - assert(vlen==sizeof(*p)+kv_pair_keylen(p)+kv_pair_vallen(p)); - hsize+=PMA_ITEM_OVERHEAD+KEY_VALUE_OVERHEAD+kv_pair_keylen(p)+kv_pair_vallen(p); + LEAFENTRY le=vdata; + hsize+= PMA_ITEM_OVERHEAD + leafentry_disksize(le); })); assert(hsize==node->u.l.n_bytes_in_buffer); hsize+=4; /* the PMA size */ @@ -97,7 +96,7 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) { return result; } -void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) { +void toku_serialize_brtnode_to (int fd, DISKOFF off, DISKOFF size, BRTNODE node) { //printf("%s:%d serializing\n", __FILE__, __LINE__); struct wbuf w; int i; @@ -105,7 +104,7 @@ void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) assert(calculated_size<=size); //char buf[size]; char *MALLOC_N(size,buf); - toku_verify_counts(node); + //toku_verify_counts(node); assert(size>0); wbuf_init(&w, buf, size); //printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]); @@ -174,19 +173,14 @@ void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) assert(check_local_fingerprint==node->local_fingerprint); } } else { - //printf(" n_entries=%d\n", toku_pma_n_entries(node->u.l.buffer)); + //printf("%s:%d writing node %lld n_entries=%d\n", __FILE__, __LINE__, node->thisnodename, toku_gpma_n_entries(node->u.l.buffer)); wbuf_uint(&w, toku_gpma_n_entries(node->u.l.buffer)); wbuf_uint(&w, toku_gpma_index_limit(node->u.l.buffer)); GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, ({ - struct kv_pair *p=vdata; - assert((char*)node->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); - u_int32_t keylen=kv_pair_keylen(p); - u_int32_t datalen=kv_pair_vallen(p); - assert(vlen==sizeof(*p)+keylen+datalen); + //printf(" %s:%d idx=%d\n", __FILE__, __LINE__, idx); wbuf_uint(&w, idx); - wbuf_bytes(&w, kv_pair_key(p), keylen); - wbuf_bytes(&w, kv_pair_val(p), datalen); + wbuf_LEAFENTRY(&w, vdata); })); } assert(w.ndone<=w.size); @@ -343,7 +337,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign int diff; bytevec key; ITEMLEN keylen; bytevec val; ITEMLEN vallen; - toku_verify_counts(result); + //toku_verify_counts(result); int type = rbuf_char(&rc); TXNID xid = rbuf_ulonglong(&rc); rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */ @@ -387,19 +381,24 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign } u_int32_t actual_sum = 0; + //printf("%s:%d node %lld, reading %d items\n", __FILE__, __LINE__, off, n_in_buf); for (i=0; iu.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD + PMA_ITEM_OVERHEAD; - struct kv_pair *pair = brtnode_malloc_kv_pair(result->u.l.buffer, &result->u.l.buffer_mempool, key, keylen, val, vallen); - assert(pair); - int pairlen = kv_pair_size(pair); - toku_gpma_set_at_index(result->u.l.buffer, idx, pairlen, pair); - actual_sum += result->rand4fingerprint*toku_calccrc32_kvpair_struct(pair); -// printf("%s:%d rand4=%08x actual=%08x this=%08x expect=%08x\n", __FILE__, __LINE__, result->rand4fingerprint, actual_sum, toku_calccrc32_kvpair_struct(pair), result->local_fingerprint); + //printf("%s:%d idx=%d\n", __FILE__, __LINE__, idx); + u_int32_t memsize, disksize; + rbuf_LEAFENTRY(&rc, &memsize, &disksize, &tmp_le); + LEAFENTRY le = mempool_malloc_from_gpma(result->u.l.buffer, &result->u.l.buffer_mempool, memsize); + assert(le); + memcpy(le, tmp_le, memsize); + toku_free(tmp_le); + assert(disksize==leafentry_disksize(le)); + result->u.l.n_bytes_in_buffer += disksize + PMA_ITEM_OVERHEAD; + //printf("idx=%d\n", idx); + toku_gpma_set_at_index(result->u.l.buffer, idx, memsize, le); + actual_sum += result->rand4fingerprint*toku_le_crc(le); + //printf("%s:%d rand4=%08x fp=%08x \n", __FILE__, __LINE__, result->rand4fingerprint, actual_sum); } if (r!=0) goto died_21; @@ -411,7 +410,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign //fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height); } - toku_verify_counts(result); + //toku_verify_counts(result); } { unsigned int n_read_so_far = rc.ndone; @@ -430,7 +429,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign //printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children); toku_free(rc.buf); *brtnode = result; - toku_verify_counts(result); + //toku_verify_counts(result); return 0; } @@ -444,8 +443,8 @@ void toku_verify_counts (BRTNODE node) { GPMA_ITERATE(node->u.l.buffer, idx, dlen, ddata, ({ count++; - sum+=(PMA_ITEM_OVERHEAD + dlen); - fp += toku_calccrc32_kvpair_struct(ddata); + sum+= PMA_ITEM_OVERHEAD + leafentry_disksize(ddata); // use the disk size, not the memory size. + fp += toku_le_crc(ddata); })); assert(count==toku_gpma_n_entries(node->u.l.buffer)); assert(sum==node->u.l.n_bytes_in_buffer); diff --git a/newbrt/brt-test-helpers.c b/newbrt/brt-test-helpers.c index 6699ffa28ac..5aacc296b2c 100644 --- a/newbrt/brt-test-helpers.c +++ b/newbrt/brt-test-helpers.c @@ -76,27 +76,38 @@ int toku_testsetup_insert_to_leaf (BRT brt, DISKOFF diskoff, char *key, int keyl toku_verify_counts(node); assert(node->height==0); - struct kv_pair *kv = brtnode_malloc_kv_pair(node->u.l.buffer, &node->u.l.buffer_mempool, key, keylen, val, vallen); - struct lc_pair lc = {brt, node->flags & TOKU_DB_DUPSORT}; + u_int32_t lesize, disksize; + LEAFENTRY tmp_leafentry; + r = le_committed(keylen, key, vallen, val, &lesize, &disksize, &tmp_leafentry); + + LEAFENTRY leafentry = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, lesize); + memcpy(leafentry, tmp_leafentry, lesize); + toku_free(tmp_leafentry); + u_int32_t storedlen; void *storeddata; u_int32_t idx; - r = toku_gpma_lookup_item(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, &storedlen, &storeddata, &idx); + DBT keydbt,valdbt; + BRT_CMD_S cmd = {BRT_INSERT, 0, .u.id={toku_fill_dbt(&keydbt, key, keylen), + toku_fill_dbt(&valdbt, val, vallen)}}; + struct cmd_leafval_bessel_extra be = {brt, &cmd, node->flags & TOKU_DB_DUPSORT}; + r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be, &storedlen, &storeddata, &idx); + if (r==0) { // It's already there. So now we have to remove it and put the new one back in. - node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + storedlen; - node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair_struct(storeddata); + node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(storeddata); + node->local_fingerprint -= node->rand4fingerprint*toku_le_crc(storeddata); toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, storedlen); // Now put the new kv in. - toku_gpma_set_at_index(node->u.l.buffer, idx, kv_pair_size(kv), kv); + toku_gpma_set_at_index(node->u.l.buffer, idx, lesize, leafentry); } else { - r = toku_gpma_insert(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, 0, 0, 0); + r = toku_gpma_insert_bessel(node->u.l.buffer, lesize, leafentry, toku_cmd_leafval_bessel, &be, 0, 0, 0); assert(r==0); } - node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + kv_pair_size(kv); - node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_kvpair_struct(kv); + node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + disksize; + node->local_fingerprint += node->rand4fingerprint*toku_le_crc(leafentry); node->dirty=1; *subtree_fingerprint = node->local_fingerprint; diff --git a/newbrt/brt-test.c b/newbrt/brt-test.c index cea75465854..759e0ed8940 100644 --- a/newbrt/brt-test.c +++ b/newbrt/brt-test.c @@ -21,53 +21,6 @@ static TOKUTXN const null_txn = 0; static DB * const null_db = 0; -static void test5 (void) { - int r; - BRT t; - int limit=100000; - int *values; - int i; - CACHETABLE ct; - char fname[]="testbrt.brt"; - toku_memory_check_all_free(); - MALLOC_N(limit,values); - for (i=0; i=0) { - char key[100], valexpected[100]; - DBT k,v; - if (i%1000==0 && verbose) { printf("r"); fflush(stdout); } - snprintf(key, 100, "key%d", rk); - snprintf(valexpected, 100, "val%d", values[rk]); - r = toku_brt_lookup(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_init_dbt(&v)); - assert(r==0); - assert(v.size==(1+strlen(valexpected))); - assert(memcmp(v.data,valexpected,v.size)==0); - } - } - if (verbose) printf("\n"); - toku_free(values); - r = toku_close_brt(t); assert(r==0); - r = toku_cachetable_close(&ct); assert(r==0); - toku_memory_check_all_free(); -} - static void test_dump_empty_db (void) { BRT t; CACHETABLE ct; @@ -1518,8 +1471,6 @@ static void brt_blackbox_test (void) { toku_memory_check_all_free(); test_multiple_dbs(); toku_memory_check_all_free(); - if (verbose) printf("test5\n"); - test5(); if (verbose) printf("test_multiple_files\n"); test_multiple_files(); diff --git a/newbrt/brt-test2.c b/newbrt/brt-test2.c index da971e21543..fc415a100da 100644 --- a/newbrt/brt-test2.c +++ b/newbrt/brt-test2.c @@ -10,7 +10,7 @@ static TOKUTXN const null_txn = 0; static DB * const null_db = 0; -static void test2 (int memcheck) { +static void test2 (int memcheck, int limit) { BRT t; int r; int i; @@ -24,7 +24,7 @@ static void test2 (int memcheck) { r = toku_open_brt(fname, 0, 1, &t, 1024, ct, null_txn, toku_default_compare_fun, null_db); if (verbose) printf("%s:%d did setup\n", __FILE__, __LINE__); assert(r==0); - for (i=0; i<4096; i++) { + for (i=0; ilocal_fingerprint); } else { - gpma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->local_fingerprint); + toku_verify_counts(node); } } +static int compare_pairs (BRT brt, struct kv_pair *a, struct kv_pair *b) { + DBT x,y; + int cmp = brt->compare_fun(brt->db, + toku_fill_dbt(&x, kv_pair_key(a), kv_pair_keylen(a)), + toku_fill_dbt(&y, kv_pair_key(b), kv_pair_keylen(b))); + if (cmp==0 && (brt->flags & TOKU_DB_DUPSORT)) { + cmp = brt->dup_compare(brt->db, + toku_fill_dbt(&x, kv_pair_val(a), kv_pair_vallen(a)), + toku_fill_dbt(&y, kv_pair_val(b), kv_pair_vallen(b))); + } + return cmp; +} +static int compare_leafentries (BRT brt, LEAFENTRY a, LEAFENTRY b) { + DBT x,y; + int cmp = brt->compare_fun(brt->db, + toku_fill_dbt(&x, le_any_key(a), le_any_keylen(a)), + toku_fill_dbt(&y, le_any_key(b), le_any_keylen(b))); + if (cmp==0 && (brt->flags & TOKU_DB_DUPSORT)) { + cmp = brt->dup_compare(brt->db, + toku_fill_dbt(&x, le_any_val(a), le_any_vallen(a)), + toku_fill_dbt(&y, le_any_val(b), le_any_vallen(b))); + } + return cmp; +} + int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) { int result=0; BRTNODE node; @@ -56,7 +72,7 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b verify_local_fingerprint(node); if (node->height>0) { int i; - for (i=0; i< node->u.n.n_children-1; i++) { + for (i=0; i< node->u.n.n_children; i++) { bytevec thislorange,thishirange; ITEMLEN thislolen, thishilen; if (node->u.n.n_children==0 || i==0) { @@ -89,8 +105,14 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b toku_fifo_iterate(BNC_BUFFER(node,i), verify_pair, 0); } } + //if (lorange) printf("%s:%d lorange=%s\n", __FILE__, __LINE__, (char*)lorange); + //if (hirange) printf("%s:%d lorange=%s\n", __FILE__, __LINE__, (char*)hirange); + for (i=0; iu.n.n_children-2; i++) { + assert(compare_pairs(brt, node->u.n.childkeys[i], node->u.n.childkeys[i+1])<0); + } for (i=0; iu.n.n_children; i++) { if (i>0) { + //printf(" %s:%d i=%d %p v=%s\n", __FILE__, __LINE__, i, node->u.n.childkeys[i-1], (char*)kv_pair_key(node->u.n.childkeys[i-1])); if (lorange) assert(toku_keycompare(lorange,lolen, kv_pair_key(node->u.n.childkeys[i-1]), toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]))<0); if (hirange) assert(toku_keycompare(kv_pair_key(node->u.n.childkeys[i-1]), toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]), hirange, hilen)<=0); } @@ -103,6 +125,16 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b recurse); } } + } else { + // Make sure that they are in increasing order. + void *prev=0; + GPMA_ITERATE(node->u.l.buffer, idx, dlen, data, + ({ + if (prev==0) + prev=data; + else + assert(compare_leafentries(brt, prev, data)<0); + })); } if ((r = toku_cachetable_unpin(brt->cf, off, 0, 0))) return r; return result; diff --git a/newbrt/brt.c b/newbrt/brt.c index 8befac36afc..bc9a86e6b61 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -40,6 +40,13 @@ #include "mempool.h" #include "leafentry.h" +//#define SLOW +#ifdef SLOW +#define VERIFY_NODE(n) toku_verify_counts(n) +#else +#define VERIFY_NODE(n) ((void)0) +#endif + extern long long n_items_malloced; static int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER); @@ -210,7 +217,7 @@ int toku_unpin_brtnode (BRT brt, BRTNODE node) { // node->log_lsn = toku_txn_get_last_lsn(txn); // //if (node->log_lsn.lsn>33320) printf("%s:%d node%lld lsn=%lld\n", __FILE__, __LINE__, node->thisnodename, node->log_lsn.lsn); // } - //toku_verify_counts(node); + VERIFY_NODE(node); return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node)); } @@ -221,16 +228,6 @@ typedef struct kvpair { unsigned int vallen; } *KVPAIR; -#if 0 -int kvpair_compare (const void *av, const void *bv) { - const KVPAIR a = (const KVPAIR)av; - const KVPAIR b = (const KVPAIR)bv; - int r = toku_keycompare(a->key, a->keylen, b->key, b->keylen); - //printf("keycompare(%s,\n %s)-->%d\n", a->key, b->key, r); - return r; -} -#endif - /* Forgot to handle the case where there is something in the freelist. */ static int malloc_diskblock_header_is_in_memory (DISKOFF *res, BRT brt, int size, TOKULOGGER logger) { DISKOFF result = brt->h->unused_memory; @@ -314,7 +311,7 @@ int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logg *result = n; assert(n->nodesize>0); // n->brt = t; - //printf("%s:%d putting %p (%lld) parent=%p\n", __FILE__, __LINE__, n, n->thisnodename, parent_brtnode); + //printf("%s:%d putting %p (%lld)\n", __FILE__, __LINE__, n, n->thisnodename); r=toku_cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n), toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t); assert(r==0); @@ -348,13 +345,13 @@ int move_between_mempools (u_int32_t len, void *odata, void **ndata, void *extra struct move_struct *ms=extra; assert(ms->from->height==0); assert(ms->to->height==0); - assert(len==(unsigned)kv_pair_size(odata)); + assert(len==(unsigned)leafentry_memsize(odata)); void *newitem=mempool_malloc_from_gpma(ms->to->u.l.buffer, &ms->to->u.l.buffer_mempool, len); assert(newitem); memcpy(newitem, odata, len); toku_mempool_mfree(&ms->from->u.l.buffer_mempool, odata, len); *ndata = newitem; - assert(len==(unsigned)kv_pair_size(newitem)); + assert(len==(unsigned)leafentry_memsize(newitem)); return 0; } @@ -395,8 +392,8 @@ static int note_move_items_between (u_int32_t nitems, u_int32_t *froms, u_int32_ u_int32_t diffsize = 0; u_int32_t diff_fp = 0; for (i=0; ifrom->local_fingerprint -= ms->from->rand4fingerprint * diff_fp; ms->to->local_fingerprint += ms->to->rand4fingerprint * diff_fp; @@ -409,16 +406,18 @@ struct delete_struct { BRTNODE node; }; +#if 0 static int brt_leaf_delete_callback (u_int32_t slotnum, u_int32_t len, void *data, void *extra) { struct delete_struct *d = extra; d->node->local_fingerprint -= d->node->rand4fingerprint*toku_calccrc32_kvpair_struct(data); - d->node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + len; + d->node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(data); toku_mempool_mfree(&d->node->u.l.buffer_mempool, data, len); d->node->dirty=1; // Should use slotnum for logging slotnum=slotnum; //???? return 0; } +#endif static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) { BRTNODE B; @@ -438,9 +437,9 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod //toku_verify_gpma(node->u.l.buffer); GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, ({ - struct kv_pair *p=vdata; + char *p=vdata; //printf("%s:%d %d:%p ", __FILE__, __LINE__, idx, p); - assert((char*)node->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); + assert((char*)node->u.l.buffer_mempool.base<= p && p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); })); r = toku_gpma_split(node->u.l.buffer, B->u.l.buffer, PMA_ITEM_OVERHEAD, move_between_mempools, @@ -449,28 +448,28 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod &ms); GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, ({ - struct kv_pair *p=vdata; + char *p=vdata; //printf("%s:%d %d:%p ", __FILE__, __LINE__, idx, p); - assert((char*)node->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); + assert((char*)node->u.l.buffer_mempool.base<= p && p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); })); GPMA_ITERATE(B->u.l.buffer, idx, vlen, vdata, ({ - struct kv_pair *p=vdata; + char *p=vdata; //printf("%s:%d %d:%p\n", __FILE__, __LINE__, idx, p); - assert((char*)B->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)B->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); + assert((char*)B->u.l.buffer_mempool.base<= p && p < (char*)B->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); })); //toku_verify_gpma(node->u.l.buffer); //toku_verify_gpma(B->u.l.buffer); if (splitk) { memset(splitk, 0, sizeof *splitk); - struct kv_pair *kp=ms.last_pair_remaining_in_from.data; + LEAFENTRY le=ms.last_pair_remaining_in_from.data; if (node->flags&TOKU_DB_DUPSORT) { - splitk->size = kv_pair_keylen(kp)+kv_pair_vallen(kp); - splitk->data = kv_pair_malloc(kv_pair_key(kp), kv_pair_keylen(kp), kv_pair_val(kp), kv_pair_vallen(kp)); + splitk->size = le_any_keylen(le)+le_any_vallen(le); + splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), le_any_val(le), le_any_vallen(le)); } else { - splitk->size = kv_pair_keylen(kp); - splitk->data = kv_pair_malloc(kv_pair_key(kp), kv_pair_keylen(kp), 0, 0); + splitk->size = le_any_keylen(le); + splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), 0, 0); } splitk->flags=0; } @@ -486,9 +485,9 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod return 0; } -#define MAX_PATHLEN_TO_ROOT 40 +//#define MAX_PATHLEN_TO_ROOT 40 -static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int childnum, TXNID xid, int type, const char *key, int keylen, const char *data, int datalen, u_int32_t *fingerprint, DISKOFFARRAY path_to_parent) { +static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int childnum, TXNID xid, int type, const char *key, int keylen, const char *data, int datalen, u_int32_t *fingerprint) { BYTESTRING keybs = {.len=keylen, .data=(char*)key}; BYTESTRING databs = {.len=datalen, .data=(char*)data}; u_int32_t old_fingerprint = *fingerprint; @@ -498,19 +497,11 @@ static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int child *fingerprint = new_fingerprint; int r = toku_log_brtenq(logger, (LSN*)0, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, xid, type, keybs, databs, old_fingerprint, new_fingerprint); if (r!=0) return r; - TOKUTXN txn; - if (0==toku_txnid2txn(logger, xid, &txn) && txn) { - DISKOFFARRAY path = path_to_parent; - path.array = toku_memdup(path.array, sizeof(path.array[0])*(1+path.len)); - if (path.array==0) return errno; - r = toku_logger_save_rollback_xactiontouchednonleaf(txn, toku_cachefile_filenum(t->cf), path, node->thisnodename); - if (r!=0) return r; - } return 0; } /* Side effect: sets splitk->data pointer to a malloc'd value */ -static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { +static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, TOKULOGGER logger) { int old_n_children = node->u.n.n_children; int n_children_in_a = old_n_children/2; int n_children_in_b = old_n_children-n_children_in_a; @@ -538,9 +529,6 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node BNC_SUBTREE_FINGERPRINT(B,i)=0; } - assert(path_to_parent.lenthisnodename; // Don't have to restore it since path_to_parent is passed by value, and this one not used again except in this loop. - for (i=n_children_in_a; ithisnodename, n_children_in_a, xid, type, keybs, databs, old_from_fingerprint, new_from_fingerprint); if (r!=0) return r; - r = log_and_save_brtenq(logger, t, B, targchild, xid, type, key, keylen, data, datalen, &B->local_fingerprint, path_to_parent); + r = log_and_save_brtenq(logger, t, B, targchild, xid, type, key, keylen, data, datalen, &B->local_fingerprint); r = toku_fifo_enq(to_htab, key, keylen, data, datalen, type, xid); if (r!=0) return r; toku_fifo_deq(from_htab); @@ -656,14 +644,13 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *split, int debug, - TOKULOGGER, DISKOFFARRAY path_to_parent); + TOKULOGGER); /* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRTNODE node, BRTNODE child, BRT_CMD cmd, int childnum_of_node, - TOKULOGGER logger, - DISKOFFARRAY path_to_parent) { + TOKULOGGER logger) { assert(node->height>0); /* Not a leaf. */ DBT *k = cmd->u.id.key; DBT *v = cmd->u.id.val; @@ -695,13 +682,10 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT DBT againk; toku_init_dbt(&againk); //printf("%s:%d hello!\n", __FILE__, __LINE__); - assert(path_to_parent.lenthisnodename; r = brtnode_put_cmd(t, child, cmd, &again_split, &againa, &againb, &againk, 0, - logger, - path_to_parent); + logger); if (r!=0) return r; assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */ } else { @@ -715,19 +699,15 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum BRT_CMD cmd, int *child_did_split, BRTNODE *childa, BRTNODE *childb, DBT *childsplitk, - TOKULOGGER logger, - DISKOFFARRAY path_to_parent) { + TOKULOGGER logger) { //if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, ""); //printf("%s:%d hello!\n", __FILE__, __LINE__); assert(node->height>0); { - assert(path_to_parent.lenthisnodename; int r = brtnode_put_cmd(t, child, cmd, child_did_split, childa, childb, childsplitk, 0, - logger, - path_to_parent); + logger); if (r!=0) return r; } @@ -765,7 +745,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum return 0; } -static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent); +static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger); static int split_count=0; @@ -781,8 +761,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, DBT *childsplitk, /* the data in the childsplitk is alloc'd and is consumed by this call. */ int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - TOKULOGGER logger, - DISKOFFARRAY path_to_parent) { + TOKULOGGER logger) { assert(node->height>0); assert(0 <= childnum && childnum < node->u.n.n_children); FIFO old_h = BNC_BUFFER(node,childnum); @@ -875,15 +854,20 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, switch (type) { case BRT_INSERT: case BRT_DELETE_BOTH: - case BRT_DELETE: - if (type!=BRT_DELETE || 0==(t->flags&TOKU_DB_DUPSORT)) { + case BRT_DELETE_ANY: + case BRT_ABORT_BOTH: + case BRT_ABORT_ANY: + case BRT_COMMIT_BOTH: + case BRT_COMMIT_ANY: + if ((type!=BRT_DELETE_ANY && type!=BRT_ABORT_ANY && type!=BRT_COMMIT_ANY) || 0==(t->flags&TOKU_DB_DUPSORT)) { // If it's an INSERT or DELETE_BOTH or there are no duplicates then we just put the command into one subtree int cmp = brt_compare_pivot(t, &skd, &svd, childsplitk->data); if (cmp <= 0) pusha = 1; else pushb = 1; } else { - assert(type==BRT_DELETE && t->flags&TOKU_DB_DUPSORT); - // It is a DELETE and it's a DUPSORT database, in which case if the comparison function comes up 0 we must write the command to both children. (See #201) + assert((type==BRT_DELETE_ANY || type==BRT_ABORT_ANY || type==BRT_COMMIT_ANY) && t->flags&TOKU_DB_DUPSORT); + // It is a DELETE or ABORT_ANY and it's a DUPSORT database, + // in which case if the comparison function comes up 0 we must write the command to both children. (See #201) int cmp = brt_compare_pivot(t, &skd, 0, childsplitk->data); if (cmp<=0) pusha=1; if (cmp>=0) pushb=1; // Could be that both pusha and pushb are set @@ -891,7 +875,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (pusha) { // If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order. if (toku_fifo_n_entries(BNC_BUFFER(node,childnum))==0) { - r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger, path_to_parent); + r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger); } else { r=insert_to_buffer_in_nonleaf(node, childnum, &skd, &svd, type, xid); } @@ -899,7 +883,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (pushb) { // If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order. if (toku_fifo_n_entries(BNC_BUFFER(node,childnum+1))==0) { - r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger, path_to_parent); + r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger); } else { r=insert_to_buffer_in_nonleaf(node, childnum+1, &skd, &svd, type, xid); } @@ -926,9 +910,9 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, //verify_local_fingerprint_nonleaf(childb); //verify_local_fingerprint_nonleaf(node); - //toku_verify_counts(node); - //toku_verify_counts(childa); - //toku_verify_counts(childb); + VERIFY_NODE(node); + VERIFY_NODE(childa); + VERIFY_NODE(childb); r=toku_unpin_brtnode(t, childa); assert(r==0); @@ -937,7 +921,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (node->u.n.n_children>TREE_FANOUT) { //printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs); - r=brt_nonleaf_split(t, node, nodea, nodeb, splitk, logger, path_to_parent); + r=brt_nonleaf_split(t, node, nodea, nodeb, splitk, logger); if (r!=0) return r; //printf("%s:%d did split\n", __FILE__, __LINE__); split_count++; @@ -957,7 +941,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (toku_serialize_brtnode_size(node) > node->nodesize) { /* lighten the node by pushing down its buffers. this may cause the current node to split and go away */ - r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger, path_to_parent); + r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger); assert(r == 0); } if (*did_split == 0) assert(toku_serialize_brtnode_size(node)<=node->nodesize); @@ -969,8 +953,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, - TOKULOGGER logger, - DISKOFFARRAY path_to_parent) { + TOKULOGGER logger) { void *childnode_v; BRTNODE child; int r; @@ -983,7 +966,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, //printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v); child=childnode_v; //verify_local_fingerprint_nonleaf(child); - //toku_verify_counts(child); + VERIFY_NODE(child); //printf("%s:%d height=%d n_bytes_in_buffer = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_buffer[0], child->n_bytes_in_buffer[1], child->n_bytes_in_buffer[2]); if (child->height>0 && child->u.n.n_children>0) assert(BNC_DISKOFF(child, child->u.n.n_children-1)!=0); if (debug) printf("%s:%d %*spush_some_brt_cmds_down to %lld\n", __FILE__, __LINE__, debug, "", child->thisnodename); @@ -1018,8 +1001,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, &brtcmd, &child_did_split, &childa, &childb, &childsplitk, - logger, - path_to_parent); + logger); if (0){ unsigned int sum=0; @@ -1037,8 +1019,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, r=handle_split_of_child (t, node, childnum, childa, childb, &childsplitk, did_split, nodea, nodeb, splitk, - logger, - path_to_parent); + logger); //if (*did_split) { // verify_local_fingerprint_nonleaf(*nodea); // verify_local_fingerprint_nonleaf(*nodeb); @@ -1061,7 +1042,7 @@ static int debugp1 (int debug) { return debug ? debug+1 : 0; } -static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) +static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger) /* If the buffer is too full, then push down. Possibly the child will split. That may make us split. */ { assert(node->height>0); @@ -1077,7 +1058,7 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE find_heaviest_child(node, &childnum); if (0) printf("%s:%d %*spush some down from %lld into %lld (child %d)\n", __FILE__, __LINE__, debug, "", node->thisnodename, BNC_DISKOFF(node, childnum), childnum); assert(BNC_DISKOFF(node, childnum)!=0); - int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); + int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger); if (r!=0) return r; assert(*did_split==0 || *did_split==1); if (debug) printf("%s:%d %*sdid push_some_brt_cmds_down did_split=%d\n", __FILE__, __LINE__, debug, "", *did_split); @@ -1107,107 +1088,409 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE return 0; } -int toku_brtleaf_compare_fun (u_int32_t alen __attribute__((__unused__)), void *aval, u_int32_t blen __attribute__((__unused__)), void *bval, void *extra) { - struct lc_pair *p = extra; - BRT t = p->t; - DBT k1,k2; - int cmp = t->compare_fun (t->db, - toku_fill_dbt(&k1, kv_pair_key(aval), kv_pair_keylen(aval)), - toku_fill_dbt(&k2, kv_pair_key(bval), kv_pair_keylen(bval))); - if (cmp == 0 && p->compare_both ) { +int leafval_bessel_le_committed (u_int32_t klen, void *kval, + u_int32_t dlen, void *dval, + struct cmd_leafval_bessel_extra *be) { + BRT t = be->t; + DBT dbt; + int cmp = t->compare_fun(t->db, + toku_fill_dbt(&dbt, kval, klen), + be->cmd->u.id.key); + if (cmp == 0 && be->compare_both_keys && be->cmd->u.id.val->data) { return t->dup_compare(t->db, - toku_fill_dbt(&k1, kv_pair_val(aval), kv_pair_vallen(aval)), - toku_fill_dbt(&k2, kv_pair_val(bval), kv_pair_vallen(bval))); + toku_fill_dbt(&dbt, dval, dlen), + be->cmd->u.id.val); } else { return cmp; } } +int leafval_bessel_le_both (TXNID xid __attribute__((__unused__)), + u_int32_t klen, void *kval, + u_int32_t clen, void *cval, + u_int32_t plen __attribute__((__unused__)), void *pval __attribute__((__unused__)), + struct cmd_leafval_bessel_extra *be) { + return leafval_bessel_le_committed(klen, kval, clen, cval, be); +} + +int leafval_bessel_le_provdel (TXNID xid __attribute__((__unused__)), + u_int32_t klen, void *kval, + u_int32_t clen, void *cval, + struct cmd_leafval_bessel_extra *be) { + return leafval_bessel_le_committed(klen, kval, clen, cval, be); +} + +int leafval_bessel_le_provpair (TXNID xid __attribute__((__unused__)), + u_int32_t klen, void *kval, + u_int32_t plen, void *pval, + struct cmd_leafval_bessel_extra *be) { + return leafval_bessel_le_committed(klen, kval, plen, pval, be); +} + +int toku_cmd_leafval_bessel (u_int32_t dlen __attribute__((__unused__)), void *dval, void *extra) { + struct cmd_leafval_bessel_extra *be = extra; + LEAFENTRY le = dval; + LESWITCHCALL(le, leafval_bessel, be); +} + +// Whenever anything provisional is happening, it's XID must match the cmd's. + +static int apply_cmd_to_le_committed (u_int32_t klen, void *kval, + u_int32_t dlen, void *dval, + BRT_CMD cmd, + u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) { + assert(cmd->u.id.key->size == klen); + assert(memcmp(cmd->u.id.key->data, kval, klen)==0); + switch (cmd->type) { + case BRT_INSERT: + return le_both(cmd->xid, + klen, kval, + dlen, dval, + cmd->u.id.val->size, cmd->u.id.val->data, + newlen, disksize, new_data); + case BRT_DELETE_ANY: + case BRT_DELETE_BOTH: + return le_provdel(cmd->xid, + klen, kval, + dlen, dval, + newlen, disksize, new_data); + case BRT_ABORT_BOTH: + case BRT_ABORT_ANY: + case BRT_COMMIT_BOTH: + case BRT_COMMIT_ANY: + // Just return the original committed record + return le_committed(klen, kval, dlen, dval, + newlen, disksize, new_data); + case BRT_NONE: break; + } + assert(0); + return 0; +} + +static int apply_cmd_to_le_both (TXNID xid, + u_int32_t klen, void *kval, + u_int32_t clen, void *cval, + u_int32_t plen, void *pval, + BRT_CMD cmd, + u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) { + // keep the committed value for rollback. + assert(cmd->xid == xid); // provisional things must match the cmd. (Others should already be committed or aborted) + assert(cmd->u.id.key->size == klen); + assert(memcmp(cmd->u.id.key->data, kval, klen)==0); + switch (cmd->type) { + case BRT_INSERT: + return le_both(cmd->xid, + klen, kval, + clen, cval, + cmd->u.id.val->size, cmd->u.id.val->data, + newlen, disksize, new_data); + case BRT_DELETE_ANY: + case BRT_DELETE_BOTH: + return le_provdel(cmd->xid, + klen, kval, + clen, cval, + newlen, disksize, new_data); + case BRT_ABORT_BOTH: + case BRT_ABORT_ANY: + return le_committed(klen, kval, + clen, cval, + newlen, disksize, new_data); + case BRT_COMMIT_BOTH: + case BRT_COMMIT_ANY: + return le_committed(klen, kval, + plen, pval, + newlen, disksize, new_data); + case BRT_NONE: break; + } + assert(0); + return 0; +} +static int apply_cmd_to_le_provdel (TXNID xid, + u_int32_t klen, void *kval, + u_int32_t clen, void *cval, + BRT_CMD cmd, + u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) { + // keep the committed value for rollback + assert(cmd->xid == xid); // provisional things must match the cmd. (Others should already be committed or aborted) + assert(cmd->u.id.key->size == klen); + assert(memcmp(cmd->u.id.key->data, kval, klen)==0); + switch (cmd->type) { + case BRT_INSERT: + return le_both(cmd->xid, + klen, kval, + clen, cval, + cmd->u.id.val->size, cmd->u.id.val->data, + newlen, disksize, new_data); + case BRT_DELETE_ANY: + case BRT_DELETE_BOTH: + // A delete of a delete could conceivably return the same item, but to simplify things we just reallocate it + // because othewise we have to notice not to free() the olditem. + return le_provdel(cmd->xid, + klen, kval, + clen, cval, + newlen, disksize, new_data); + case BRT_ABORT_BOTH: + case BRT_ABORT_ANY: + return le_committed(klen, kval, + clen, cval, + newlen, disksize, new_data); + case BRT_COMMIT_BOTH: + case BRT_COMMIT_ANY: + *new_data = 0; + return 0; + case BRT_NONE: break; + } + assert(0); + return 0; +} + +static int apply_cmd_to_le_provpair (TXNID xid, + u_int32_t klen, void *kval, + u_int32_t plen , void *pval, + BRT_CMD cmd, + u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) { + assert(cmd->xid == xid); // provisional things must match the cmd. (Others should already be committed or aborted) + + assert(cmd->u.id.key->size == klen); + assert(memcmp(cmd->u.id.key->data, kval, klen)==0); + switch (cmd->type) { + case BRT_INSERT: + // it's still a provpair (the old prov value is lost) + return le_provpair(cmd->xid, + klen, kval, + cmd->u.id.val->size, cmd->u.id.val->data, + newlen, disksize, new_data); + case BRT_DELETE_BOTH: + case BRT_DELETE_ANY: + case BRT_ABORT_BOTH: + case BRT_ABORT_ANY: + // A delete or abort of a provisional pair is nothing. + *new_data = 0; + return 0; + case BRT_COMMIT_ANY: + case BRT_COMMIT_BOTH: + return le_committed(klen, kval, + plen, pval, + newlen, disksize, new_data); + case BRT_NONE: break; + } + assert(0); + return 0; +} + +static int apply_cmd_to_leaf (BRT_CMD cmd, + u_int32_t oldlen, void *stored_data, // NULL if there was no stored data. + u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) { + if (stored_data==0) { + switch (cmd->type) { + case BRT_INSERT: + { + LEAFENTRY le; + int r = le_provpair(cmd->xid, + cmd->u.id.key->size, cmd->u.id.key->data, + cmd->u.id.val->size, cmd->u.id.val->data, + newlen, disksize, &le); + if (r==0) *new_data=le; + return r; + } + case BRT_DELETE_BOTH: + case BRT_DELETE_ANY: + case BRT_ABORT_BOTH: + case BRT_ABORT_ANY: + case BRT_COMMIT_BOTH: + case BRT_COMMIT_ANY: + *new_data = 0; + return 0; // Don't have to insert anything. + case BRT_NONE: + break; + } + assert(0); + return 0; + } else { + assert(oldlen==leafentry_memsize(stored_data)); + LESWITCHCALL(stored_data, apply_cmd_to, cmd, + newlen, disksize, new_data); + } + +} + +int should_compare_both_keys (BRTNODE node, BRT_CMD cmd) { + switch (cmd->type) { + case BRT_INSERT: + return node->flags & TOKU_DB_DUPSORT; + case BRT_DELETE_BOTH: + case BRT_ABORT_BOTH: + case BRT_COMMIT_BOTH: + return 1; + case BRT_DELETE_ANY: + case BRT_ABORT_ANY: + case BRT_COMMIT_ANY: + return 0; + case BRT_NONE: + break; + } + assert(0); + return 0; +} + +static int brt_leaf_apply_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, + u_int32_t idx, u_int32_t storedlen, LEAFENTRY le) { + FILENUM filenum = toku_cachefile_filenum(t->cf); + u_int32_t newlen, newdisksize; + LEAFENTRY newdata; + int r = apply_cmd_to_leaf(cmd, storedlen, le, &newlen, &newdisksize, &newdata); + if (r!=0) return r; + if (newdata) assert(newdisksize == leafentry_disksize(newdata)); + + if (le) { + // It's there, note that it's gone and remove it from the mempool + node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(le); + node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(le); + + r = toku_log_deleteleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx, le); + if (r!=0) return r; + + BRT_CMD_S cmd2 = *cmd; + DBT val_from_lekey; + cmd2.u.id.val = toku_fill_dbt(&val_from_lekey, le_latest_val(le), le_latest_vallen(le)); + struct cmd_leafval_bessel_extra be = {t, &cmd2, 1}; // always compare both in this mode, if the value is there + struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node}; + toku_gpma_delete_bessel(node->u.l.buffer, + toku_cmd_leafval_bessel, &be, + 0, 0, + note_move_items_within, &ms); + toku_mempool_mfree(&node->u.l.buffer_mempool, 0, storedlen); // Must pass 0, since le may be no good any more. + } + if (newdata) { + struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node}; + struct cmd_leafval_bessel_extra be = {t, cmd, node->flags & TOKU_DB_DUPSORT}; + LEAFENTRY new_le = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, newlen); + memcpy(new_le, newdata, newlen); + r = toku_gpma_insert_bessel(node->u.l.buffer, newlen, new_le, toku_cmd_leafval_bessel, &be, note_move_items_within, &ms, &idx); + if (r!=0) return r; + + r = toku_log_insertleafentry(logger, &node->log_lsn, 0, toku_cachefile_filenum(t->cf), node->thisnodename, idx, newdata); + if (r!=0) return r; + + assert(newdisksize == leafentry_disksize(newdata)); + node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + newdisksize; + node->local_fingerprint += node->rand4fingerprint*toku_le_crc(newdata); + toku_free(newdata); + } +// printf("%s:%d rand4=%08x local_fingerprint=%08x this=%08x\n", __FILE__, __LINE__, node->rand4fingerprint, node->local_fingerprint, toku_calccrc32_kvpair_struct(kv)); + return 0; +} + static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger) { // toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint); + VERIFY_NODE(node); assert(node->height==0); FILENUM filenum = toku_cachefile_filenum(t->cf); - if (cmd->type == BRT_INSERT) { - DBT *k = cmd->u.id.key; - DBT *v = cmd->u.id.val; - struct kv_pair *kv = brtnode_malloc_kv_pair(node->u.l.buffer, &node->u.l.buffer_mempool, k->data, k->size, v->data, v->size); - assert(kv); - u_int32_t storedlen; - void *storeddata; - u_int32_t idx; - struct lc_pair lc = {t, node->flags & TOKU_DB_DUPSORT}; // for put operations we compare both keys if they are both there - int r = toku_gpma_lookup_item(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, &storedlen, &storeddata, &idx); - - if (r==0) { - // It's already there. Note that it's gone and remove it from the mempool. - node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + storedlen; - node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair_struct(storeddata); - BYTESTRING okbs = { kv_pair_keylen(storeddata), kv_pair_key(storeddata) }; - BYTESTRING odbs = { kv_pair_vallen(storeddata), kv_pair_val(storeddata) }; - r = toku_log_deleteinleaf(logger, &node->log_lsn, 0, cmd->xid, filenum, node->thisnodename, idx, okbs, odbs); - toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, storedlen); - // Now put the new kv in. - toku_gpma_set_at_index(node->u.l.buffer, idx, kv_pair_size(kv), kv); - } else { - // Insert it. - struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node}; - r = toku_gpma_insert(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, note_move_items_within, &ms, &idx); - if (r!=0) return r; - } - { - BYTESTRING kbs = { kv_pair_keylen(kv), kv_pair_key(kv) }; - BYTESTRING dbs = { kv_pair_vallen(kv), kv_pair_val(kv) }; - r = toku_log_insertinleaf(logger, &node->log_lsn, 0, cmd->xid, filenum, node->thisnodename, idx, kbs, dbs); - if (r!=0) return r; - } - node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + kv_pair_size(kv); - node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_kvpair_struct(kv); -// printf("%s:%d rand4=%08x local_fingerprint=%08x this=%08x\n", __FILE__, __LINE__, node->rand4fingerprint, node->local_fingerprint, toku_calccrc32_kvpair_struct(kv)); + u_int32_t storedlen; + void *storeddata; + u_int32_t idx; + int r; + int compare_both = should_compare_both_keys(node, cmd); + struct cmd_leafval_bessel_extra be = {t, cmd, compare_both}; - node->dirty = 1; + switch (cmd->type) { + case BRT_INSERT: + r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be, + &storedlen, &storeddata, &idx); + if (r==DB_NOTFOUND) { + storeddata = 0; + } else if (r!=0) + return r; + + r = brt_leaf_apply_cmd_once(t, node, cmd, logger, idx, storedlen, storeddata); + if (r!=0) return r; + break; + case BRT_DELETE_BOTH: + case BRT_ABORT_BOTH: + case BRT_COMMIT_BOTH: + + // Delete the one item + r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be, + &storedlen, &storeddata, &idx); + if (r == DB_NOTFOUND) break; + if (r != 0) return r; + + VERIFY_NODE(node); + + static int count=0; + count++; + r = brt_leaf_apply_cmd_once(t, node, cmd, logger, idx, storedlen, storeddata); + if (r!=0) return r; + + VERIFY_NODE(node); + break; + + case BRT_DELETE_ANY: + case BRT_ABORT_ANY: + case BRT_COMMIT_ANY: + // Delete all the matches + + r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be, + &storedlen, &storeddata, &idx); + if (r == DB_NOTFOUND) break; + if (r != 0) return r; + + while (1) { + int vallen = le_any_vallen(storeddata); + void *save_val = toku_memdup(le_any_val(storeddata), storedlen); + + r = brt_leaf_apply_cmd_once(t, node, cmd, logger, idx, storedlen, storeddata); + if (r!=0) return r; + + // Now we must find the next one. + DBT valdbt; + BRT_CMD_S ncmd = { cmd->type, cmd->xid, .u.id={cmd->u.id.key, toku_fill_dbt(&valdbt, save_val, vallen)}}; + struct cmd_leafval_bessel_extra nbe = {t, &ncmd, 1}; + r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, +1, &nbe, + &storedlen, &storeddata, &idx); + + toku_free(save_val); + if (r!=0) break; + { // Continue only if the next record that we found has the same key. + DBT adbt; + if (t->compare_fun(t->db, + toku_fill_dbt(&adbt, le_any_key(storeddata), le_any_keylen(storeddata)), + cmd->u.id.key) != 0) + break; + } + } + + break; + + case BRT_NONE: return EINVAL; + } + /// All done doing the work + + node->dirty = 1; // toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint); - // If it doesn't fit, then split the leaf. - if (toku_serialize_brtnode_size(node) > node->nodesize) { - r = brtleaf_split (logger, filenum, t, node, nodea, nodeb, splitk); - if (r!=0) return r; - //printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey); - split_count++; - *did_split = 1; - if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize); - assert(toku_serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); - assert(toku_serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); -// toku_pma_verify_fingerprint((*nodea)->u.l.buffer, (*nodea)->rand4fingerprint, (*nodea)->subtree_fingerprint); -// toku_pma_verify_fingerprint((*nodeb)->u.l.buffer, (*nodeb)->rand4fingerprint, (*nodeb)->subtree_fingerprint); - } else { - *did_split = 0; - } - return 0; - - } else if (cmd->type == BRT_DELETE || cmd->type == BRT_DELETE_BOTH) { - DBT *k = cmd->u.id.key; - DBT *v = cmd->u.id.val; - struct kv_pair *kv = kv_pair_malloc(k->data, k->size, v->data, v->size); - struct lc_pair lc = {t, (cmd->type == BRT_DELETE_BOTH) }; - struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node}; - struct delete_struct dp = {node}; - int r = toku_gpma_delete_item(node->u.l.buffer, - kv_pair_size(kv), kv, - toku_brtleaf_compare_fun, &lc, - brt_leaf_delete_callback, &dp, - note_move_items_within, &ms); - toku_free(kv); - *did_split = 0; - if (r==DB_NOTFOUND) return 0; - return r; + VERIFY_NODE(node); + // If it doesn't fit, then split the leaf. + if (toku_serialize_brtnode_size(node) > node->nodesize) { + r = brtleaf_split (logger, filenum, t, node, nodea, nodeb, splitk); + if (r!=0) return r; + //printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey); + split_count++; + *did_split = 1; + if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize); + assert(toku_serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); + assert(toku_serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); + VERIFY_NODE(*nodea); + VERIFY_NODE(*nodeb); } else { - return EINVAL; + *did_split = 0; } + return 0; } /* find the leftmost child that may contain the key */ @@ -1226,8 +1509,7 @@ unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) { /* put a cmd into a nodes child */ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, TOKULOGGER logger, int childnum, int maybe, - DISKOFFARRAY path_to_parent) { + int debug, TOKULOGGER logger, int childnum, int maybe) { int r; void *child_v; BRTNODE child; @@ -1248,12 +1530,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, child = child_v; child_did_split = 0; - assert(path_to_parent.lenthisnodename; r = brtnode_put_cmd(t, child, cmd, - &child_did_split, &childa, &childb, &childsplitk, debug, logger, - path_to_parent); - path_to_parent.len--; + &child_did_split, &childa, &childb, &childsplitk, debug, logger); if (r != 0) { /* putting to the child failed for some reason, so unpin the child and return the error code */ int rr = toku_unpin_brtnode(t, child); @@ -1266,8 +1544,7 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, r = handle_split_of_child(t, node, childnum, childa, childb, &childsplitk, did_split, nodea, nodeb, splitk, - logger, - path_to_parent); + logger); assert(r == 0); } else { //verify_local_fingerprint_nonleaf(child); @@ -1283,13 +1560,12 @@ int toku_brt_do_push_cmd = 1; /* put a cmd into a node at childnum */ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down, - DISKOFFARRAY path_to_parent) { + int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down) { //verify_local_fingerprint_nonleaf(node); /* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */ if (BNC_NBYTESINBUF(node, childnum) == 0 && can_push && toku_brt_do_push_cmd) { - int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, path_to_parent); + int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1); if (r == 0) return r; } @@ -1301,7 +1577,7 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, DBT *k = cmd->u.id.key; DBT *v = cmd->u.id.val; - int r = log_and_save_brtenq(logger, t, node, childnum, cmd->xid, type, k->data, k->size, v->data, v->size, &node->local_fingerprint, path_to_parent); + int r = log_and_save_brtenq(logger, t, node, childnum, cmd->xid, type, k->data, k->size, v->data, v->size, &node->local_fingerprint); if (r!=0) return r; int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD; r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xid); @@ -1314,9 +1590,9 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, return 0; } -static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, - int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { +static int brt_nonleaf_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, + int debug, TOKULOGGER logger) { //verify_local_fingerprint_nonleaf(node); unsigned int childnum; int r; @@ -1326,14 +1602,14 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, /* put the cmd in the subtree */ int do_push_down = 0; - r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down, path_to_parent); + r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down); if (r != 0) return r; /* maybe push down */ if (do_push_down) { if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, ""); //verify_local_fingerprint_nonleaf(node); - r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); + r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger); if (r!=0) return r; if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (*did_split) { @@ -1357,18 +1633,17 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, } /* delete in all subtrees starting from the left most one which contains the key */ -static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, - int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, - TOKULOGGER logger, - DISKOFFARRAY path_to_parent) { +static int brt_nonleaf_cmd_many (BRT t, BRTNODE node, BRT_CMD cmd, + int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, + int debug, + TOKULOGGER logger) { int r; - /* find all children that need a delete cmd */ - int delchild[TREE_FANOUT], delidx = 0; - inline void delchild_append(int i) { - if (delidx == 0 || delchild[delidx-1] != i) - delchild[delidx++] = i; + /* find all children that need a copy of the command */ + int sendchild[TREE_FANOUT], delidx = 0; + inline void sendchild_append(int i) { + if (delidx == 0 || sendchild[delidx-1] != i) + sendchild[delidx++] = i; } int i; for (i = 0; i < node->u.n.n_children-1; i++) { @@ -1376,24 +1651,24 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, if (cmp > 0) { continue; } else if (cmp < 0) { - delchild_append(i); + sendchild_append(i); break; } else if (t->flags & TOKU_DB_DUPSORT) { - delchild_append(i); - delchild_append(i+1); + sendchild_append(i); + sendchild_append(i+1); } else { - delchild_append(i); + sendchild_append(i); break; } } if (delidx == 0) - delchild_append(node->u.n.n_children-1); + sendchild_append(node->u.n.n_children-1); - /* issue the delete cmd to all of the children found previously */ + /* issue the to all of the children found previously */ int do_push_down = 0; for (i=0; itype == BRT_INSERT || cmd->type == BRT_DELETE_BOTH) { - return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent); - } else if (cmd->type == BRT_DELETE) { - return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent); - } else - return EINVAL; + TOKULOGGER logger) { + switch (cmd->type) { + case BRT_INSERT: + case BRT_DELETE_BOTH: + case BRT_ABORT_BOTH: + case BRT_COMMIT_BOTH: + do_once: + return brt_nonleaf_cmd_once(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger); + case BRT_DELETE_ANY: + case BRT_ABORT_ANY: + case BRT_COMMIT_ANY: + if (0 == (node->flags & TOKU_DB_DUPSORT)) goto do_once; // nondupsort delete_any is just do once. + return brt_nonleaf_cmd_many(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger); + case BRT_NONE: + break; + } + return EINVAL; } @@ -1453,8 +1737,7 @@ static void verify_local_fingerprint_nonleaf (BRTNODE node) { static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, - TOKULOGGER logger, - DISKOFFARRAY path_to_parent) { + TOKULOGGER logger) { //static int counter=0; // FOO //static int oldcounter=0; //int tmpcounter; @@ -1469,7 +1752,7 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, } else { r = brt_nonleaf_put_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, - debug, logger, path_to_parent); + debug, logger); } //oldcounter=tmpcounter; // Watch out. If did_split then the original node is no longer allocated. @@ -1693,6 +1976,7 @@ int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char if ((r=toku_log_fheader(toku_txn_logger(txn), (LSN*)0, 0, toku_txn_get_txnid(txn), toku_cachefile_filenum(t->cf), lh))) { goto died6; } } if ((r=setup_initial_brt_root_node(t, t->nodesize, toku_txn_logger(txn)))!=0) { died6: if (dbname) goto died5; else goto died2; } + //printf("%s:%d putting %p (%d)\n", __FILE__, __LINE__, t->h, 0); if ((r=toku_cachetable_put(t->cf, 0, t->h, 0, toku_brtheader_flush_callback, toku_brtheader_fetch_callback, 0))) { goto died6; } } else if (r!=0) { @@ -1914,14 +2198,14 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, if (r!=0) return r; r = toku_unpin_brtnode(brt, nodeb); if (r!=0) return r; - //printf("%s:%d put %lld\n", __FILE__, __LINE__, brt->root); + //printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff); toku_cachetable_put(brt->cf, newroot_diskoff, newroot, brtnode_size(newroot), toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt); *newrootp = newroot; return 0; } -static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { +int toku_brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) { void *node_v; BRTNODE node; CACHEKEY *rootp; @@ -1948,8 +2232,7 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRA result = brtnode_put_cmd(brt, node, cmd, &did_split, &nodea, &nodeb, &splitk, debug, - logger, - path_to_parent); + logger); if (debug) printf("%s:%d did_insert\n", __FILE__, __LINE__); if (did_split) { // node is unpinned, so now we have to proceed to update the root with a new node. @@ -1974,10 +2257,15 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRA int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) { int r; + if (txn) { + BYTESTRING keybs = {key->size, toku_memdup(key->data, key->size)}; + BYTESTRING databs = {val->size, toku_memdup(val->data, val->size)}; + r = toku_logger_save_rollback_cmdinsert(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs, databs); + if (r!=0) return r; + } BRT_CMD_S brtcmd = { BRT_INSERT, toku_txn_get_txnid(txn), .u.id={key,val}}; - DISKOFF path[MAX_PATHLEN_TO_ROOT]; - DISKOFFARRAY path_to_parent = {0, path}; - r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent); + r = toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); + if (r!=0) return r; return r; } @@ -1998,20 +2286,27 @@ int toku_brt_lookup (BRT brt, DBT *k, DBT *v) { int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) { int r; + if (txn) { + BYTESTRING keybs = {key->size, toku_memdup(key->data, key->size)}; + r = toku_logger_save_rollback_cmddelete(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs); + if (r!=0) return r; + } DBT val; - BRT_CMD_S brtcmd = { BRT_DELETE, toku_txn_get_txnid(txn), .u.id={key, toku_init_dbt(&val)}}; - DISKOFF path[MAX_PATHLEN_TO_ROOT]; - DISKOFFARRAY path_to_parent = {0, path}; - r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent); + BRT_CMD_S brtcmd = { BRT_DELETE_ANY, toku_txn_get_txnid(txn), .u.id={key, toku_init_dbt(&val)}}; + r = toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); return r; } int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) { int r; + if (txn) { + BYTESTRING keybs = {key->size, toku_memdup(key->data, key->size)}; + BYTESTRING databs = {val->size, toku_memdup(val->data, val->size)}; + r = toku_logger_save_rollback_cmddeleteboth(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs, databs); + if (r!=0) return r; + } BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, toku_txn_get_txnid(txn), .u.id={key,val}}; - DISKOFF path[MAX_PATHLEN_TO_ROOT]; - DISKOFFARRAY path_to_parent = {0, path}; - r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent); + r = toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); return r; } @@ -2149,15 +2444,15 @@ static inline void brt_split_init(BRT_SPLIT *split) { toku_init_dbt(&split->splitk); } -static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent); +static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger); /* search in a node's child */ -static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { +static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) { int r, rr; /* if the child's buffer is not empty then try to empty it */ if (BNC_NBYTESINBUF(node, childnum) > 0) { - rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger, path_to_parent); + rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger); assert(rr == 0); /* push down may cause a child split, so childnum may not be appropriate, and the node itself may split, so retry */ return EAGAIN; @@ -2167,16 +2462,13 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s rr = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node,childnum), &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt); assert(rr == 0); - assert(path_to_parent.lenthisnodename; - for (;;) { BRTNODE childnode = node_v; BRT_SPLIT childsplit; brt_split_init(&childsplit); - r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger, path_to_parent); + r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger); if (childsplit.did_split) { rr = handle_split_of_child(brt, node, childnum, childsplit.nodea, childsplit.nodeb, &childsplit.splitk, - &split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger, path_to_parent); + &split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger); assert(rr == 0); break; } else { @@ -2191,7 +2483,7 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s return r; } -static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { +static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) { int r = DB_NOTFOUND; int c; @@ -2209,7 +2501,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, if (search->compare(search, toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot)), brt->flags & TOKU_DB_DUPSORT ? toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot)): 0)) { - r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent); + r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger); if (r == 0 || r == EAGAIN) break; } @@ -2217,27 +2509,18 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, /* check the first (left) or last (right) node if nothing has been found */ if (r == DB_NOTFOUND && c == node->u.n.n_children-1) - r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent); + r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger); return r; } -struct bessel_from_search_struct { - brt_search_t *search; -}; - -static int bessel_from_search_t (u_int32_t len __attribute__((__unused__)), void *data, void *extra) { - struct bessel_from_search_struct *bs = extra; - brt_search_t *search=bs->search; +int pair_leafval_bessel_le_committed (u_int32_t klen, void *kval, + u_int32_t dlen, void *dval, + brt_search_t *search) { DBT x,y; - struct kv_pair *kv = data; int cmp = search->compare(search, - search->k ? toku_fill_dbt(&x, kv_pair_key(kv), kv_pair_keylen(kv)) : 0, - search->v ? toku_fill_dbt(&y, kv_pair_val(kv), kv_pair_vallen(kv)) : 0); - // For a left-to-right search, the search compare function returns 0 for all pairs < kv. We want the first value that is 1. - // To convert it to a bessel, we have to convert the 0 to a -1. - // For a right-to-left search, the search compare function returns 0 for all pairs > kv, and 1 for lesser values. We want the last value that is 1. - // To convert it to a bessel, we have to convert 0 to +1, and 1 to -1. + search->k ? toku_fill_dbt(&x, kval, klen) : 0, + search->v ? toku_fill_dbt(&y, dval, dlen) : 0); switch (search->direction) { case BRT_SEARCH_LEFT: return cmp==0 ? -1 : +1; case BRT_SEARCH_RIGHT: return cmp==0 ? +1 : -1; // Because the comparison runs backwards for right searches. @@ -2246,9 +2529,37 @@ static int bessel_from_search_t (u_int32_t len __attribute__((__unused__)), void return 0; } + +int pair_leafval_bessel_le_both (TXNID xid __attribute__((__unused__)), + u_int32_t klen, void *kval, + u_int32_t clen, void *cval, + u_int32_t plen __attribute__((__unused__)), void *pval __attribute__((__unused__)), + brt_search_t *search) { + return pair_leafval_bessel_le_committed(klen, kval, clen, cval, search); +} + +int pair_leafval_bessel_le_provdel (TXNID xid __attribute__((__unused__)), + u_int32_t klen, void *kval, + u_int32_t clen, void *cval, + brt_search_t *be) { + return pair_leafval_bessel_le_committed(klen, kval, clen, cval, be); +} + +int pair_leafval_bessel_le_provpair (TXNID xid __attribute__((__unused__)), + u_int32_t klen, void *kval, + u_int32_t plen, void *pval, + brt_search_t *be) { + return pair_leafval_bessel_le_committed(klen, kval, plen, pval, be); +} + + +static int bessel_from_search_t (u_int32_t dlen __attribute__((__unused__)), void *leafval, void *extra) { + brt_search_t *search = extra; + LESWITCHCALL(leafval, pair_leafval_bessel, search); +} + static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval) { // Now we have to convert from brt_search_t to the bessel function with a direction. What a pain... - struct bessel_from_search_struct bs = {search}; int direction; switch (search->direction) { case BRT_SEARCH_LEFT: direction = +1; goto ok; @@ -2262,25 +2573,29 @@ static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT int r = toku_gpma_lookup_bessel(node->u.l.buffer, bessel_from_search_t, direction, - &bs, + search, &len, &data, &idx); if (r!=0) return r; - struct kv_pair *kv = data; + LEAFENTRY le = data; + if (le_is_provdel(le)) { + // Provisionally deleted stuff is gone. + return DB_NOTFOUND; + } if (newkey) { - r = toku_dbt_set_value(newkey, kv_pair_key(kv), kv_pair_keylen(kv), &brt->skey); + r = toku_dbt_set_value(newkey, le_latest_key(le), le_latest_keylen(le), &brt->skey); if (r!=0) return r; } if (newval) { - r = toku_dbt_set_value(newval, kv_pair_val(kv), kv_pair_vallen(kv), &brt->sval); + r = toku_dbt_set_value(newval, le_latest_val(le), le_latest_vallen(le), &brt->sval); if (r!=0) return r; } return 0; } -static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { +static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) { if (node->height > 0) - return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger, path_to_parent); + return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger); else return brt_search_leaf_node(brt, node, search, newkey, newval); } @@ -2302,9 +2617,7 @@ int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval, TOK for (;;) { BRT_SPLIT split; brt_split_init(&split); - DISKOFF path[MAX_PATHLEN_TO_ROOT]; - DISKOFFARRAY path_to_parent = {0, path}; - r = brt_search_node(brt, node, search, newkey, newval, &split, logger, path_to_parent); + r = brt_search_node(brt, node, search, newkey, newval, &split, logger); if (split.did_split) { rr = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node); @@ -2729,3 +3042,39 @@ int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid) { int r2 = toku_cachetable_unpin(brt->cf, diskoff, 1, toku_serialize_brtnode_size(node)); return r ? r : r2; } + +int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp); +void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size); + +int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp) { + if (toku_mempool_get_frag_size(memp) == 0) + return -1; + void *newmem = toku_malloc(memp->size); + if (newmem == 0) + return -2; + struct mempool new_kvspace; + toku_mempool_init(&new_kvspace, newmem, memp->size); + GPMA_ITERATE(pma, idx, len, data, + ({ + void *newdata = toku_mempool_malloc(&new_kvspace, (size_t)len, 4); + assert(newdata); + memcpy(newdata, data, (size_t)len); + toku_gpma_set_at_index(pma, idx, len, newdata); + // toku_verify_gpma(pma); + })); + toku_free(memp->base); + *memp = new_kvspace; + // toku_verify_gpma(pma); + return 0; +} + +void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size) { + void *v = toku_mempool_malloc(mp, size, 4); + if (v==0) { + if (0 == toku_gpma_compress_kvspace(pma, mp)) { + v = toku_mempool_malloc(mp, size, 4); + assert(v); + } + } + return v; +} diff --git a/newbrt/brtdump.c b/newbrt/brtdump.c index 68c53c11a33..7198b43af8e 100644 --- a/newbrt/brtdump.c +++ b/newbrt/brtdump.c @@ -86,8 +86,12 @@ void dump_node (int f, DISKOFF off, struct brt_header *h) { switch ((enum brt_cmd_type)typ) { case BRT_NONE: printf("NONE"); goto ok; case BRT_INSERT: printf("INSERT"); goto ok; - case BRT_DELETE: printf("DELETE"); goto ok; + case BRT_DELETE_ANY: printf("DELETE_ANY"); goto ok; case BRT_DELETE_BOTH: printf("DELETE_BOTH"); goto ok; + case BRT_ABORT_ANY: printf("ABORT_ANY"); goto ok; + case BRT_ABORT_BOTH: printf("ABORT_BOTH"); goto ok; + case BRT_COMMIT_ANY: printf("COMMIT_ANY"); goto ok; + case BRT_COMMIT_BOTH: printf("COMMIT_BOTH"); goto ok; } printf("HUH?"); ok: @@ -105,13 +109,10 @@ void dump_node (int f, DISKOFF off, struct brt_header *h) { printf(" n_bytes_in_buffer=%d\n", n->u.l.n_bytes_in_buffer); printf(" items_in_buffer =%d\n", toku_gpma_n_entries(n->u.l.buffer)); GPMA_ITERATE(n->u.l.buffer, idx, len, data, - ({ - printf("%d: ", idx); - print_item(kv_pair_key(data), kv_pair_keylen(data)); - printf(" "); - print_item(kv_pair_val(data), kv_pair_vallen(data)); - printf("\n"); - })); + ({ + print_leafentry(stdout, data); + printf("\n"); + })); } } diff --git a/newbrt/brttypes.h b/newbrt/brttypes.h index 70c51c4c229..a6e5f21a51c 100644 --- a/newbrt/brttypes.h +++ b/newbrt/brttypes.h @@ -27,11 +27,6 @@ typedef struct { char *data; } BYTESTRING; -typedef struct { - int len; - DISKOFF *array; -} DISKOFFARRAY; - /* Make the LSN be a struct instead of an integer so that we get better type checking. */ typedef struct __toku_lsn { u_int64_t lsn; } LSN; #define ZERO_LSN ((LSN){0}) @@ -79,8 +74,12 @@ typedef struct cachefile *CACHEFILE; enum brt_cmd_type { BRT_NONE = 0, BRT_INSERT = 1, - BRT_DELETE = 2, + BRT_DELETE_ANY = 2, // Delete any matching key. This used to be called BRT_DELETE. BRT_DELETE_BOTH = 3, + BRT_ABORT_ANY = 4, // Abort any commands on any matching key. + BRT_ABORT_BOTH = 5, // Abort commands that match both the key and the value + BRT_COMMIT_ANY = 6, + BRT_COMMIT_BOTH = 7 }; /* tree commands */ diff --git a/newbrt/cachetable.c b/newbrt/cachetable.c index 978d283b85c..50dd0740ac1 100644 --- a/newbrt/cachetable.c +++ b/newbrt/cachetable.c @@ -62,7 +62,12 @@ struct fileid { struct cachefile { CACHEFILE next; - int refcount; /* CACHEFILEs are shared. Use a refcount to decide when to really close it. */ + u_int64_t refcount; /* CACHEFILEs are shared. Use a refcount to decide when to really close it. + * The reference count is one for every open DB. + * Plus one for every commit/rollback record. (It would be harder to keep a count for every open transaction, + * because then we'd have to figure out if the transaction was already counted. If we simply use a count for + * every record in the transaction, we'll be ok. Hence we use a 64-bit counter to make sure we don't run out. + */ int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */ CACHETABLE cachetable; struct fileid fileid; diff --git a/newbrt/fingerprint.c b/newbrt/fingerprint.c index 28a36d667da..eac13af25d3 100644 --- a/newbrt/fingerprint.c +++ b/newbrt/fingerprint.c @@ -17,7 +17,8 @@ static inline u_int32_t toku_calc_more_crc32_kvpair (u_int32_t crc, const void * return crc; } -u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen) { +#if 0 + u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen) { return toku_calc_more_crc32_kvpair(toku_null_crc, key, keylen, val, vallen); } @@ -25,6 +26,7 @@ u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp) { return toku_calccrc32_kvpair(kv_pair_key_const(kvp), kv_pair_keylen(kvp), kv_pair_val_const(kvp), kv_pair_vallen(kvp)); } +#endif u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen) { unsigned char type_c = type; @@ -38,10 +40,18 @@ u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int3 } u_int32_t toku_calccrc32_cmdstruct (BRT_CMD cmd) { - if (cmd->type <= BRT_DELETE_BOTH) + switch (cmd->type) { + case BRT_INSERT: + case BRT_DELETE_ANY: + case BRT_DELETE_BOTH: + case BRT_COMMIT_ANY: + case BRT_COMMIT_BOTH: + case BRT_ABORT_ANY: + case BRT_ABORT_BOTH: return toku_calccrc32_cmd (cmd->type, cmd->xid, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size); - else { - assert(0); /* Should not have come here. */ + case BRT_NONE: return 0; } + assert(0); /* Should not have come here. */ + return 0; } diff --git a/newbrt/gpma.c b/newbrt/gpma.c index a1fcd9551e6..75b938a2740 100644 --- a/newbrt/gpma.c +++ b/newbrt/gpma.c @@ -86,11 +86,14 @@ u_int32_t toku_gpma_index_limit(GPMA pma) { } // If direction==0 then find any match for which the bessel gives 0. *found is set to 1 iff something with 0. The return value is the place where the zero is (if found), or the place where it would go (if there's a value there, then that value goes after the zero.) +// If more than one value returns 0, return the left most such value. // If direction>0 then find the first match for which bessel gives >0. *found is set to 1 iff something with >0. The return value is the index of the leftmost such value (if found). In the not-found case, all items are <=0 and the return value is pma->N. // If direction<0 then find the last match for which bessel gives <0. *found is set to 1 iff something with <0. The return value is the index of the rightmost such value (if found). In the not-found case, all items are >=0 and the return value is 0. u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int direction, void *extra, int *found) { if (direction==0) { int lo=0, hi=pma->N; + int foundone = 0; + u_int32_t foundidx = 0; while (loitems[look].len, pma->items[look].data, extra); if (cmp==0) { /* We found a match. */ - *found=1; - return look; + foundone = 1; + foundidx=look; + /* But keep looking to the left. */ + hi=mi; } else if (cmp>0) { hi=mi; } else { @@ -111,8 +116,9 @@ u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int directi } } } - *found = 0; - return lo; + *found = foundone; + if (foundone) return foundidx; + else return lo; } else if (direction<0) { // Find the rightmost negative value. @@ -371,15 +377,12 @@ int toku_make_space_at (GPMA pma, u_int32_t idx, u_int32_t *newidx, gpma_renumbe return toku_gpma_smooth_region (pma, lo, hi, count, idx, newidx, rcall, extra, pma->N); } -int toku_gpma_insert(GPMA pma, - u_int32_t len, void*data, - gpma_compare_fun_t compare, void *extra_for_compare, - gpma_renumber_callback_t rcall, void*extra_for_rcall, // if anything gets renumbered, let the caller know - u_int32_t *idxp - ) { - int found; - u_int32_t idx = toku_gpma_find_index(pma, len, data, compare, extra_for_compare, &found); - if (found) return DB_KEYEXIST; +static int finish_insert (GPMA pma, + u_int32_t len, void*data, + gpma_renumber_callback_t rcall, void*extra_for_rcall, // if anything gets renumbered, let the caller know + u_int32_t idx, + u_int32_t *idxp // store idx into *idxp (but only do it when we succeed.) + ) { assert(idx<=toku_gpma_index_limit(pma)); if (idx==toku_gpma_index_limit(pma) || pma->items[idx].data) { u_int32_t newidx; @@ -395,6 +398,32 @@ int toku_gpma_insert(GPMA pma, return 0; } + +int toku_gpma_insert(GPMA pma, + u_int32_t len, void*data, + gpma_compare_fun_t compare, void *extra_for_compare, + gpma_renumber_callback_t rcall, void*extra_for_rcall, // if anything gets renumbered, let the caller know + u_int32_t *idxp + ) { + int found; + u_int32_t idx = toku_gpma_find_index(pma, len, data, compare, extra_for_compare, &found); + if (found) return DB_KEYEXIST; + return finish_insert(pma, len, data, rcall, extra_for_rcall, idx, idxp); +} + +int toku_gpma_insert_bessel (GPMA pma, + u_int32_t len, void *data, + gpma_besselfun_t besf, void *extra_for_besself, + gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know + u_int32_t *indexp // Where did the item get stored? + ) { + int found; + u_int32_t idx = toku_gpma_find_index_bes(pma, besf, 0, extra_for_besself, &found); + if (found) return DB_KEYEXIST; + return finish_insert(pma, len, data, renumberf, extra_for_renumberf, idx, indexp); +} + + inline int toku_max_int (int a, int b) { return aitems[i].data) { - r = deletef(i, pma->items[i].len, pma->items[i].data, extra_for_deletef); - pma->items[i].data = 0; - if (r!=0) return r; + if (deletef) { + r = deletef(i, pma->items[i].len, pma->items[i].data, extra_for_deletef); + pma->items[i].data = 0; + if (r!=0) return r; + } else { + pma->items[i].data = 0; + } } } // Now we must find a region that is sufficiently densely packed and spread things out. @@ -566,10 +599,10 @@ int toku_gpma_lookup_item (GPMA pma, int toku_gpma_lookup_bessel(GPMA pma, gpma_besselfun_t besf, int direction, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idxp) { int found; u_int32_t idx = toku_gpma_find_index_bes(pma, besf, direction, extra, &found); + if (idxp) *idxp=idx; if (found) { *resultlen =pma->items[idx].len; *resultdata=pma->items[idx].data; - if (idxp) *idxp=idx; return 0; } else { return DB_NOTFOUND; @@ -699,7 +732,7 @@ void toku_gpma_set_at_index (GPMA pma, u_int32_t idx, u_int32_t len, void *data) void toku_gpma_clear_at_index (GPMA pma, u_int32_t idx) { assert(idxN); - if (pma->items[idx].data==0) { + if (pma->items[idx].data) { pma->n_items_present--; } pma->items[idx].data = 0; diff --git a/newbrt/gpma.h b/newbrt/gpma.h index 483822b4fd3..94b2ab0124d 100644 --- a/newbrt/gpma.h +++ b/newbrt/gpma.h @@ -42,6 +42,15 @@ int toku_gpma_insert (GPMA, gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know u_int32_t *indexp // Where did the item get stored? ); +// Use a bessel function to determine where to insert the data. +// Puts the new value between the rightmost -1 and the leftmost +1. +// Requires: Nothing in the pma returns 0. +int toku_gpma_insert_bessel (GPMA pma, + u_int32_t len, void *data, + gpma_besselfun_t, void *extra_for_besself, + gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know + u_int32_t *indexp // Where did the item get stored? + ); // Delete anything for which the besselfun is zero. The besselfun must be monotonically increasing compared to the comparison function. // That is, if two othings compare to be < then their besselfun's must yield <=, and if the compare to be = their besselfuns must be =, and if they are > then their besselfuns must be >= @@ -69,7 +78,8 @@ int toku_gpma_delete_item (GPMA, int toku_gpma_lookup_item (GPMA, u_int32_t len, void *data, gpma_compare_fun_t compf, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idx); // Lookup something according to the besselfun. -// If direction==0 then return something for which the besselfun is zero (or return DB_NOTFOUND). +// If direction==0 then return something for which the besselfun is zero (or return DB_NOTFOUND and set the idx to point at the spot where the item would go. That spot may already have an element in it, or it may be off the end.) +// If more than one value is zero, return the leftmost such value. // If direction>0 then return the first thing for which the besselfun is positive (or return DB_NOTFOUND). // If direction<0 then return the last thing for which the besselfun is negative (or return DB_NOTFOUND). int toku_gpma_lookup_bessel (GPMA, gpma_besselfun_t, int direction, void*extra, u_int32_t *len, void **data, u_int32_t *idx); diff --git a/newbrt/leafentry.c b/newbrt/leafentry.c index 01853b88e87..2eeedebf2ca 100644 --- a/newbrt/leafentry.c +++ b/newbrt/leafentry.c @@ -1,19 +1,20 @@ +#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved." + #include "brttypes.h" #include "crc.h" #include "leafentry.h" #include "memory.h" #include "toku_assert.h" +#include "log.h" +#include "wbuf.h" #include +#include #include #include -enum le_state { LE_COMMITTED=1, // A committed pair. - LE_BOTH, // A committed pair and a provisional pair. - LE_PROVDEL, // A committed pair that has been provisionally deleted - LE_PROVPAIR }; // No committed value, but a provisional pair. - struct leafentry { + enum typ_tag tag; // Delete this later char state; char contents[0]; } __attribute__((packed)); @@ -21,7 +22,7 @@ struct leafentry { struct contents_committed { u_int32_t keylen; u_int32_t vallen; - char *data[0]; + char data[0]; } __attribute__((packed)); struct contents_both { @@ -29,112 +30,92 @@ struct contents_both { u_int32_t keylen; u_int32_t committed_vallen; u_int32_t prov_vallen; - char *data[0]; + char data[0]; } __attribute__((packed)); -struct contents_provdelorpair { // Te PROVDEL or PROVPAIR cases +struct contents_provdelorpair { // The PROVDEL or PROVPAIR cases TXNID xid; u_int32_t keylen; u_int32_t vallen; - char *data[0]; + char data[0]; } __attribute__((packed)); -static u_int32_t committed_keylen (void*cev) { +enum le_state get_le_state(LEAFENTRY le) { + return le->state; +} +void* get_le_contents(LEAFENTRY le) { + return &le->contents[0]; +} +enum typ_tag get_le_tag(LEAFENTRY le) { + return le->tag; +} + +u_int32_t committed_keylen (void*cev) { struct contents_committed *ce=cev; return ce->keylen; } -static void* committed_key (void*cev) { +void* committed_key (void*cev) { struct contents_committed *ce=cev; return &ce->data[0]; } -static u_int32_t committed_vallen (struct contents_committed *ce) { +u_int32_t committed_vallen (struct contents_committed *ce) { return ce->vallen; } -static void* committed_val (struct contents_committed *ce) { +void* committed_val (struct contents_committed *ce) { return &ce->data[ce->keylen]; } -static TXNID both_xid (struct contents_both *ce) { +TXNID both_xid (struct contents_both *ce) { return ce->xid; } -static u_int32_t both_keylen (struct contents_both *ce) { +u_int32_t both_keylen (struct contents_both *ce) { return ce->keylen; } -static u_int32_t both_committed_vallen (struct contents_both *ce) { +u_int32_t both_committed_vallen (struct contents_both *ce) { return ce->committed_vallen; } -static u_int32_t both_prov_vallen (struct contents_both *ce) { +u_int32_t both_prov_vallen (struct contents_both *ce) { return ce->prov_vallen; } -static void* both_key (struct contents_both *ce) { +void* both_key (struct contents_both *ce) { return &ce->data[0]; } -static void* both_committed_val (struct contents_both *ce) { +void* both_committed_val (struct contents_both *ce) { return &ce->data[ce->keylen]; } -static void* both_prov_val (struct contents_both*ce) { +void* both_prov_val (struct contents_both*ce) { return &ce->data[ce->keylen+ce->committed_vallen]; } -static TXNID provdelorpair_xid (struct contents_provdelorpair *ce) { +TXNID provdelorpair_xid (struct contents_provdelorpair *ce) { return ce->xid; } -static u_int32_t provdelorpair_keylen (struct contents_provdelorpair *ce) { +u_int32_t provdelorpair_keylen (struct contents_provdelorpair *ce) { return ce->keylen; } -static u_int32_t provdelorpair_vallen (struct contents_provdelorpair *ce) { +u_int32_t provdelorpair_vallen (struct contents_provdelorpair *ce) { return ce->vallen; } -static void* provdelorpair_key (struct contents_provdelorpair *ce) { +void* provdelorpair_key (struct contents_provdelorpair *ce) { return &ce->data[0]; } -static void* provdelorpair_val (struct contents_provdelorpair *ce) { +void* provdelorpair_val (struct contents_provdelorpair *ce) { return &ce->data[ce->keylen]; } - -#define LESWITCHCALL(le,funname, ...) ({ \ - switch((enum le_state)((le)->state)) { \ - case LE_COMMITTED: return funname ## _le_committed( committed_keylen((struct contents_committed*)&(le)->contents), \ - committed_key((struct contents_committed*)&(le)->contents), \ - committed_vallen((struct contents_committed*)&(le)->contents), \ - committed_val((struct contents_committed*)&(le)->contents), \ - ## __VA_ARGS__); \ - case LE_BOTH: return funname ## _le_both( both_xid((struct contents_both*)&(le)->contents), \ - both_keylen((struct contents_both*)&(le)->contents), \ - both_key((struct contents_both*)&(le)->contents), \ - both_committed_vallen((struct contents_both*)&(le)->contents), \ - both_committed_val((struct contents_both*)&(le)->contents), \ - both_prov_vallen((struct contents_both*)&(le)->contents), \ - both_prov_val((struct contents_both*)&(le)->contents), \ - ## __VA_ARGS__); \ - case LE_PROVDEL: return funname ## _le_provdel ( provdelorpair_xid((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_keylen((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_key((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_vallen((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_val((struct contents_provdelorpair*)&(le)->contents), \ - ## __VA_ARGS__); \ - case LE_PROVPAIR: return funname ## _le_provpair(provdelorpair_xid((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_keylen((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_key((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_vallen((struct contents_provdelorpair*)&(le)->contents), \ - provdelorpair_val((struct contents_provdelorpair*)&(le)->contents), \ - ## __VA_ARGS__); \ - } abort(); }) - static u_int32_t crc_uint32_t (u_int32_t crc, u_int32_t v) { u_int32_t i = htonl(v); return toku_crc32(crc, &i, 4); @@ -183,53 +164,463 @@ u_int32_t toku_le_crc(LEAFENTRY v) { LESWITCHCALL(v, crc, crc); } -int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp) { - if (toku_mempool_get_frag_size(memp) == 0) - return -1; - void *newmem = toku_malloc(memp->size); - if (newmem == 0) - return -2; - struct mempool new_kvspace; - toku_mempool_init(&new_kvspace, newmem, memp->size); - GPMA_ITERATE(pma, idx, len, data, - ({ - void *newdata = toku_mempool_malloc(&new_kvspace, (size_t)len, 4); - assert(newdata); - memcpy(newdata, data, (size_t)len); - toku_gpma_set_at_index(pma, idx, len, newdata); - // toku_verify_gpma(pma); - })); - toku_free(memp->base); - *memp = new_kvspace; - // toku_verify_gpma(pma); - return 0; -} - - -void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size) { - void *v = toku_mempool_malloc(mp, size, 4); - if (v==0) { - if (0 == toku_gpma_compress_kvspace(pma, mp)) { - v = toku_mempool_malloc(mp, size, 4); - assert(v); - } - } - return v; -} - -int le_committed (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, GPMA pma, struct mempool *mp, LEAFENTRY *result) { +int le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) { struct contents_committed *ce; - LEAFENTRY le=mempool_malloc_from_gpma(pma, mp, sizeof(*le)+sizeof(*ce)+klen+dlen); - le->state=LE_COMMITTED; + LEAFENTRY le; + size_t size = sizeof(*le)+sizeof(*ce)+klen+dlen; + le=toku_malloc(size); + le->tag = TYP_LEAFENTRY; + le->state= LE_COMMITTED; ce=(struct contents_committed*)&le->contents[0]; ce->keylen = klen; ce->vallen = dlen; memcpy(&ce->data[0], kval, (size_t)klen); memcpy(&ce->data[klen], dval, (size_t)dlen); + *resultsize=size; + *disksize = 1 + 4 + 4 + klen + dlen; *result=le; return 0; } -int le_both (ITEMLEN cklen, bytevec ckval, ITEMLEN cdlen, bytevec cdval, ITEMLEN pdlen, bytevec pdval, - struct mempool *mp, LEAFENTRY *result); -int le_provdel (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); -int le_provpair (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); +int le_both (TXNID xid, u_int32_t klen, void* kval, u_int32_t clen, void* cval, u_int32_t plen, void* pval, + u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) { + struct contents_both *ce; + LEAFENTRY le; + size_t size = sizeof(*le)+sizeof(*ce)+klen+plen+clen; + le=toku_malloc(size); + le->tag = TYP_LEAFENTRY; + le->state= LE_BOTH; + ce=(struct contents_both*)&le->contents[0]; + ce->xid = xid; + ce->keylen = klen; + ce->committed_vallen = clen; + ce->prov_vallen = plen; + memcpy(&ce->data[0], kval, (size_t)klen); + memcpy(&ce->data[klen], cval, (size_t)clen); + memcpy(&ce->data[klen+clen], pval, (size_t)plen); + *resultsize=size; + *disksize = 1 + 8 + 4*3 + klen + clen + plen; + *result=le; + return 0; + +} +int le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, + u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result) { + struct contents_provdelorpair *ce; + LEAFENTRY le; + size_t size = sizeof(*le)+sizeof(*ce)+klen+dlen; + le=toku_malloc(size); + le->tag = TYP_LEAFENTRY; + le->state= LE_PROVDEL; + ce=(struct contents_provdelorpair*)&le->contents[0]; + ce->xid = xid; + ce->keylen = klen; + ce->vallen = dlen; + memcpy(&ce->data[0], kval, (size_t)klen); + memcpy(&ce->data[klen], dval, (size_t)dlen); + *memsize=size; + *disksize = 1 + 4 + 4 + 8 + klen + dlen; + *result=le; + return 0; +} +int le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) { + struct contents_provdelorpair *ce; + LEAFENTRY le; + size_t size = sizeof(*le)+sizeof(*ce)+klen+dlen; + le=toku_malloc(size); + le->tag = TYP_LEAFENTRY; + le->state= LE_PROVPAIR; + ce=(struct contents_provdelorpair*)&le->contents[0]; + ce->xid = xid; + ce->keylen = klen; + ce->vallen = dlen; + memcpy(&ce->data[0], kval, (size_t)klen); + memcpy(&ce->data[klen], dval, (size_t)dlen); + *resultsize=size; + *disksize = 1 + 4 + 4 + 8 + klen + dlen; + *result=le; + return 0; +} + +static u_int32_t memsize_le_committed (u_int32_t keylen, void *key __attribute__((__unused__)), + u_int32_t vallen, void *val __attribute__((__unused__))) { + return sizeof(struct leafentry) + sizeof(struct contents_committed) + keylen + vallen; +} + +static u_int32_t memsize_le_both (TXNID txnid __attribute__((__unused__)), + u_int32_t klen, void *kval __attribute__((__unused__)), + u_int32_t clen, void *cval __attribute__((__unused__)), + u_int32_t plen, void *pval __attribute__((__unused__))) { + return sizeof(struct leafentry) + sizeof(struct contents_both) + klen + clen + plen; +} + +static u_int32_t memsize_le_provdel (TXNID txnid __attribute__((__unused__)), + u_int32_t klen, void *kval __attribute__((__unused__)), + u_int32_t clen, void *cval __attribute__((__unused__))) { + return sizeof(struct leafentry) + sizeof(struct contents_provdelorpair) + klen + clen; +} + +static u_int32_t memsize_le_provpair (TXNID txnid __attribute__((__unused__)), + u_int32_t klen, void *kval __attribute__((__unused__)), + u_int32_t plen, void *pval __attribute__((__unused__))) { + return sizeof(struct leafentry) + sizeof(struct contents_provdelorpair) + klen + plen; +} + +u_int32_t leafentry_memsize (LEAFENTRY le) { + LESWITCHCALL(le, memsize); +} + +static u_int32_t disksize_le_committed (u_int32_t keylen, void *key __attribute__((__unused__)), + u_int32_t vallen, void *val __attribute__((__unused__))) { + return 1 + 4 + 4 + keylen + vallen; +} + +static u_int32_t disksize_le_both (TXNID txnid __attribute__((__unused__)), + u_int32_t klen, void *kval __attribute__((__unused__)), + u_int32_t clen, void *cval __attribute__((__unused__)), + u_int32_t plen, void *pval __attribute__((__unused__))) { + return 1 + 8 + 4*3 + klen + clen + plen; +} + +static u_int32_t disksize_le_provdel (TXNID txnid __attribute__((__unused__)), + u_int32_t klen, void *kval __attribute__((__unused__)), + u_int32_t clen, void *cval __attribute__((__unused__))) { + return 1 + 8 + 4 + 4 + klen + clen; +} + +static u_int32_t disksize_le_provpair (TXNID txnid __attribute__((__unused__)), + u_int32_t klen, void *kval __attribute__((__unused__)), + u_int32_t plen, void *pval __attribute__((__unused__))) { + return 1 + 8 + 4 + 4 + klen + plen; +} + +u_int32_t leafentry_disksize (LEAFENTRY le) { + LESWITCHCALL(le, disksize); +} + +u_int32_t toku_logsizeof_LEAFENTRY (LEAFENTRY le) { + return leafentry_disksize(le); +} + +int toku_fread_LEAFENTRY(FILE *f, LEAFENTRY *le, u_int32_t *crc, u_int32_t *len) { + u_int8_t state; + int r = toku_fread_u_int8_t (f, &state, crc, len); if (r!=0) return r; + TXNID xid; + BYTESTRING a,b,c; + u_int32_t memsize, disksize; + switch ((enum le_state)state) { + case LE_COMMITTED: + r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r; + r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) return r; + r = le_committed(a.len, a.data, b.len, b.data, + &memsize, &disksize, le); + toku_free_BYTESTRING(a); + toku_free_BYTESTRING(b); + return r; + case LE_BOTH: + r = toku_fread_TXNID(f, &xid, crc, len); + r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r; + r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) return r; + r = toku_fread_BYTESTRING(f, &c, crc, len); if (r!=0) return r; + r = le_both(xid, a.len, a.data, b.len, b.data, c.len, c.data, + &memsize, &disksize, le); + toku_free_BYTESTRING(a); + toku_free_BYTESTRING(b); + toku_free_BYTESTRING(c); + return r; + case LE_PROVDEL: + r = toku_fread_TXNID(f, &xid, crc, len); + r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r; + r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) return r; + r = le_provdel(xid, a.len, a.data, b.len, b.data, + &memsize, &disksize, le); + toku_free_BYTESTRING(a); + toku_free_BYTESTRING(b); + return r; + case LE_PROVPAIR: + r = toku_fread_TXNID(f, &xid, crc, len); + r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r; + r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) return r; + r = le_provpair(xid, a.len, a.data, b.len, b.data, + &memsize, &disksize, le); + toku_free_BYTESTRING(a); + toku_free_BYTESTRING(b); + return r; + } + return DB_BADFORMAT; +} + +static int print_le_committed (u_int32_t keylen, void *key, u_int32_t vallen, void *val, FILE *outf) { + fprintf(outf, "{C: "); + toku_print_BYTESTRING(outf, keylen, key); + toku_print_BYTESTRING(outf, vallen, val); + fprintf(outf, "}"); + return 0; +} + +static int print_le_both (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval, FILE *outf) { + fprintf(outf, "{B: "); + fprintf(outf, " xid=%" PRId64, xid); + fprintf(outf, " key="); + toku_print_BYTESTRING(outf, klen, kval); + toku_print_BYTESTRING(outf, clen, cval); + fprintf(outf, " provisional="); + toku_print_BYTESTRING(outf, plen, pval); + fprintf(outf, "}"); + return 0; +} + +static int print_le_provdel (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, FILE *outf) { + fprintf(outf, "{D: "); + fprintf(outf, " xid=%" PRId64, xid); + fprintf(outf, " key="); + toku_print_BYTESTRING(outf, klen, kval); + fprintf(outf, " committed="); + toku_print_BYTESTRING(outf, clen, cval); + fprintf(outf, "}"); + return 0; +} + +static int print_le_provpair (TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval, FILE *outf) { + fprintf(outf, "{P: "); + fprintf(outf, " xid=%" PRId64, xid); + fprintf(outf, " key="); + toku_print_BYTESTRING(outf, klen, kval); + fprintf(outf, " provisional="); + toku_print_BYTESTRING(outf, plen, pval); + fprintf(outf, "}"); + return 0; +} + +int print_leafentry (FILE *outf, LEAFENTRY v) { + if (!v) return 0; + LESWITCHCALL(v, print, outf); +} + +int toku_logprint_LEAFENTRY (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format __attribute__((__unused__))) { + LEAFENTRY v; + int r = toku_fread_LEAFENTRY(inf, &v, crc, len); + if (r!=0) return r; + fprintf(outf, " %s=", fieldname); + print_leafentry(outf, v); + toku_free(v); + return 0; +} + +static int wbuf_le_committed (u_int32_t keylen, void *key, u_int32_t vallen, void *val, struct wbuf *w) { + wbuf_bytes(w, key, keylen); + wbuf_bytes(w, val, vallen); + return 0; +} + +static int wbuf_le_both (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval, struct wbuf *w) { + wbuf_TXNID(w, xid); + wbuf_bytes(w, kval, klen); + wbuf_bytes(w, cval, clen); + wbuf_bytes(w, pval, plen); + return 0; +} + +static int wbuf_le_provdel (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, struct wbuf *w) { + wbuf_TXNID(w, xid); + wbuf_bytes(w, kval, klen); + wbuf_bytes(w, cval, clen); + return 0; +} +static int wbuf_le_provpair (TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval, struct wbuf *w) { + wbuf_TXNID(w, xid); + wbuf_bytes(w, kval, klen); + wbuf_bytes(w, pval, plen); + return 0; +} + +static int do_wbuf_le (struct wbuf *w, LEAFENTRY le) { + LESWITCHCALL(le, wbuf, w); +} +void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le) { + wbuf_char(w, (unsigned int)le->state); + do_wbuf_le(w,le); +} + +void rbuf_LEAFENTRY(struct rbuf *r, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *le) { + enum le_state state = rbuf_char(r); + switch (state) { + case LE_COMMITTED: { + //printf("%s:%d reading committed\n", __FILE__, __LINE__); + bytevec key, val; + u_int32_t keylen, vallen; + rbuf_bytes(r, &key, &keylen); + rbuf_bytes(r, &val, &vallen); + le_committed(keylen, (void*)key, vallen, (void*)val, resultsize, disksize, le); + return; + } + case LE_BOTH: { + //printf("%s:%d reading both\n", __FILE__, __LINE__); + bytevec kval, cval, pval; + u_int32_t klen, clen, plen; + TXNID xid = rbuf_ulonglong(r); + rbuf_bytes(r, &kval, &klen); + rbuf_bytes(r, &cval, &clen); + rbuf_bytes(r, &pval, &plen); + le_both(xid, klen, (void*)kval, clen, (void*)cval, plen, (void*)pval, resultsize, disksize, le); + return; + } + case LE_PROVDEL: { + //printf("%s:%d reading provdel\n", __FILE__, __LINE__); + bytevec kval, cval; + u_int32_t klen, clen; + TXNID xid = rbuf_ulonglong(r); + rbuf_bytes(r, &kval, &klen); + rbuf_bytes(r, &cval, &clen); + le_provdel(xid, klen, (void*)kval, clen, (void*)cval, resultsize, disksize, le); + return; + } + case LE_PROVPAIR: { + //printf("%s:%d reading both\n", __FILE__, __LINE__); + bytevec kval, pval; + u_int32_t klen, plen; + TXNID xid = rbuf_ulonglong(r); + rbuf_bytes(r, &kval, &klen); + rbuf_bytes(r, &pval, &plen); + le_provpair(xid, klen, (void*)kval, plen, (void*)pval, resultsize, disksize, le); + return; + } + } + assert(0); +} + +// Use toku_free() +void toku_free_LEAFENTRY(LEAFENTRY le) { + toku_free(le); +} + +int le_is_provdel(LEAFENTRY le) { + return le->state==LE_PROVDEL; +} + +void* latest_key_le_committed (u_int32_t UU(keylen), void *key, u_int32_t UU(vallen), void *UU(val)) { + return key; +} +void* latest_key_le_both (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) { + return kval; +} +void* latest_key_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) { + return 0; // for provisional delete, there is no *latest* key, so return NULL +} +void* latest_key_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(plen), void *UU(pval)) { + return kval; +} +void* le_latest_key (LEAFENTRY le) { + LESWITCHCALL(le, latest_key); +} + +u_int32_t latest_keylen_le_committed (u_int32_t keylen, void *UU(key), u_int32_t UU(vallen), void *UU(val)) { + return keylen; +} +u_int32_t latest_keylen_le_both (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) { + return klen; +} +u_int32_t latest_keylen_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) { + return 0; // for provisional delete, there is no *latest* key, so return 0. What else can we do? +} +u_int32_t latest_keylen_le_provpair (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(plen), void *UU(pval)) { + return klen; +} +u_int32_t le_latest_keylen (LEAFENTRY le) { + LESWITCHCALL(le, latest_keylen); +} + +void* latest_val_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t UU(vallen), void *UU(val)) { + return val; +} +void* latest_val_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *cval, u_int32_t UU(plen), void *UU(pval)) { + return cval; +} +void* latest_val_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) { + return 0; // for provisional delete, there is no *latest* key, so return NULL +} +void* latest_val_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(plen), void *pval) { + return pval; +} +void* le_latest_val (LEAFENTRY le) { + LESWITCHCALL(le, latest_val); +} + +u_int32_t latest_vallen_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t vallen, void *UU(val)) { + return vallen; +} +u_int32_t latest_vallen_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t plen, void *UU(pval)) { + return plen; +} +u_int32_t latest_vallen_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) { + return 0; // for provisional delete, there is no *latest* key, so return 0. What else can we do? +} +u_int32_t latest_vallen_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t plen, void *UU(pval)) { + return plen; +} +u_int32_t le_latest_vallen (LEAFENTRY le) { + LESWITCHCALL(le, latest_vallen); +} + +void* any_key_le_committed (u_int32_t UU(keylen), void *key, u_int32_t UU(vallen), void *UU(val)) { + return key; +} +void* any_key_le_both (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) { + return kval; +} +void* any_key_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(clen), void *UU(cval)) { + return kval; +} +void* any_key_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(plen), void *UU(pval)) { + return kval; +} +void* le_any_key (LEAFENTRY le) { + LESWITCHCALL(le, any_key); +} + +u_int32_t any_keylen_le_committed (u_int32_t keylen, void *UU(key), u_int32_t UU(vallen), void *UU(val)) { + return keylen; +} +u_int32_t any_keylen_le_both (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) { + return klen; +} +u_int32_t any_keylen_le_provdel (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(clen), void *UU(cval)) { + return klen; +} +u_int32_t any_keylen_le_provpair (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(plen), void *UU(pval)) { + return klen; +} +u_int32_t le_any_keylen (LEAFENTRY le) { + LESWITCHCALL(le, any_keylen); +} + +void* any_val_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t UU(vallen), void *UU(val)) { + return val; +} +void* any_val_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *cval, u_int32_t UU(plen), void *UU(pval)) { + return cval; +} +void* any_val_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *cval) { + return cval; +} +void* any_val_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(plen), void *pval) { + return pval; +} +void* le_any_val (LEAFENTRY le) { + LESWITCHCALL(le, any_val); +} + +u_int32_t any_vallen_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t vallen, void *UU(val)) { + return vallen; +} +u_int32_t any_vallen_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t plen, void *UU(pval)) { + return plen; +} +u_int32_t any_vallen_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t clen, void *UU(cval)) { + return clen; // for provisional delete, there is no *any* key, so return 0. What else can we do? +} +u_int32_t any_vallen_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t plen, void *UU(pval)) { + return plen; +} +u_int32_t le_any_vallen (LEAFENTRY le) { + LESWITCHCALL(le, any_vallen); +} diff --git a/newbrt/leafentry.h b/newbrt/leafentry.h index 05c0bfdadf3..4dca4c383e8 100644 --- a/newbrt/leafentry.h +++ b/newbrt/leafentry.h @@ -30,19 +30,104 @@ #include "mempool.h" #include "brttypes.h" #include "gpma.h" +#include "rbuf.h" typedef struct leafentry *LEAFENTRY; -u_int32_t le_crc(LEAFENTRY v); +u_int32_t toku_le_crc(LEAFENTRY v); -int le_committed (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, GPMA pma, struct mempool *mp, LEAFENTRY *result); -int le_both (ITEMLEN cklen, bytevec ckval, ITEMLEN cdlen, bytevec cdval, ITEMLEN pdlen, bytevec pdval, - struct mempool *mp, LEAFENTRY *result); -int le_provdel (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); -int le_provpair (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); +int le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result); +int le_both (TXNID xid, u_int32_t cklen, void* ckval, u_int32_t cdlen, void* cdval, u_int32_t pdlen, void* pdval, + u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result); +int le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, + u_int32_t *resultsize, u_int32_t *memsize, LEAFENTRY *result); +int le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, + u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result); -int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp); -void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size); +enum le_state { LE_COMMITTED=1, // A committed pair. + LE_BOTH, // A committed pair and a provisional pair. + LE_PROVDEL, // A committed pair that has been provisionally deleted + LE_PROVPAIR }; // No committed value, but a provisional pair. + +struct contents_committed; +struct contents_both; +struct contents_provdelorpair; + +u_int32_t leafentry_memsize (LEAFENTRY); + +enum le_state get_le_state(LEAFENTRY); +void *get_le_contents(LEAFENTRY); +enum typ_tag get_le_tag(LEAFENTRY); + +u_int32_t committed_keylen (void*cev); +void* committed_key (void*cev); +u_int32_t committed_vallen (struct contents_committed *ce); +void* committed_val (struct contents_committed *ce); +TXNID both_xid (struct contents_both *ce); +u_int32_t both_keylen (struct contents_both *ce); +u_int32_t both_committed_vallen (struct contents_both *ce); +u_int32_t both_prov_vallen (struct contents_both *ce); +void* both_key (struct contents_both *ce); +void* both_committed_val (struct contents_both *ce); +void* both_prov_val (struct contents_both*ce); +TXNID provdelorpair_xid (struct contents_provdelorpair *ce); +u_int32_t provdelorpair_keylen (struct contents_provdelorpair *ce); +u_int32_t provdelorpair_vallen (struct contents_provdelorpair *ce); +void* provdelorpair_key (struct contents_provdelorpair *ce); +void* provdelorpair_val (struct contents_provdelorpair *ce); + +#define LESWITCHCALL(le,funname, ...) ({ \ + assert(get_le_tag(le)==TYP_LEAFENTRY); \ + switch(get_le_state(le)) { \ + case LE_COMMITTED: return funname ## _le_committed( committed_keylen((struct contents_committed*)(get_le_contents(le))), \ + committed_key((struct contents_committed*)(get_le_contents(le))), \ + committed_vallen((struct contents_committed*)(get_le_contents(le))), \ + committed_val((struct contents_committed*)(get_le_contents(le))), \ + ## __VA_ARGS__); \ + case LE_BOTH: return funname ## _le_both( both_xid((struct contents_both*)(get_le_contents(le))), \ + both_keylen((struct contents_both*)(get_le_contents(le))), \ + both_key((struct contents_both*)(get_le_contents(le))), \ + both_committed_vallen((struct contents_both*)(get_le_contents(le))), \ + both_committed_val((struct contents_both*)(get_le_contents(le))), \ + both_prov_vallen((struct contents_both*)(get_le_contents(le))), \ + both_prov_val((struct contents_both*)(get_le_contents(le))), \ + ## __VA_ARGS__); \ + case LE_PROVDEL: return funname ## _le_provdel ( provdelorpair_xid((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_keylen((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_key((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_vallen((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_val((struct contents_provdelorpair*)(get_le_contents(le))), \ + ## __VA_ARGS__); \ + case LE_PROVPAIR: return funname ## _le_provpair(provdelorpair_xid((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_keylen((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_key((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_vallen((struct contents_provdelorpair*)(get_le_contents(le))), \ + provdelorpair_val((struct contents_provdelorpair*)(get_le_contents(le))), \ + ## __VA_ARGS__); \ + } abort(); }) + + +u_int32_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory. +u_int32_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk. +u_int32_t toku_logsizeof_LEAFENTRY(LEAFENTRY le); +void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le); +void rbuf_LEAFENTRY(struct rbuf *r, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *le); +int toku_fread_LEAFENTRY(FILE *f, LEAFENTRY *le, u_int32_t *crc, u_int32_t *len); // read a leafentry from a log +int toku_logprint_LEAFENTRY(FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format); // read a leafentry from a log and then print it in human-readable form. +void toku_free_LEAFENTRY(LEAFENTRY le); +int print_leafentry (FILE *outf, LEAFENTRY v); // Print a leafentry out in human-readable form. + +int le_is_provdel(LEAFENTRY le); // Return true if it is a provisional delete. +void* le_latest_key (LEAFENTRY le); // Return the latest key (return NULL for provisional deletes) +u_int32_t le_latest_keylen (LEAFENTRY le); // Return the latest keylen. +void* le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes) +u_int32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen. Returns 0 for provisional deletes. + + // Return any key or value (even if it's only provisional) +void* le_any_key (LEAFENTRY le); +u_int32_t le_any_keylen (LEAFENTRY le); +void* le_any_val (LEAFENTRY le); +u_int32_t le_any_vallen (LEAFENTRY le); #endif diff --git a/newbrt/log.c b/newbrt/log.c index 63d74f19373..31a71c8187c 100644 --- a/newbrt/log.c +++ b/newbrt/log.c @@ -343,10 +343,19 @@ int toku_logger_finish (TOKULOGGER logger, struct logbytes *logbytes, struct wbu } int toku_logger_commit (TOKUTXN txn, int nosync) { + // printf("%s:%d committing\n", __FILE__, __LINE__); // panic handled in log_commit int r = toku_log_commit(txn->logger, (LSN*)0, (txn->parent==0) && !nosync, txn->txnid64); // exits holding neither of the tokulogger locks. - if (r!=0) goto free_and_return; - if (txn->parent!=0) { + if (r!=0) { + struct roll_entry *item; + broken: + while ((item=txn->newest_logentry)) { + txn->newest_logentry = item->prev; + rolltype_dispatch(item, toku_free_rolltype_); + toku_free(item); + } + r = 0; + } else if (txn->parent!=0) { // Append the list to the front. if (txn->oldest_logentry) { // There are some entries, so link them in. @@ -357,18 +366,23 @@ int toku_logger_commit (TOKUTXN txn, int nosync) { txn->parent->oldest_logentry = txn->oldest_logentry; } txn->newest_logentry = txn->oldest_logentry = 0; - } - free_and_return: - { + r = 0; + } else { + // do the commit calls and free everything + // we do the commit calls in reverse order too. struct roll_entry *item; + //printf("%s:%d abort\n", __FILE__, __LINE__); while ((item=txn->newest_logentry)) { txn->newest_logentry = item->prev; + rolltype_dispatch_assign(item, toku_commit_, r, txn); + if (r!=0) goto broken; rolltype_dispatch(item, toku_free_rolltype_); toku_free(item); } - list_remove(&txn->live_txns_link); - toku_free(txn); + r = 0; } + list_remove(&txn->live_txns_link); + toku_free(txn); return r; } @@ -402,7 +416,7 @@ int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, int mode) { BYTESTRING bs = { .len=strlen(fname), .data = strdup(fname) }; int r = toku_log_fcreate (txn->logger, (LSN*)0, 0, toku_txn_get_txnid(txn), bs, mode); if (r!=0) return r; - r = toku_logger_save_rollback_fcreate(txn, bs); + r = toku_logger_save_rollback_fcreate(txn, toku_txn_get_txnid(txn), bs); return r; } @@ -569,23 +583,30 @@ int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, u_int return 0; } -int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format __attribute__((__unused__))) { - BYTESTRING bs; - int r = toku_fread_BYTESTRING(inf, &bs, crc, len); - if (r!=0) return r; - fprintf(outf, " %s={len=%d data=\"", fieldname, bs.len); + +void toku_print_BYTESTRING (FILE *outf, u_int32_t len, char *data) { + fprintf(outf, "{len=%d data=\"", len); u_int32_t i; - for (i=0; icheckpoint_lsns[0].lsn, (long long)logger->checkpoint_lsns[1].lsn); if ((earliest_lsn_seen.lsn <= logger->checkpoint_lsns[0].lsn)&& - (earliest_lsn_seen.lsn <= logger->checkpoint_lsns[1].lsn)) { + (earliest_lsn_seen.lsn <= logger->checkpoint_lsns[1].lsn)&& + (earliest_lsn_seen.lsn <= oldest_live_txn_lsn.lsn)) { break; } } diff --git a/newbrt/log.h b/newbrt/log.h index ccac09c47db..2b3e3414aa5 100644 --- a/newbrt/log.h +++ b/newbrt/log.h @@ -70,6 +70,9 @@ int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, int toku_logprint_LOGGEDBRTHEADER (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *); int toku_logprint_INTPAIRARRAY (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *); +// Useful thing for printing a bytestring. +void toku_print_BYTESTRING (FILE *outf, u_int32_t len, char *data); + int toku_read_and_print_logmagic (FILE *f, u_int32_t *version); TXNID toku_txn_get_txnid (TOKUTXN); @@ -110,9 +113,6 @@ static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) { static inline void toku_free_BYTESTRING(BYTESTRING val) { toku_free(val.data); } -static inline void toku_free_DISKOFFARRAY(DISKOFFARRAY val) { - toku_free(val.array); -} static inline int toku_copy_LOGGEDBRTHEADER(LOGGEDBRTHEADER *target, LOGGEDBRTHEADER val) { *target = val; diff --git a/newbrt/logformat.c b/newbrt/logformat.c index b166df63567..d0fafe03cfd 100644 --- a/newbrt/logformat.c +++ b/newbrt/logformat.c @@ -39,23 +39,38 @@ struct logtype { int logformat_version_number = 0; const struct logtype rollbacks[] = { - {"fcreate", 'F', FA{{"BYTESTRING", "fname", 0}, + {"fcreate", 'F', FA{{"TXNID", "xid", 0}, + {"BYTESTRING", "fname", 0}, NULLFIELD}}, + {"cmdinsert", 'i', FA{{"TXNID", "xid", 0}, + {"FILENUM", "filenum", 0}, + {"BYTESTRING", "key", 0}, + {"BYTESTRING", "data", 0}, + NULLFIELD}}, + {"cmddeleteboth", 'D', FA{{"TXNID", "xid", 0}, + {"FILENUM", "filenum", 0}, + {"BYTESTRING", "key", 0}, + {"BYTESTRING", "data", 0}, + NULLFIELD}}, + {"cmddelete", 'd', FA{{"TXNID", "xid", 0}, + {"FILENUM", "filenum", 0}, + {"BYTESTRING", "key", 0}, + NULLFIELD}}, // {"fclose", 'c', FA{{"FILENUM", "filenum", 0}, // {"BYTESTRING", "fname", 0}, // NULLFIELD}}, - {"deleteatleaf", 'd', FA{{"FILENUM", "filenum", 0}, // Note a delete for rollback. The delete takes place in a leaf. - {"BYTESTRING", "key", 0}, - {"BYTESTRING", "data", 0}, - NULLFIELD}}, - {"insertatleaf", 'i', FA{{"FILENUM", "filenum", 0}, // Note an insert for rollback. The insert takes place in a leaf. - {"BYTESTRING", "key", 0}, - {"BYTESTRING", "data", 0}, - NULLFIELD}}, - {"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0}, - {"DISKOFFARRAY", "parents", 0}, - {"DISKOFF", "diskoff", 0}, - NULLFIELD}}, +// {"deleteatleaf", 'd', FA{{"FILENUM", "filenum", 0}, // Note a delete for rollback. The delete takes place in a leaf. +// {"BYTESTRING", "key", 0}, +// {"BYTESTRING", "data", 0}, +// NULLFIELD}}, +// {"insertatleaf", 'i', FA{{"FILENUM", "filenum", 0}, // Note an insert for rollback. The insert takes place in a leaf. +// {"BYTESTRING", "key", 0}, +// {"BYTESTRING", "data", 0}, +// NULLFIELD}}, +// {"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0}, +// {"DISKOFFARRAY", "parents", 0}, +// {"DISKOFF", "diskoff", 0}, +// NULLFIELD}}, {0,0,FA{NULLFIELD}} }; @@ -152,13 +167,29 @@ const struct logtype logtypes[] = { {"u_int32_t", "oldfingerprint", "%08x"}, {"u_int32_t", "newfingerprint", "%08x"}, NULLFIELD}}, - {"insertinleaf", 'I', FA{{"TXNID", "txnid", 0}, - {"FILENUM", "filenum", 0}, - {"DISKOFF", "diskoff", 0}, - {"u_int32_t", "pmaidx", 0}, - {"BYTESTRING", "key", 0}, - {"BYTESTRING", "data", 0}, - NULLFIELD}}, +// {"insertinleaf", 'I', FA{{"TXNID", "txnid", 0}, +// {"FILENUM", "filenum", 0}, +// {"DISKOFF", "diskoff", 0}, +// {"u_int32_t", "pmaidx", 0}, +// {"BYTESTRING", "key", 0}, +// {"BYTESTRING", "data", 0}, +// NULLFIELD}}, +// {"replaceleafentry", 'L', FA{{"FILENUM", "filenum", 0}, +// {"DISKOFF", "diskoff", 0}, +// {"u_int32_t", "pmaidx", 0}, +// {"LEAFENTRY", "oldleafentry", 0}, +// {"LEAFENTRY", "newleafentry", 0}, +// NULLFIELD}}, + {"insertleafentry", 'I', FA{{"FILENUM", "filenum", 0}, + {"DISKOFF", "diskoff", 0}, + {"u_int32_t", "pmaidx", 0}, + {"LEAFENTRY", "newleafentry", 0}, + NULLFIELD}}, + {"deleteleafentry", 'D', FA{{"FILENUM", "filenum", 0}, + {"DISKOFF", "diskoff", 0}, + {"u_int32_t", "pmaidx", 0}, + {"LEAFENTRY", "oldleafentry", 0}, + NULLFIELD}}, {"deleteinleaf", 'd', FA{{"TXNID", "txnid", 0}, {"FILENUM", "filenum", 0}, {"DISKOFF", "diskoff", 0}, @@ -259,6 +290,9 @@ void generate_log_struct (void) { fprintf(hf, "int toku_rollback_%s (", lt->name); DO_FIELDS(ft, lt, fprintf(hf, "%s %s,", ft->type, ft->name)); fprintf(hf, "TOKUTXN txn);\n"); + fprintf(hf, "int toku_commit_%s (", lt->name); + DO_FIELDS(ft, lt, fprintf(hf, "%s %s,", ft->type, ft->name)); + fprintf(hf, "TOKUTXN txn);\n"); })); fprintf(hf, "struct log_entry {\n"); fprintf(hf, " enum lt_cmd cmd;\n"); diff --git a/newbrt/memory.h b/newbrt/memory.h index 37e5f435f34..61966982fa0 100644 --- a/newbrt/memory.h +++ b/newbrt/memory.h @@ -10,12 +10,13 @@ /* Generally: errno is set to 0 or a value to indicate problems. */ -enum typ_tag { TYP_BRTNODE = 0xdead0001, +enum typ_tag { TYP_BRTNODE = 3735879681, //0xdead0001, TYP_CACHETABLE, TYP_PAIR, /* for cachetables */ TYP_PMA, TYP_GPMA, TYP_TOKULOGGER, - TYP_TOKUTXN + TYP_TOKUTXN, + TYP_LEAFENTRY }; /* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */ diff --git a/newbrt/mempool.c b/newbrt/mempool.c index 2069f18b358..8c543ed6c00 100644 --- a/newbrt/mempool.c +++ b/newbrt/mempool.c @@ -45,6 +45,7 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) { assert(mp->free_offset <= mp->size); void *vp; size_t offset = (mp->free_offset + (alignment-1)) & ~(alignment-1); + //printf("mempool_malloc size=%ld base=%p free_offset=%ld mp->size=%ld offset=%ld\n", size, mp->base, mp->free_offset, mp->size, offset); if (offset + size > mp->size) { vp = 0; } else { @@ -54,11 +55,14 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) { assert(mp->free_offset <= mp->size); assert(((long)vp & (alignment-1)) == 0); assert(vp == 0 || (mp->base <= vp && vp + size <= mp->base + mp->size)); + //printf("mempool returning %p\n", vp); return vp; } +// if vp is null then we are freeing something, but not specifying what. The data won't be freed until compression is done. void toku_mempool_mfree(struct mempool *mp, void *vp, int size) { - assert(size >= 0 && mp->base <= vp && vp + size <= mp->base + mp->size); + assert(size >= 0); + if (vp) assert(toku_mempool_inrange(mp, vp, size)); mp->frag_size += size; assert(mp->frag_size <= mp->size); } diff --git a/newbrt/rbuf.h b/newbrt/rbuf.h index 8f20508c7f2..b01feef419e 100644 --- a/newbrt/rbuf.h +++ b/newbrt/rbuf.h @@ -3,7 +3,7 @@ #ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." -#include +#include "toku_assert.h" struct rbuf { unsigned char *buf; @@ -34,19 +34,19 @@ static inline void rbuf_literal_bytes (struct rbuf *r, bytevec *bytes, unsigned } /* Return a pointer into the middle of the buffer. */ -static void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes) +static inline void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes) { *n_bytes = rbuf_int(r); rbuf_literal_bytes(r, bytes, *n_bytes); } -static unsigned long long rbuf_ulonglong (struct rbuf *r) { +static inline unsigned long long rbuf_ulonglong (struct rbuf *r) { unsigned i0 = rbuf_int(r); unsigned i1 = rbuf_int(r); return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1)); } -static DISKOFF rbuf_diskoff (struct rbuf *r) { +static inline DISKOFF rbuf_diskoff (struct rbuf *r) { unsigned i0 = rbuf_int(r); unsigned i1 = rbuf_int(r); return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1)); diff --git a/newbrt/recover.c b/newbrt/recover.c index cc9e2aa7d6e..27118c0c9df 100644 --- a/newbrt/recover.c +++ b/newbrt/recover.c @@ -21,7 +21,7 @@ #include #include -#define DO_VERIFY_COUNTS +//#define DO_VERIFY_COUNTS #ifdef DO_VERIFY_COUNTS #define VERIFY_COUNTS(n) toku_verify_counts(n) #else @@ -390,7 +390,7 @@ void toku_recover_fopen (LSN UU(lsn), TXNID UU(txnid), BYTESTRING fname, FILENUM toku_free_BYTESTRING(fname); } -void toku_recover_insertinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, BYTESTRING keybs, BYTESTRING databs) { +void toku_recover_insertleafentry (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, LEAFENTRY newleafentry) { struct cf_pair *pair = NULL; int r = find_cachefile(filenum, &pair); assert(r==0); @@ -401,25 +401,84 @@ void toku_recover_insertinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO BRTNODE node = node_v; assert(node->height==0); VERIFY_COUNTS(node); - struct kv_pair *kvp = brtnode_malloc_kv_pair(node->u.l.buffer, &node->u.l.buffer_mempool, keybs.data, keybs.len, databs.data, databs.len); - assert(pair); - toku_gpma_set_at_index(node->u.l.buffer, pmaidx, kv_pair_size(kvp), kvp); - node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len); -// printf("%s:%d local_fingerprint=%08x (this=%08x)\n", __FILE__, __LINE__, node->local_fingerprint, toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len)); - node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keybs.len + databs.len; - -// PMA_ITERATE_IDX(node->u.l.buffer, idx, skey, keylen __attribute__((__unused__)), sdata, datalen __attribute__((__unused__)), -// printf("%d: %s %s\n", idx, (char*)skey, (char*)sdata)); - - VERIFY_COUNTS(node); - node->log_lsn = lsn; + { + int memsize = leafentry_memsize(newleafentry); + void *mem = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, memsize); + memcpy(mem, newleafentry, memsize); + toku_gpma_set_at_index(node->u.l.buffer, pmaidx, memsize, mem); + node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + leafentry_disksize(newleafentry); + node->local_fingerprint += node->rand4fingerprint * toku_le_crc(newleafentry); + } r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node)); assert(r==0); - toku_free_BYTESTRING(keybs); - toku_free_BYTESTRING(databs); + toku_free_LEAFENTRY(newleafentry); } +void toku_recover_deleteleafentry (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, LEAFENTRY oldleafentry) { + struct cf_pair *pair = NULL; + int r = find_cachefile(filenum, &pair); + assert(r==0); + void *node_v; + assert(pair->brt); + r = toku_cachetable_get_and_pin(pair->cf, diskoff, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, pair->brt); + assert(r==0); + BRTNODE node = node_v; + assert(node->height==0); + VERIFY_COUNTS(node); + node->log_lsn = lsn; + { + u_int32_t len; void *data; + r=toku_gpma_get_from_index(node->u.l.buffer, pmaidx, &len, &data); + assert(r==0); + assert(len==leafentry_memsize(oldleafentry)); + assert(memcmp(oldleafentry, data, len)==0); + node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(data); + node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(data); + toku_mempool_mfree(&node->u.l.buffer_mempool, data, len); + toku_gpma_clear_at_index(node->u.l.buffer, pmaidx); + } + r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node)); + assert(r==0); + toku_free_LEAFENTRY(oldleafentry); +} + +//void toku_recover_replaceleafentry (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, LEAFENTRY oldleafentry, LEAFENTRY newleafentry) { +// struct cf_pair *pair = NULL; +// int r = find_cachefile(filenum, &pair); +// assert(r==0); +// void *node_v; +// assert(pair->brt); +// r = toku_cachetable_get_and_pin(pair->cf, diskoff, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, pair->brt); +// assert(r==0); +// BRTNODE node = node_v; +// assert(node->height==0); +// VERIFY_COUNTS(node); +// node->log_lsn = lsn; +// { +// u_int32_t len; void *data; +// r=toku_gpma_get_from_index(node->u.l.buffer, pmaidx, &len, &data); +// assert(r==0); +// assert(len==leafentry_memsize(oldleafentry)); +// assert(memcmp(oldleafentry, data, len)==0); +// node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(data); +// node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(data); +// toku_mempool_mfree(&node->u.l.buffer_mempool, data, len); +// } +// { +// int memsize = leafentry_memsize(newleafentry); +// void *mem = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, memsize); +// memcpy(mem, newleafentry, memsize); +// toku_gpma_set_at_index(node->u.l.buffer, pmaidx, memsize, mem); +// node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + leafentry_disksize(newleafentry); +// node->local_fingerprint += node->rand4fingerprint * toku_le_crc(newleafentry); +// } +// r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node)); +// assert(r==0); +// toku_free_LEAFENTRY(oldleafentry); +// toku_free_LEAFENTRY(newleafentry); +//} + void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, BYTESTRING keybs, BYTESTRING databs) { struct cf_pair *pair = NULL; int r = find_cachefile(filenum, &pair); @@ -440,7 +499,8 @@ void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO } } toku_gpma_clear_at_index(node->u.l.buffer, pmaidx); - node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len); + assert(!"kvpair"); + //node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len); node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keybs.len + databs.len; VERIFY_COUNTS(node); node->log_lsn = lsn; @@ -449,7 +509,6 @@ void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO toku_free_BYTESTRING(keybs); toku_free_BYTESTRING(databs); } - // a newbrtnode should have been done before this void toku_recover_resizepma (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t oldsize __attribute__((__unused__)), u_int32_t newsize) { struct cf_pair *pair = NULL; @@ -490,9 +549,9 @@ int move_indices (GPMA from, struct mempool *from_mempool, struct gitem item = from->items[idx]; items[i]=item; from->items[idx].data = 0; - fp += toku_calccrc32_kvpair_struct(item.data); - sizediff += PMA_ITEM_OVERHEAD + item.len; - assert(kv_pair_size(item.data)==item.len); + fp += toku_le_crc(item.data); + sizediff += PMA_ITEM_OVERHEAD + leafentry_disksize(item.data); + assert(leafentry_memsize(item.data)==item.len); } from->n_items_present -= fromto.size; @@ -512,7 +571,7 @@ int move_indices (GPMA from, struct mempool *from_mempool, to->items[to_idx] = (struct gitem){items[i].len, new_data}; toku_mempool_mfree(from_mempool, items[i].data, items[i].len); } - assert(kv_pair_size(to->items[to_idx].data)==to->items[to_idx].len); + assert(leafentry_memsize(to->items[to_idx].data)==to->items[to_idx].len); } to->n_items_present += fromto.size; *a_fp -= a_rand * fp; @@ -520,8 +579,8 @@ int move_indices (GPMA from, struct mempool *from_mempool, *a_nbytes -= sizediff; *b_nbytes += sizediff; toku_free(items); - toku_verify_gpma(from); - toku_verify_gpma(to); + //toku_verify_gpma(from); + //toku_verify_gpma(to); return 0; } diff --git a/newbrt/roll.c b/newbrt/roll.c index 282cea12333..73f36f6f459 100644 --- a/newbrt/roll.c +++ b/newbrt/roll.c @@ -12,7 +12,14 @@ #include "cachetable.h" #include "key.h" -int toku_rollback_fcreate (BYTESTRING bs_fname, +int toku_commit_fcreate (TXNID xid __attribute__((__unused__)), + BYTESTRING bs_fname __attribute__((__unused__)), + TOKUTXN txn __attribute__((__unused__))) { + return 0; +} + +int toku_rollback_fcreate (TXNID xid __attribute__((__unused__)), + BYTESTRING bs_fname, TOKUTXN txn __attribute__((__unused__))) { char *fname = fixup_fname(&bs_fname); char *directory = txn->logger->directory; @@ -26,84 +33,62 @@ int toku_rollback_fcreate (BYTESTRING bs_fname, return 0; } -#if 0 -int toku_rollback_fclose (FILENUM filenum, BYTESTRING bs_fname, TOKUTXN txn) { - abort(); - filenum=filenum; - bs_fname=bs_fname; - txn=txn; -#if 0 - char *fixedfname = fixup_fname(&bs_fname); - int fd = open(fixedfname, O_RDWR, 0); - assert(fd>=0); - BRT MALLOC(brt); - assert(errno==0 && brt!=0); - brt->database_name = fixedfname; - brt->h=0; - list_init(&brt->cursors); - brt->compare_fun = 0; - brt->dup_compare = 0; - brt->db = 0; - CACHETABLE cf; - int r = toku_cachetable_openfd(&cf, /*ct*/0, fd, brt); - assert(r==0); - brt->skey = brt->sval = 0; - brt->cf = cf; - toku_recover_note_cachefile(filenum, cf, brt); - - printf("%s:%d Must remember to close the file again after txn %p finishes aborting\n", __FILE__, __LINE__, txn); - - return 0; -#endif -} -#endif - -//int toku_rollback_newbrtnode (struct logtype_newbrtnode *le, TOKUTXN txn) { -// // All that must be done is to put the node on the freelist. -// // Since we don't have a freelist right now, we don't have anything to do. -// // We'll fix this later (See #264) -// le=le; -// txn=txn; -// return 0; -//} - -int toku_rollback_insertatleaf (FILENUM filenum, BYTESTRING key,BYTESTRING data, TOKUTXN txn) { +int toku_commit_cmdinsert (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) { CACHEFILE cf; BRT brt; + //printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data); int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); assert(r==0); DBT key_dbt,data_dbt; - r = toku_brt_delete_both(brt, - toku_fill_dbt(&key_dbt, key.data, key.len), - toku_fill_dbt(&data_dbt, data.data, data.len), - 0); - return r; + BRT_CMD_S brtcmd = { BRT_COMMIT_BOTH, xid, + .u.id={toku_fill_dbt(&key_dbt, key.data, key.len), + toku_fill_dbt(&data_dbt, data.data, data.len)}}; + return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); } -int toku_rollback_deleteatleaf (FILENUM filenum, BYTESTRING key, BYTESTRING data,TOKUTXN txn) { +int toku_rollback_cmdinsert (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) { CACHEFILE cf; BRT brt; int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); assert(r==0); + //printf("%s:%d aborting insert %s %s\n", __FILE__, __LINE__, key.data, data.data); DBT key_dbt,data_dbt; - r = toku_brt_insert(brt, - toku_fill_dbt(&key_dbt, key.data, key.len), - toku_fill_dbt(&data_dbt, data.data, data.len), - 0); // Do the insertion unconditionally - return r; + BRT_CMD_S brtcmd = { BRT_ABORT_BOTH, xid, + .u.id={toku_fill_dbt(&key_dbt, key.data, key.len), + toku_fill_dbt(&data_dbt, data.data, data.len)}}; + return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); } -int toku_rollback_xactiontouchednonleaf(FILENUM filenum, DISKOFFARRAY array __attribute__((__unused__)), DISKOFF diskoff, TOKUTXN txn) { +int toku_commit_cmddeleteboth (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) { + return toku_commit_cmdinsert(xid, filenum, key, data, txn); +} + +int toku_rollback_cmddeleteboth (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) { + return toku_rollback_cmdinsert(xid, filenum, key, data, txn); +} + +int toku_commit_cmddelete (TXNID xid, FILENUM filenum, BYTESTRING key,TOKUTXN txn) { CACHEFILE cf; BRT brt; int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); assert(r==0); - r = toku_brt_nonleaf_expunge_xaction(brt, diskoff, txn->txnid64); - assert(r==0); - //printf("%s:%d node=%lld has Rollback parents = {", __FILE__, __LINE__, (long long)diskoff); - //int i; for (i=0; ilogger->ct, filenum, &cf, &brt); + assert(r==0); + //printf("%s:%d aborting delete %s %s\n", __FILE__, __LINE__, key.data, data.data); + DBT key_dbt,data_dbt; + BRT_CMD_S brtcmd = { BRT_ABORT_ANY, xid, + .u.id={toku_fill_dbt(&key_dbt, key.data, key.len), + toku_init_dbt(&data_dbt)}}; + return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); +} diff --git a/newbrt/test-del-inorder.c b/newbrt/test-del-inorder.c index 5fcaa5d4d6a..093d60ffd7d 100644 --- a/newbrt/test-del-inorder.c +++ b/newbrt/test-del-inorder.c @@ -44,7 +44,7 @@ void doit (void) { assert(r==0); u_int32_t fingerprint=0; - r = toku_testsetup_insert_to_nonleaf(t, nodeb, BRT_DELETE, "hello", 6, 0, 0, &fingerprint); + r = toku_testsetup_insert_to_nonleaf(t, nodeb, BRT_DELETE_ANY, "hello", 6, 0, 0, &fingerprint); assert(r==0); r = toku_testsetup_root(t, nodeb); diff --git a/src/tests/Makefile b/src/tests/Makefile index 9022ff47041..243b3728eac 100644 --- a/src/tests/Makefile +++ b/src/tests/Makefile @@ -51,10 +51,8 @@ TDB_CPPFLAGS = -I../../include SRCS = $(sort $(wildcard *.c)) TDB_TESTS = $(patsubst %.c,%.tdb,$(SRCS)) -BDB_DONTRUN = bug627 +BDB_DONTRUN = bug627 test_abort1 BDB_TESTS = $(patsubst %.c,%.bdb,$(filter-out $(patsubst %,%.c,$(BDB_DONTRUN)),$(SRCS))) -foobdb: - echo $(BDB_TESTS) ALL_TESTS = $(TDB_TESTS) $(BDB_TESTS) diff --git a/src/tests/test_abort1.c b/src/tests/test_abort1.c index c3a33654e8a..4ff3983665c 100644 --- a/src/tests/test_abort1.c +++ b/src/tests/test_abort1.c @@ -96,10 +96,12 @@ void test_db_put_aborts (void) { key.size=4; data.data="now"; data.size=4; - r=db->put(db, tid, &key, &data, 0); + r=db->put(db, tid2, &key, &data, 0); CKERR(r); } + //printf("%s:%d aborting\n", __FILE__, __LINE__); r=tid->abort(tid); assert(r==0); + //printf("%s:%d committing\n", __FILE__, __LINE__); r=tid2->commit(tid2,0); assert(r==0); } // The database should exist @@ -109,6 +111,7 @@ void test_db_put_aborts (void) { assert(r==0); } // But the item should not be in it. + if (1) { DB_TXN *tid; r=env->txn_begin(env, 0, &tid, 0); assert(r==0); @@ -122,6 +125,15 @@ void test_db_put_aborts (void) { assert(r!=0); assert(r==DB_NOTFOUND); } + { + DBT key,data; + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + key.data="bye"; + key.size=4; + r=db->get(db, tid, &key, &data, 0); + CKERR(r); + } r=tid->commit(tid,0); assert(r==0); } diff --git a/src/tests/test_abort2.c b/src/tests/test_abort2.c index 719b1232cd4..bdd3db345de 100644 --- a/src/tests/test_abort2.c +++ b/src/tests/test_abort2.c @@ -49,7 +49,7 @@ void do_test_abort2 (void) { r=db->close(db, 0); CKERR(r); r=env->close(env, 0); CKERR(r); - printf("%s:%d\n", __FILE__, __LINE__); + //printf("%s:%d\n", __FILE__, __LINE__); // Now do a few inserts that abort. r=db_env_create(&env, 0); assert(r==0); @@ -78,17 +78,17 @@ void do_test_abort2 (void) { r=txn->abort(txn); CKERR(r); - printf("%s:%d\n", __FILE__, __LINE__); + //printf("%s:%d\n", __FILE__, __LINE__); //r=db->close(db,0); CKERR(r); r=env->close(env, 0); CKERR(r); return; // Don't do a lookup on "hello7", because that will force things out of the buffer. r=db->close(db, 0); CKERR(r); - printf("%s:%d\n", __FILE__, __LINE__); + //printf("%s:%d\n", __FILE__, __LINE__); r=db_create(&db, env, 0); CKERR(r); r=env->txn_begin(env, 0, &txn, 0); assert(r==0); r=db->open(db, txn, "foo.db", 0, DB_BTREE, 0, 0777); CKERR(r); - r=txn->abort(txn); CKERR(r); - printf("%s:%d\n", __FILE__, __LINE__); + r=txn->commit(txn, 0); CKERR(r); + //printf("%s:%d\n", __FILE__, __LINE__); r=env->txn_begin(env, 0, &txn, 0); assert(r==0); { @@ -96,7 +96,7 @@ void do_test_abort2 (void) { memset(&data, 0, sizeof(data)); r = db->get(db, txn, dbt_init(&key, "hello7", strlen("hello7")+1), &data, 0); CKERR(r); - printf("data is %s\n", (char*)data.data); + //printf("data is %s\n", (char*)data.data); assert(((char*)data.data)[0]=='0'); } r=txn->abort(txn); CKERR(r); diff --git a/src/tests/test_dup_delete.c b/src/tests/test_dup_delete.c index 1feaef1fa21..0fc524462f5 100644 --- a/src/tests/test_dup_delete.c +++ b/src/tests/test_dup_delete.c @@ -106,7 +106,7 @@ void test_dup_delete(int n, int dup_mode) { int k = htonl(n/2); DBT key, val; r = db->get(db, null_txn, dbt_init(&key, &k, sizeof k), dbt_init_malloc(&val), 0); - assert(r != 0); + assert(r == DB_NOTFOUND); } /* verify all dups are removed using a cursor */ diff --git a/src/tests/test_log6a_abort.c b/src/tests/test_log6a_abort.c index 71017e32934..2f2ca8a3fc1 100644 --- a/src/tests/test_log6a_abort.c +++ b/src/tests/test_log6a_abort.c @@ -178,6 +178,7 @@ static void verify_items (DB_ENV *env, DB *db) { snprintf(hello, sizeof(hello), "hello%d.%d", kv, dv); snprintf(there, sizeof(hello), "there%d", dv); k2.data = hello; k2.size=strlen(hello)+1; + printf("kv=%d dv=%d\n", kv, dv); r=db->get(db, txn, &k2, &v2, 0); assert(r==0); assert(strcmp(v2.data, there)==0); diff --git a/src/ydb.c b/src/ydb.c index f139bfbbc97..284b9183bf1 100644 --- a/src/ydb.c +++ b/src/ydb.c @@ -2663,6 +2663,9 @@ char *db_strerror(int error) { if (error==DB_BADFORMAT) { return "Database Bad Format (probably a corrupted database)"; } + if (error==DB_NOTFOUND) { + return "Not found"; + } static char unknown_result[100]; // Race condition if two threads call this at the same time. However even in a bad case, it should be some sort of null-terminated string. errorstr = unknown_result;