From 538a507a3057651fdf426e7f5dab3a0e74a46172 Mon Sep 17 00:00:00 2001 From: "Bradley C. Kuszmaul" Date: Wed, 19 Mar 2008 19:23:45 +0000 Subject: [PATCH] Make rollback do the right thing in some cases for internal nodes. Addresses #556. git-svn-id: file:///svn/tokudb@2955 c7de825b-a66e-492c-adef-691d508d4ae1 --- newbrt/Makefile | 3 +- newbrt/brt-internal.h | 2 + newbrt/brt.c | 162 ++++++++++++++++++++++++++++------------ newbrt/brt.h | 3 + newbrt/brttypes.h | 5 ++ newbrt/fifo-test-exp.c | 71 ++++++++++++++++++ newbrt/fifo.c | 21 +++++- newbrt/fifo.h | 2 + newbrt/log.h | 3 + newbrt/logformat.c | 4 + newbrt/roll.c | 10 +++ src/tests/test_abort2.c | 2 +- 12 files changed, 234 insertions(+), 54 deletions(-) create mode 100644 newbrt/fifo-test-exp.c diff --git a/newbrt/Makefile b/newbrt/Makefile index fc18cd589c3..76050af173d 100644 --- a/newbrt/Makefile +++ b/newbrt/Makefile @@ -52,6 +52,7 @@ REGRESSION_TESTS = \ cachetable-test \ cachetable-test2 \ fifo-test \ + fifo-test-exp \ test-brt-delete-both \ brt-test \ brt-test3 \ @@ -168,7 +169,7 @@ brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h fifo.o: fifo.h brttypes.h memory.o: memory.h primes.o: primes.h toku_assert.h -fifo-test: fifo.o memory.o toku_assert.o ybt.o +fifo-test-exp fifo-test: fifo.o memory.o toku_assert.o ybt.o brt-serialize.o: $(BRT_INTERNAL_H_INCLUDES) key.h wbuf.h rbuf.h brt-bigtest: memory.o ybt.o brt.o pma.o cachetable.o key.o fifo.o brt-serialize.o brt-bigtest.o: brt.h ../include/db.h diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index f0aa927eecc..1cb028ee36f 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -191,4 +191,6 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, DISKOFF diskoff, enum brt_cmd_typ int toku_set_func_fsync (int (*fsync_function)(int)); + + #endif diff --git a/newbrt/brt.c b/newbrt/brt.c index 7539cb87ceb..f3ce73323d2 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -492,13 +492,14 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, 
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *split, int debug, - TOKULOGGER); + TOKULOGGER, DISKOFFARRAY path_to_parent); /* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRTNODE node, BRTNODE child, BRT_CMD cmd, int childnum_of_node, - TOKULOGGER logger) { + TOKULOGGER logger, + DISKOFFARRAY path_to_parent) { assert(node->height>0); /* Not a leaf. */ DBT *k = cmd->u.id.key; DBT *v = cmd->u.id.val; @@ -533,7 +534,8 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT r = brtnode_put_cmd(t, child, cmd, &again_split, &againa, &againb, &againk, 0, - logger); + logger, + path_to_parent); if (r!=0) return r; assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */ } else { @@ -547,7 +549,8 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum BRT_CMD cmd, int *child_did_split, BRTNODE *childa, BRTNODE *childb, DBT *childsplitk, - TOKULOGGER logger) { + TOKULOGGER logger, + DISKOFFARRAY path_to_parent) { //if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, ""); //printf("%s:%d hello!\n", __FILE__, __LINE__); assert(node->height>0); @@ -555,7 +558,8 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum int r = brtnode_put_cmd(t, child, cmd, child_did_split, childa, childb, childsplitk, 0, - logger); + logger, + path_to_parent); if (r!=0) return r; } @@ -593,7 +597,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum return 0; } -static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger); +static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY 
path_to_parent); static int split_count=0; @@ -609,7 +613,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, DBT *childsplitk, /* the data in the childsplitk is alloc'd and is consumed by this call. */ int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - TOKULOGGER logger) { + TOKULOGGER logger, + DISKOFFARRAY path_to_parent) { assert(node->height>0); assert(0 <= childnum && childnum < node->u.n.n_children); FIFO old_h = BNC_BUFFER(node,childnum); @@ -718,7 +723,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (pusha) { // If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order. if (toku_fifo_n_entries(BNC_BUFFER(node,childnum))==0) { - r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger); + r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger, path_to_parent); } else { r=insert_to_buffer_in_nonleaf(node, childnum, &skd, &svd, type, xid); } @@ -726,7 +731,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (pushb) { // If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order. if (toku_fifo_n_entries(BNC_BUFFER(node,childnum+1))==0) { - r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger); + r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger, path_to_parent); } else { r=insert_to_buffer_in_nonleaf(node, childnum+1, &skd, &svd, type, xid); } @@ -784,7 +789,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, if (toku_serialize_brtnode_size(node) > node->nodesize) { /* lighten the node by pushing down its buffers. 
this may cause the current node to split and go away */ - r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger); + r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger, path_to_parent); assert(r == 0); } if (*did_split == 0) assert(toku_serialize_brtnode_size(node)<=node->nodesize); @@ -796,7 +801,8 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, - TOKULOGGER logger) { + TOKULOGGER logger, + DISKOFFARRAY path_to_parent) { void *childnode_v; BRTNODE child; int r; @@ -844,7 +850,8 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, &brtcmd, &child_did_split, &childa, &childb, &childsplitk, - logger); + logger, + path_to_parent); if (0){ unsigned int sum=0; @@ -862,7 +869,8 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, r=handle_split_of_child (t, node, childnum, childa, childb, &childsplitk, did_split, nodea, nodeb, splitk, - logger); + logger, + path_to_parent); //if (*did_split) { // verify_local_fingerprint_nonleaf(*nodea); // verify_local_fingerprint_nonleaf(*nodeb); @@ -885,7 +893,7 @@ static int debugp1 (int debug) { return debug ? debug+1 : 0; } -static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger) +static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) /* If the buffer is too full, then push down. Possibly the child will split. That may make us split. 
*/ { assert(node->height>0); @@ -901,7 +909,7 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE find_heaviest_child(node, &childnum); if (0) printf("%s:%d %*spush some down from %lld into %lld (child %d)\n", __FILE__, __LINE__, debug, "", node->thisnodename, BNC_DISKOFF(node, childnum), childnum); assert(BNC_DISKOFF(node, childnum)!=0); - int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger); + int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); if (r!=0) return r; assert(*did_split==0 || *did_split==1); if (debug) printf("%s:%d %*sdid push_some_brt_cmds_down did_split=%d\n", __FILE__, __LINE__, debug, "", *did_split); @@ -1020,7 +1028,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) { /* put a cmd into a nodes child */ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, TOKULOGGER logger, int childnum, int maybe) { + int debug, TOKULOGGER logger, int childnum, int maybe, + DISKOFFARRAY path_to_parent) { int r; void *child_v; BRTNODE child; @@ -1042,7 +1051,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, child_did_split = 0; r = brtnode_put_cmd(t, child, cmd, - &child_did_split, &childa, &childb, &childsplitk, debug, logger); + &child_did_split, &childa, &childb, &childsplitk, debug, logger, + path_to_parent); if (r != 0) { /* putting to the child failed for some reason, so unpin the child and return the error code */ int rr = toku_unpin_brtnode(t, child); @@ -1055,7 +1065,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, r = handle_split_of_child(t, node, childnum, childa, childb, &childsplitk, did_split, nodea, nodeb, splitk, - logger); + logger, + path_to_parent); assert(r == 0); } else { 
//verify_local_fingerprint_nonleaf(child); @@ -1071,12 +1082,13 @@ int toku_brt_do_push_cmd = 1; /* put a cmd into a node at childnum */ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down) { + int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down, + DISKOFFARRAY path_to_parent) { //verify_local_fingerprint_nonleaf(node); /* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */ if (BNC_NBYTESINBUF(node, childnum) == 0 && can_push && toku_brt_do_push_cmd) { - int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1); + int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, path_to_parent); if (r == 0) return r; } @@ -1094,6 +1106,15 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, u_int32_t newfingerprint = node->local_fingerprint + node->rand4fingerprint * toku_calccrc32_cmd(type, cmd->xid, k->data, k->size, v->data, v->size); int r=toku_log_brtenq(logger, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, cmd->xid, type, keybs, databs, node->local_fingerprint, newfingerprint); assert(r==0); + { + TOKUTXN txn; + if (0==toku_txnid2txn(logger,cmd->xid,&txn) && txn) { + DISKOFFARRAY path = path_to_parent; + path.array = toku_memdup(path.array, sizeof(path.array[0])*(1+path.len)); + r=toku_logger_save_rollback_xactiontouchednonleaf(txn, toku_cachefile_filenum(t->cf), path, node->thisnodename); + if (r!=0) return r; + } + } r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xid); assert(r==0); node->local_fingerprint = newfingerprint; @@ -1107,7 +1128,7 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, static int 
brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, - int debug, TOKULOGGER logger) { + int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { //verify_local_fingerprint_nonleaf(node); unsigned int childnum; int r; @@ -1117,14 +1138,14 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, /* put the cmd in the subtree */ int do_push_down = 0; - r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down); + r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down, path_to_parent); if (r != 0) return r; /* maybe push down */ if (do_push_down) { if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, ""); //verify_local_fingerprint_nonleaf(node); - r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger); + r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); if (r!=0) return r; if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (*did_split) { @@ -1154,7 +1175,8 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, - TOKULOGGER logger) { + TOKULOGGER logger, + DISKOFFARRAY path_to_parent) { int r; /* find all children that need a delete cmd */ @@ -1186,7 +1208,7 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, /* issue the delete cmd to all of the children found previously */ int do_push_down = 0; for (i=0; itype == BRT_INSERT || cmd->type == BRT_DELETE_BOTH) { - return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger); + return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, 
splitk, debug, logger, path_to_parent); } else if (cmd->type == BRT_DELETE) { - return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger); + return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent); } else return EINVAL; } @@ -1248,7 +1271,8 @@ static void verify_local_fingerprint_nonleaf (BRTNODE node) { static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, - TOKULOGGER logger) { + TOKULOGGER logger, + DISKOFFARRAY path_to_parent) { //static int counter=0; // FOO //static int oldcounter=0; //int tmpcounter; @@ -1263,7 +1287,7 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, } else { r = brt_nonleaf_put_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, - debug, logger); + debug, logger, path_to_parent); } //oldcounter=tmpcounter; // Watch out. If did_split then the original node is no longer allocated. @@ -1702,7 +1726,7 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, return 0; } -static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) { +static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { void *node_v; BRTNODE node; CACHEKEY *rootp; @@ -1729,7 +1753,8 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) { result = brtnode_put_cmd(brt, node, cmd, &did_split, &nodea, &nodeb, &splitk, debug, - logger); + logger, + path_to_parent); if (debug) printf("%s:%d did_insert\n", __FILE__, __LINE__); if (did_split) { // node is unpinned, so now we have to proceed to update the root with a new node. 
@@ -1752,11 +1777,14 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) { return result; } +#define MAX_PATHLEN_TO_ROOT 40 + int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) { int r; BRT_CMD_S brtcmd = { BRT_INSERT, toku_txn_get_txnid(txn), .u.id={key,val}}; - - r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); + DISKOFF path[MAX_PATHLEN_TO_ROOT]; + DISKOFFARRAY path_to_parent = {0, path}; + r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent); return r; } @@ -1779,14 +1807,18 @@ int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) { int r; DBT val; BRT_CMD_S brtcmd = { BRT_DELETE, toku_txn_get_txnid(txn), .u.id={key, toku_init_dbt(&val)}}; - r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); + DISKOFF path[MAX_PATHLEN_TO_ROOT]; + DISKOFFARRAY path_to_parent = {0, path}; + r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent); return r; } int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) { int r; BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, toku_txn_get_txnid(txn), .u.id={key,val}}; - r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn)); + DISKOFF path[MAX_PATHLEN_TO_ROOT]; + DISKOFFARRAY path_to_parent = {0, path}; + r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent); return r; } @@ -1924,15 +1956,15 @@ static inline void brt_split_init(BRT_SPLIT *split) { toku_init_dbt(&split->splitk); } -static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger); +static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent); /* search in a node's child */ -static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) { +static int brt_search_child(BRT brt, BRTNODE node, int childnum, 
brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { int r, rr; /* if the child's buffer is not empty then try to empty it */ if (BNC_NBYTESINBUF(node, childnum) > 0) { - rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger); + rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger, path_to_parent); assert(rr == 0); /* push down may cause a child split, so childnum may not be appropriate, and the node itself may split, so retry */ return EAGAIN; @@ -1945,11 +1977,11 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s for (;;) { BRTNODE childnode = node_v; BRT_SPLIT childsplit; brt_split_init(&childsplit); - r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger); + r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger, path_to_parent); if (childsplit.did_split) { rr = handle_split_of_child(brt, node, childnum, childsplit.nodea, childsplit.nodeb, &childsplit.splitk, - &split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger); + &split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger, path_to_parent); assert(rr == 0); break; } else { @@ -1964,7 +1996,7 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s return r; } -static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) { +static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { int r = DB_NOTFOUND; int c; @@ -1982,7 +2014,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, if (search->compare(search, toku_fill_dbt(&pivotkey, 
kv_pair_key(pivot), kv_pair_keylen(pivot)), brt->flags & TOKU_DB_DUPSORT ? toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot)): 0)) { - r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger); + r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent); if (r == 0 || r == EAGAIN) break; } @@ -1990,7 +2022,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, /* check the first (left) or last (right) node if nothing has been found */ if (r == DB_NOTFOUND && c == node->u.n.n_children-1) - r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger); + r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent); return r; } @@ -2001,9 +2033,9 @@ static int brt_search_leaf_node(BRTNODE node, brt_search_t *search, DBT *newkey, return r; } -static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) { +static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { if (node->height > 0) - return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger); + return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger, path_to_parent); else return brt_search_leaf_node(node, search, newkey, newval); } @@ -2025,7 +2057,9 @@ int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval, TOK for (;;) { BRT_SPLIT split; brt_split_init(&split); - r = brt_search_node(brt, node, search, newkey, newval, &split, logger); + DISKOFF path[MAX_PATHLEN_TO_ROOT]; + DISKOFFARRAY path_to_parent = {0, path}; + r = brt_search_node(brt, node, search, newkey, newval, &split, logger, path_to_parent); if (split.did_split) { rr = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node); @@ 
-2412,3 +2446,35 @@ int toku_brt_height_of_root(BRT brt, int *height) { r = toku_unpin_brt_header(brt); assert(r==0); return 0; } + +struct callpair { + BRTNODE node; + int childnum; +}; +static int note_removal (bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void*cpairv) { + struct callpair *cpair = cpairv; + BRTNODE node = cpair->node; + int childnum = cpair->childnum; + u_int32_t old_fingerprint = node->local_fingerprint; + node->local_fingerprint = old_fingerprint = node->rand4fingerprint*toku_calccrc32_cmd(type, xid, key, keylen, data, datalen); + u_int32_t countdiff = keylen+datalen+KEY_VALUE_OVERHEAD+BRT_CMD_OVERHEAD; + BNC_NBYTESINBUF(node,childnum) -= countdiff; + node->u.n.n_bytes_in_buffers -= countdiff; + return 0; +} + +int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid) { + void *node_v; + int r = toku_cachetable_get_and_pin(brt->cf, diskoff, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt); + if (r!=0) return r; + BRTNODE node = node_v; + int i; + r=0; + for (i=0; i<node->u.n.n_children; i++) { + struct callpair pair = { node, i }; + int r3 = toku_fifo_expunge_xaction(BNC_BUFFER(node, i), xid, note_removal, &pair); + if (r==0) r=r3; + } + int r2 = toku_cachetable_unpin(brt->cf, diskoff, 1, toku_serialize_brtnode_size(node)); + return r ? r : r2; +} diff --git a/newbrt/brt.h b/newbrt/brt.h index b6c8009899b..94997e51087 100644 --- a/newbrt/brt.h +++ b/newbrt/brt.h @@ -66,4 +66,7 @@ int toku_brt_get_fd(BRT, int *); int toku_brt_height_of_root(BRT, int *height); // for an open brt, return the current height. 
+// Special hack for recovery +int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid); + #endif diff --git a/newbrt/brttypes.h b/newbrt/brttypes.h index c112775cf4e..13694803802 100644 --- a/newbrt/brttypes.h +++ b/newbrt/brttypes.h @@ -27,6 +27,11 @@ typedef struct { char *data; } BYTESTRING; +typedef struct { + int len; + DISKOFF *array; +} DISKOFFARRAY; + /* Make the LSN be a struct instead of an integer so that we get better type checking. */ typedef struct __toku_lsn { u_int64_t lsn; } LSN; #define ZERO_LSN ((LSN){0}) diff --git a/newbrt/fifo-test-exp.c b/newbrt/fifo-test-exp.c new file mode 100644 index 00000000000..6eb591765bc --- /dev/null +++ b/newbrt/fifo-test-exp.c @@ -0,0 +1,71 @@ +/* Test the expunge method. */ + +#include +#include +#include +#include "fifo.h" +#include "memory.h" + +int count; +int callback (bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void *v) { + TXNID which=(long)v; + assert(xid==which); + int actual_row = count; + assert(strlen(key)+1==keylen); + assert(strlen(data)+1==datalen); + //printf("count=%d which=%ld deleting %s %s\n", count, (long)which, (char*)key, (char*)data); + switch (which) { + case 23: break; + case 24: actual_row++; break; + case 26: actual_row+=3; + } + switch (actual_row) { + case 0: assert(strcmp(key, "hello")==0); assert(strcmp(data, "thera")==0); assert(xid==23); assert(type==0); break; + case 1: assert(strcmp(key, "hello")==0); assert(strcmp(data, "therb")==0); assert(xid==24); assert(type==0); break; + case 2: assert(strcmp(key, "hell1")==0); assert(strcmp(data, "therc")==0); assert(xid==24); assert(type==1); break; + case 3: assert(strcmp(key, "hell1")==0); assert(strcmp(data, "therd")==0); assert(xid==26); assert(type==1); break; + default: assert(0); + } + count++; + return 0; +} + +void doit (int which) { + int r; + FIFO f; + r = toku_fifo_create(&f); assert(r==0); + r = toku_fifo_enq(f, "hello", 6, "thera", 6, 0, 23); assert(r==0); + r = 
toku_fifo_enq(f, "hello", 6, "therb", 6, 0, 24); assert(r==0); + r = toku_fifo_enq(f, "hell1", 6, "therc", 6, 1, 24); assert(r==0); + r = toku_fifo_enq(f, "hell1", 6, "therd", 6, 1, 26); assert(r==0); + int i=0; + FIFO_ITERATE(f, k, kl, d, dl, t, x, + ({ + assert(strlen(k)+1==kl); + assert(strlen(d)+1==dl); + switch(i) { + case 0: assert(strcmp(k, "hello")==0); assert(strcmp(d, "thera")==0); assert(x==23); assert(t==0); i++; break; + case 1: assert(strcmp(k, "hello")==0); assert(strcmp(d, "therb")==0); assert(x==24); assert(t==0); i++; break; + case 2: assert(strcmp(k, "hell1")==0); assert(strcmp(d, "therc")==0); assert(x==24); assert(t==1); i++; break; + case 3: assert(strcmp(k, "hell1")==0); assert(strcmp(d, "therd")==0); assert(x==26); assert(t==1); i++; break; + default: assert(0); + } + })); + count=0; + r = toku_fifo_expunge_xaction(f, which, callback, (void*)(long)which); + switch (which) { + case 23: assert(count==1); break; + case 24: assert(count==2); break; + case 26: assert(count==1); break; + } + toku_fifo_free(&f); + toku_malloc_cleanup(); +} + +int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { + doit(23); + doit(24); + doit(26); + doit(27); + return 0; +} diff --git a/newbrt/fifo.c b/newbrt/fifo.c index 14f1ee82ab4..0919631da7f 100644 --- a/newbrt/fifo.c +++ b/newbrt/fifo.c @@ -140,7 +140,20 @@ void toku_fifo_iterate (FIFO fifo, void(*f)(bytevec key,ITEMLEN keylen,bytevec d f(entry->key, entry->keylen, entry->key + entry->keylen, entry->vallen, entry->type, entry->xid, arg); } - - - - +int toku_fifo_expunge_xaction(FIFO fifo, TXNID xid, int (*callback_on_delete)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void*), void*arg) { + struct fifo_entry **prev=&fifo->head; + struct fifo_entry *entry; + while ((entry=*prev)) { + if (entry->xid==xid) { + // Must remove it. 
+ int r = callback_on_delete(entry->key, entry->keylen, entry->key+entry->keylen, entry->vallen, entry->type, entry->xid, arg); + fifo->n--; + *prev=entry->next; + toku_free_n(entry, fifo_entry_size(entry)); + if (r!=0) return r; + } else { + prev = &entry->next; + } + } + return 0; +} diff --git a/newbrt/fifo.h b/newbrt/fifo.h index 7c4057a5670..2f3fb13eb65 100644 --- a/newbrt/fifo.h +++ b/newbrt/fifo.h @@ -44,4 +44,6 @@ void toku_fifo_iterate (FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,I } \ }) +int toku_fifo_expunge_xaction(FIFO fifo, TXNID xid, int (*callback_on_delete)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void*), void*arg); + #endif diff --git a/newbrt/log.h b/newbrt/log.h index 71b712b637b..07f2621a75e 100644 --- a/newbrt/log.h +++ b/newbrt/log.h @@ -112,6 +112,9 @@ static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) { static inline void toku_free_BYTESTRING(BYTESTRING val) { toku_free(val.data); } +static inline void toku_free_DISKOFFARRAY(DISKOFFARRAY val) { + toku_free(val.array); +} static inline int toku_copy_LOGGEDBRTHEADER(LOGGEDBRTHEADER *target, LOGGEDBRTHEADER val) { *target = val; diff --git a/newbrt/logformat.c b/newbrt/logformat.c index 01d5fa111d7..93425b63894 100644 --- a/newbrt/logformat.c +++ b/newbrt/logformat.c @@ -49,6 +49,10 @@ const struct logtype rollbacks[] = { {"BYTESTRING", "key", 0}, {"BYTESTRING", "data", 0}, NULLFIELD}}, + {"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0}, + {"DISKOFFARRAY", "parents", 0}, + {"DISKOFF", "diskoff", 0}, + NULLFIELD}}, {0,0,FA{NULLFIELD}} }; diff --git a/newbrt/roll.c b/newbrt/roll.c index 234c3a38772..edc3dcaa1a3 100644 --- a/newbrt/roll.c +++ b/newbrt/roll.c @@ -62,3 +62,13 @@ int toku_rollback_deleteatleaf (FILENUM filenum, BYTESTRING key, BYTESTRING data 0); // Do the insertion unconditionally return r; } + +int toku_rollback_xactiontouchednonleaf(FILENUM filenum, DISKOFFARRAY array 
__attribute__((__unused__)), DISKOFF diskoff, TOKUTXN txn) { + CACHEFILE cf; + BRT brt; + int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); + assert(r==0); + r = toku_brt_nonleaf_expunge_xaction(brt, diskoff, txn->txnid64); + assert(r==0); + return 0; +} diff --git a/src/tests/test_abort2.c b/src/tests/test_abort2.c index 8f156c2225b..cfe501c628a 100644 --- a/src/tests/test_abort2.c +++ b/src/tests/test_abort2.c @@ -77,7 +77,7 @@ void do_test_abort2 (void) { insert(7, 1); r=txn->abort(txn); CKERR(r); - // Don't do a query on "hello7", because that will force things out of the buffer. + // Don't do a lookup on "hello7", because that will force things out of the buffer. r=env->txn_begin(env, 0, &txn, 0); assert(r==0); {