Make rollback do the right thing in some cases for internal nodes. Addresses #556.

git-svn-id: file:///svn/tokudb@2955 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
Bradley C. Kuszmaul 2008-03-19 19:23:45 +00:00
parent 6153661e2c
commit 538a507a30
12 changed files with 234 additions and 54 deletions

View file

@ -52,6 +52,7 @@ REGRESSION_TESTS = \
cachetable-test \
cachetable-test2 \
fifo-test \
fifo-test-exp \
test-brt-delete-both \
brt-test \
brt-test3 \
@ -168,7 +169,7 @@ brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h
fifo.o: fifo.h brttypes.h
memory.o: memory.h
primes.o: primes.h toku_assert.h
fifo-test: fifo.o memory.o toku_assert.o ybt.o
fifo-test-exp fifo-test: fifo.o memory.o toku_assert.o ybt.o
brt-serialize.o: $(BRT_INTERNAL_H_INCLUDES) key.h wbuf.h rbuf.h
brt-bigtest: memory.o ybt.o brt.o pma.o cachetable.o key.o fifo.o brt-serialize.o
brt-bigtest.o: brt.h ../include/db.h

View file

@ -191,4 +191,6 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, DISKOFF diskoff, enum brt_cmd_typ
int toku_set_func_fsync (int (*fsync_function)(int));
#endif

View file

@ -492,13 +492,14 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *split,
int debug,
TOKULOGGER);
TOKULOGGER, DISKOFFARRAY path_to_parent);
/* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */
static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRTNODE node, BRTNODE child,
BRT_CMD cmd,
int childnum_of_node,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
assert(node->height>0); /* Not a leaf. */
DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val;
@ -533,7 +534,8 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT
r = brtnode_put_cmd(t, child, cmd,
&again_split, &againa, &againb, &againk,
0,
logger);
logger,
path_to_parent);
if (r!=0) return r;
assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */
} else {
@ -547,7 +549,8 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
BRT_CMD cmd,
int *child_did_split, BRTNODE *childa, BRTNODE *childb,
DBT *childsplitk,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
//if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, "");
//printf("%s:%d hello!\n", __FILE__, __LINE__);
assert(node->height>0);
@ -555,7 +558,8 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
int r = brtnode_put_cmd(t, child, cmd,
child_did_split, childa, childb, childsplitk,
0,
logger);
logger,
path_to_parent);
if (r!=0) return r;
}
@ -593,7 +597,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
return 0;
}
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger);
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent);
static int split_count=0;
@ -609,7 +613,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
DBT *childsplitk, /* the data in the childsplitk is alloc'd and is consumed by this call. */
int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *splitk,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
assert(node->height>0);
assert(0 <= childnum && childnum < node->u.n.n_children);
FIFO old_h = BNC_BUFFER(node,childnum);
@ -718,7 +723,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (pusha) {
// If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order.
if (toku_fifo_n_entries(BNC_BUFFER(node,childnum))==0) {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger);
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger, path_to_parent);
} else {
r=insert_to_buffer_in_nonleaf(node, childnum, &skd, &svd, type, xid);
}
@ -726,7 +731,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (pushb) {
// If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order.
if (toku_fifo_n_entries(BNC_BUFFER(node,childnum+1))==0) {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger);
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger, path_to_parent);
} else {
r=insert_to_buffer_in_nonleaf(node, childnum+1, &skd, &svd, type, xid);
}
@ -784,7 +789,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (toku_serialize_brtnode_size(node) > node->nodesize) {
/* lighten the node by pushing down its buffers. this may cause
the current node to split and go away */
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger, path_to_parent);
assert(r == 0);
}
if (*did_split == 0) assert(toku_serialize_brtnode_size(node)<=node->nodesize);
@ -796,7 +801,8 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *splitk,
int debug,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
void *childnode_v;
BRTNODE child;
int r;
@ -844,7 +850,8 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
&brtcmd,
&child_did_split, &childa, &childb,
&childsplitk,
logger);
logger,
path_to_parent);
if (0){
unsigned int sum=0;
@ -862,7 +869,8 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
r=handle_split_of_child (t, node, childnum,
childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk,
logger);
logger,
path_to_parent);
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
@ -885,7 +893,7 @@ static int debugp1 (int debug) {
return debug ? debug+1 : 0;
}
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger)
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent)
/* If the buffer is too full, then push down. Possibly the child will split. That may make us split. */
{
assert(node->height>0);
@ -901,7 +909,7 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
find_heaviest_child(node, &childnum);
if (0) printf("%s:%d %*spush some down from %lld into %lld (child %d)\n", __FILE__, __LINE__, debug, "", node->thisnodename, BNC_DISKOFF(node, childnum), childnum);
assert(BNC_DISKOFF(node, childnum)!=0);
int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger);
int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent);
if (r!=0) return r;
assert(*did_split==0 || *did_split==1);
if (debug) printf("%s:%d %*sdid push_some_brt_cmds_down did_split=%d\n", __FILE__, __LINE__, debug, "", *did_split);
@ -1020,7 +1028,8 @@ unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) {
/* put a cmd into a nodes child */
static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, TOKULOGGER logger, int childnum, int maybe) {
int debug, TOKULOGGER logger, int childnum, int maybe,
DISKOFFARRAY path_to_parent) {
int r;
void *child_v;
BRTNODE child;
@ -1042,7 +1051,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd,
child_did_split = 0;
r = brtnode_put_cmd(t, child, cmd,
&child_did_split, &childa, &childb, &childsplitk, debug, logger);
&child_did_split, &childa, &childb, &childsplitk, debug, logger,
path_to_parent);
if (r != 0) {
/* putting to the child failed for some reason, so unpin the child and return the error code */
int rr = toku_unpin_brtnode(t, child);
@ -1055,7 +1065,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd,
r = handle_split_of_child(t, node, childnum,
childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk,
logger);
logger,
path_to_parent);
assert(r == 0);
} else {
//verify_local_fingerprint_nonleaf(child);
@ -1071,12 +1082,13 @@ int toku_brt_do_push_cmd = 1;
/* put a cmd into a node at childnum */
static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down) {
int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down,
DISKOFFARRAY path_to_parent) {
//verify_local_fingerprint_nonleaf(node);
/* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */
if (BNC_NBYTESINBUF(node, childnum) == 0 && can_push && toku_brt_do_push_cmd) {
int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1);
int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, path_to_parent);
if (r == 0)
return r;
}
@ -1094,6 +1106,15 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd,
u_int32_t newfingerprint = node->local_fingerprint + node->rand4fingerprint * toku_calccrc32_cmd(type, cmd->xid, k->data, k->size, v->data, v->size);
int r=toku_log_brtenq(logger, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, cmd->xid, type, keybs, databs, node->local_fingerprint, newfingerprint);
assert(r==0);
{
TOKUTXN txn;
if (0==toku_txnid2txn(logger,cmd->xid,&txn) && txn) {
DISKOFFARRAY path = path_to_parent;
path.array = toku_memdup(path.array, sizeof(path.array[0])*(1+path.len));
r=toku_logger_save_rollback_xactiontouchednonleaf(txn, toku_cachefile_filenum(t->cf), path, node->thisnodename);
if (r!=0) return r;
}
}
r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xid);
assert(r==0);
node->local_fingerprint = newfingerprint;
@ -1107,7 +1128,7 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd,
static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, TOKULOGGER logger) {
int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) {
//verify_local_fingerprint_nonleaf(node);
unsigned int childnum;
int r;
@ -1117,14 +1138,14 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
/* put the cmd in the subtree */
int do_push_down = 0;
r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down);
r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down, path_to_parent);
if (r != 0) return r;
/* maybe push down */
if (do_push_down) {
if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
//verify_local_fingerprint_nonleaf(node);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent);
if (r!=0) return r;
if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
if (*did_split) {
@ -1154,7 +1175,8 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
int r;
/* find all children that need a delete cmd */
@ -1186,7 +1208,7 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
/* issue the delete cmd to all of the children found previously */
int do_push_down = 0;
for (i=0; i<delidx; i++) {
r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, delchild[i], delidx == 1, &do_push_down);
r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, delchild[i], delidx == 1, &do_push_down, path_to_parent);
assert(r == 0);
}
@ -1194,7 +1216,7 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
/* maybe push down */
if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
//verify_local_fingerprint_nonleaf(node);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent);
if (r!=0) return r;
if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
if (*did_split) {
@ -1223,11 +1245,12 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *splitk,
int debug,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
if (cmd->type == BRT_INSERT || cmd->type == BRT_DELETE_BOTH) {
return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger);
return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent);
} else if (cmd->type == BRT_DELETE) {
return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger);
return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent);
} else
return EINVAL;
}
@ -1248,7 +1271,8 @@ static void verify_local_fingerprint_nonleaf (BRTNODE node) {
static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug,
TOKULOGGER logger) {
TOKULOGGER logger,
DISKOFFARRAY path_to_parent) {
//static int counter=0; // FOO
//static int oldcounter=0;
//int tmpcounter;
@ -1263,7 +1287,7 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
} else {
r = brt_nonleaf_put_cmd(t, node, cmd,
did_split, nodea, nodeb, splitk,
debug, logger);
debug, logger, path_to_parent);
}
//oldcounter=tmpcounter;
// Watch out. If did_split then the original node is no longer allocated.
@ -1702,7 +1726,7 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk,
return 0;
}
static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRAY path_to_parent) {
void *node_v;
BRTNODE node;
CACHEKEY *rootp;
@ -1729,7 +1753,8 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
result = brtnode_put_cmd(brt, node, cmd,
&did_split, &nodea, &nodeb, &splitk,
debug,
logger);
logger,
path_to_parent);
if (debug) printf("%s:%d did_insert\n", __FILE__, __LINE__);
if (did_split) {
// node is unpinned, so now we have to proceed to update the root with a new node.
@ -1752,11 +1777,14 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
return result;
}
#define MAX_PATHLEN_TO_ROOT 40
int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
int r;
BRT_CMD_S brtcmd = { BRT_INSERT, toku_txn_get_txnid(txn), .u.id={key,val}};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
DISKOFF path[MAX_PATHLEN_TO_ROOT];
DISKOFFARRAY path_to_parent = {0, path};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent);
return r;
}
@ -1779,14 +1807,18 @@ int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) {
int r;
DBT val;
BRT_CMD_S brtcmd = { BRT_DELETE, toku_txn_get_txnid(txn), .u.id={key, toku_init_dbt(&val)}};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
DISKOFF path[MAX_PATHLEN_TO_ROOT];
DISKOFFARRAY path_to_parent = {0, path};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent);
return r;
}
int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
int r;
BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, toku_txn_get_txnid(txn), .u.id={key,val}};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
DISKOFF path[MAX_PATHLEN_TO_ROOT];
DISKOFFARRAY path_to_parent = {0, path};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent);
return r;
}
@ -1924,15 +1956,15 @@ static inline void brt_split_init(BRT_SPLIT *split) {
toku_init_dbt(&split->splitk);
}
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger);
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent);
/* search in a node's child */
static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) {
static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) {
int r, rr;
/* if the child's buffer is not empty then try to empty it */
if (BNC_NBYTESINBUF(node, childnum) > 0) {
rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger);
rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger, path_to_parent);
assert(rr == 0);
/* push down may cause a child split, so childnum may not be appropriate, and the node itself may split, so retry */
return EAGAIN;
@ -1945,11 +1977,11 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s
for (;;) {
BRTNODE childnode = node_v;
BRT_SPLIT childsplit; brt_split_init(&childsplit);
r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger);
r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger, path_to_parent);
if (childsplit.did_split) {
rr = handle_split_of_child(brt, node, childnum, childsplit.nodea, childsplit.nodeb, &childsplit.splitk,
&split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger);
&split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger, path_to_parent);
assert(rr == 0);
break;
} else {
@ -1964,7 +1996,7 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s
return r;
}
static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) {
static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) {
int r = DB_NOTFOUND;
int c;
@ -1982,7 +2014,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search,
if (search->compare(search,
toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot)),
brt->flags & TOKU_DB_DUPSORT ? toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot)): 0)) {
r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger);
r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent);
if (r == 0 || r == EAGAIN)
break;
}
@ -1990,7 +2022,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search,
/* check the first (left) or last (right) node if nothing has been found */
if (r == DB_NOTFOUND && c == node->u.n.n_children-1)
r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger);
r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent);
return r;
}
@ -2001,9 +2033,9 @@ static int brt_search_leaf_node(BRTNODE node, brt_search_t *search, DBT *newkey,
return r;
}
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) {
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) {
if (node->height > 0)
return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger);
return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger, path_to_parent);
else
return brt_search_leaf_node(node, search, newkey, newval);
}
@ -2025,7 +2057,9 @@ int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval, TOK
for (;;) {
BRT_SPLIT split; brt_split_init(&split);
r = brt_search_node(brt, node, search, newkey, newval, &split, logger);
DISKOFF path[MAX_PATHLEN_TO_ROOT];
DISKOFFARRAY path_to_parent = {0, path};
r = brt_search_node(brt, node, search, newkey, newval, &split, logger, path_to_parent);
if (split.did_split) {
rr = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node);
@ -2412,3 +2446,35 @@ int toku_brt_height_of_root(BRT brt, int *height) {
r = toku_unpin_brt_header(brt); assert(r==0);
return 0;
}
/* Context handed (as the void* argument) to note_removal() while expunging a
 * transaction's commands from one child buffer of a nonleaf node. */
struct callpair {
// The nonleaf node whose fingerprint and byte counters are adjusted.
BRTNODE node;
// Index of the child whose buffer is currently being expunged.
int childnum;
};
/* Callback for toku_fifo_expunge_xaction(): account for one command that is
 * being removed from a child buffer of a nonleaf node.
 *
 *   key/keylen, data/datalen, type, xid  describe the command being removed.
 *   cpairv  points to a struct callpair naming the node and the child number.
 *
 * Removes the command's contribution from the node's local fingerprint and
 * reduces the per-child and per-node buffer byte counts.  Always returns 0. */
static int note_removal (bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void*cpairv) {
    struct callpair *cpair = cpairv;
    BRTNODE node = cpair->node;
    int childnum = cpair->childnum;
    u_int32_t old_fingerprint = node->local_fingerprint;
    // Enqueueing a command adds rand4fingerprint*crc to the local fingerprint,
    // so removing it must subtract the same term (the previous code assigned
    // the term instead of subtracting it, which clobbered the fingerprint).
    node->local_fingerprint = old_fingerprint - node->rand4fingerprint*toku_calccrc32_cmd(type, xid, key, keylen, data, datalen);
    u_int32_t countdiff = keylen+datalen+KEY_VALUE_OVERHEAD+BRT_CMD_OVERHEAD;
    BNC_NBYTESINBUF(node,childnum) -= countdiff;
    node->u.n.n_bytes_in_buffers -= countdiff;
    return 0;
}
/* Remove every buffered command belonging to transaction `xid` from all child
 * buffers of the node stored at `diskoff`.
 *
 * The node is pinned for the duration of the call and unpinned (marked dirty)
 * afterwards.  Returns the pin error if pinning fails; otherwise the first
 * nonzero result from expunging a child buffer, or else the unpin result. */
int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid) {
    void *pinned;
    int pin_r = toku_cachetable_get_and_pin(brt->cf, diskoff, &pinned, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
    if (pin_r != 0) return pin_r;

    BRTNODE node = pinned;
    int first_err = 0;
    int childnum = 0;
    while (childnum < node->u.n.n_children) {
        struct callpair cp = { node, childnum };
        int expunge_r = toku_fifo_expunge_xaction(BNC_BUFFER(node, childnum), xid, note_removal, &cp);
        // Keep going after a failure, but remember only the first error.
        if (first_err == 0) first_err = expunge_r;
        childnum++;
    }

    int unpin_r = toku_cachetable_unpin(brt->cf, diskoff, 1, toku_serialize_brtnode_size(node));
    if (first_err != 0) return first_err;
    return unpin_r;
}

View file

@ -66,4 +66,7 @@ int toku_brt_get_fd(BRT, int *);
int toku_brt_height_of_root(BRT, int *height); // for an open brt, return the current height.
// Special hack for recovery
int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid);
#endif

View file

@ -27,6 +27,11 @@ typedef struct {
char *data;
} BYTESTRING;
/* A counted array of DISKOFF values.  Field order matters: the type is
 * initialized positionally as {len, array}. */
typedef struct {
// Entry count.  NOTE(review): presumably the number of valid entries in `array` -- confirm against the code that builds and copies these arrays.
int len;
// Pointer to the entries; toku_free_DISKOFFARRAY() releases it with toku_free().
DISKOFF *array;
} DISKOFFARRAY;
/* Make the LSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_lsn { u_int64_t lsn; } LSN;
#define ZERO_LSN ((LSN){0})

71
newbrt/fifo-test-exp.c Normal file
View file

@ -0,0 +1,71 @@
/* Test the expunge method. */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "fifo.h"
#include "memory.h"
int count;
/* Deletion callback given to toku_fifo_expunge_xaction() by doit().
 *
 * `v` carries the xid being expunged.  The global `count` is the number of
 * deletions seen so far in the current expunge; combined with the xid it
 * identifies which of the four enqueued rows this call must describe.
 * Asserts that the row matches, bumps `count`, and returns 0. */
int callback (bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void *v) {
    /* The four rows in the order doit() enqueues them. */
    static const struct {
        const char *key;
        const char *data;
        int         xid;
        int         type;
    } rows[] = {
        { "hello", "thera", 23, 0 },
        { "hello", "therb", 24, 0 },
        { "hell1", "therc", 24, 1 },
        { "hell1", "therd", 26, 1 },
    };
    TXNID expunged = (long)v;
    assert(xid == expunged);

    /* Keys and values are stored with their terminating NUL. */
    assert(strlen(key)+1 == keylen);
    assert(strlen(data)+1 == datalen);
    //printf("count=%d which=%ld deleting %s %s\n", count, (long)which, (char*)key, (char*)data);

    /* Index of the first row that belongs to the xid being expunged. */
    int first_row = 0;
    if (expunged == 24) {
        first_row = 1;
    } else if (expunged == 26) {
        first_row = 3;
    }

    int row = count + first_row;
    if (row >= 0 && row <= 3) {
        assert(strcmp(key,  rows[row].key)  == 0);
        assert(strcmp(data, rows[row].data) == 0);
        assert(xid  == rows[row].xid);
        assert(type == rows[row].type);
    } else {
        assert(0);
    }
    count++;
    return 0;
}
/* Build a FIFO holding four known rows (xids 23, 24, 24, 26), verify that
 * iteration sees them in order, then expunge every row belonging to xid
 * `which` and verify how many rows were deleted.
 *
 * Compared with the first version of this test, the result of the expunge is
 * now checked, an xid that matches nothing (e.g. 27) must delete nothing, the
 * iteration must visit all four rows, and the number of entries left in the
 * FIFO must agree with the number of deletions reported to the callback. */
void doit (int which) {
    int r;
    FIFO f;
    r = toku_fifo_create(&f); assert(r==0);
    r = toku_fifo_enq(f, "hello", 6, "thera", 6, 0, 23); assert(r==0);
    r = toku_fifo_enq(f, "hello", 6, "therb", 6, 0, 24); assert(r==0);
    r = toku_fifo_enq(f, "hell1", 6, "therc", 6, 1, 24); assert(r==0);
    r = toku_fifo_enq(f, "hell1", 6, "therd", 6, 1, 26); assert(r==0);
    int i=0;
    FIFO_ITERATE(f, k, kl, d, dl, t, x,
		 ({
		     assert(strlen(k)+1==kl);
		     assert(strlen(d)+1==dl);
		     switch(i) {
		     case 0: assert(strcmp(k, "hello")==0); assert(strcmp(d, "thera")==0); assert(x==23); assert(t==0); i++; break;
		     case 1: assert(strcmp(k, "hello")==0); assert(strcmp(d, "therb")==0); assert(x==24); assert(t==0); i++; break;
		     case 2: assert(strcmp(k, "hell1")==0); assert(strcmp(d, "therc")==0); assert(x==24); assert(t==1); i++; break;
		     case 3: assert(strcmp(k, "hell1")==0); assert(strcmp(d, "therd")==0); assert(x==26); assert(t==1); i++; break;
		     default: assert(0);
		     }
		 }));
    // All four enqueued rows must have been visited.
    assert(i==4);
    count=0;
    r = toku_fifo_expunge_xaction(f, which, callback, (void*)(long)which);
    // callback() always returns 0, so the expunge itself must succeed.
    assert(r==0);
    switch (which) {
    case 23: assert(count==1); break;
    case 24: assert(count==2); break;
    case 26: assert(count==1); break;
    // Any other xid owns no rows, so nothing may have been deleted.
    default: assert(count==0); break;
    }
    // The FIFO's own entry count must reflect exactly the deletions reported.
    assert(toku_fifo_n_entries(f)==4-count);
    toku_fifo_free(&f);
    toku_malloc_cleanup();
}
/* Run the expunge test once for each xid present in the FIFO (23, 24, 26)
 * and once for an xid that is not present (27). */
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
    static const int xids[] = { 23, 24, 26, 27 };
    unsigned int idx;
    for (idx = 0; idx < sizeof(xids)/sizeof(xids[0]); idx++) {
        doit(xids[idx]);
    }
    return 0;
}

View file

@ -140,7 +140,20 @@ void toku_fifo_iterate (FIFO fifo, void(*f)(bytevec key,ITEMLEN keylen,bytevec d
f(entry->key, entry->keylen, entry->key + entry->keylen, entry->vallen, entry->type, entry->xid, arg);
}
/* Delete from `fifo` every entry whose xid equals `xid`.
 *
 * For each matching entry, callback_on_delete is invoked with the entry's
 * key, value, type and xid (plus `arg`) before the entry is unlinked and
 * freed.  If the callback returns nonzero, that entry is still removed and
 * freed, but the scan stops and the callback's value is returned.
 * Returns 0 when the whole list has been scanned.
 *
 * NOTE(review): only fifo->head (via the link pointer) and fifo->n are
 * maintained here; if the FIFO keeps any other pointer to its entries (for
 * example a tail pointer), confirm it cannot be left pointing at a freed
 * entry when the last element is expunged. */
int toku_fifo_expunge_xaction(FIFO fifo, TXNID xid, int (*callback_on_delete)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void*), void*arg) {
    /* linkp always addresses the pointer that leads to the entry under
     * inspection, so unlinking is a single store through it. */
    struct fifo_entry **linkp = &fifo->head;
    for (;;) {
        struct fifo_entry *cur = *linkp;
        if (cur == 0) break;
        if (cur->xid != xid) {
            /* Not ours: step past it. */
            linkp = &cur->next;
            continue;
        }
        /* Ours: report it, then unlink and free it. */
        int cb_result = callback_on_delete(cur->key, cur->keylen, cur->key+cur->keylen, cur->vallen, cur->type, cur->xid, arg);
        *linkp = cur->next;
        fifo->n--;
        toku_free_n(cur, fifo_entry_size(cur));
        if (cb_result != 0) return cb_result;
    }
    return 0;
}

View file

@ -44,4 +44,6 @@ void toku_fifo_iterate (FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,I
} \
})
int toku_fifo_expunge_xaction(FIFO fifo, TXNID xid, int (*callback_on_delete)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, TXNID xid, void*), void*arg);
#endif

View file

@ -112,6 +112,9 @@ static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) {
/* Release the data buffer of a BYTESTRING with toku_free(). */
static inline void toku_free_BYTESTRING(BYTESTRING val) {
    char *buffer = val.data;
    toku_free(buffer);
}
/* Release the element storage of a DISKOFFARRAY with toku_free(). */
static inline void toku_free_DISKOFFARRAY(DISKOFFARRAY val) {
    DISKOFF *elements = val.array;
    toku_free(elements);
}
static inline int toku_copy_LOGGEDBRTHEADER(LOGGEDBRTHEADER *target, LOGGEDBRTHEADER val) {
*target = val;

View file

@ -49,6 +49,10 @@ const struct logtype rollbacks[] = {
{"BYTESTRING", "key", 0},
{"BYTESTRING", "data", 0},
NULLFIELD}},
{"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0},
{"DISKOFFARRAY", "parents", 0},
{"DISKOFF", "diskoff", 0},
NULLFIELD}},
{0,0,FA{NULLFIELD}}
};

View file

@ -62,3 +62,13 @@ int toku_rollback_deleteatleaf (FILENUM filenum, BYTESTRING key, BYTESTRING data
0); // Do the insertion unconditionally
return r;
}
/* Roll back an "xactiontouchednonleaf" record: look up the BRT for `filenum`
 * and expunge this transaction's buffered commands from the nonleaf node at
 * `diskoff`.  The recorded parent path (`array`) is not used.
 * Both steps are asserted to succeed; always returns 0. */
int toku_rollback_xactiontouchednonleaf(FILENUM filenum, DISKOFFARRAY array __attribute__((__unused__)), DISKOFF diskoff, TOKUTXN txn) {
    CACHEFILE cachefile;
    BRT tree;

    int lookup_r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cachefile, &tree);
    assert(lookup_r==0);

    int expunge_r = toku_brt_nonleaf_expunge_xaction(tree, diskoff, txn->txnid64);
    assert(expunge_r==0);

    return 0;
}

View file

@ -77,7 +77,7 @@ void do_test_abort2 (void) {
insert(7, 1);
r=txn->abort(txn); CKERR(r);
// Don't do a query on "hello7", because that will force things out of the buffer.
// Don't do a lookup on "hello7", because that will force things out of the buffer.
r=env->txn_begin(env, 0, &txn, 0); assert(r==0);
{