Break up brt-test some more. Addresses #475. Also make the fanout flexible. Fixes #126.

git-svn-id: file:///svn/tokudb@2593 c7de825b-a66e-492c-adef-691d508d4ae1
2025-01-22 14:54:20 +01:00 · 2008-03-06 21:46:57 +00:00 · 2008-03-06 21:46:57 +00:00 · 241c5392df
commit 241c5392df
parent 902b8bb9b8
9 changed files with 294 additions and 179 deletions
--- a/newbrt/Makefile
+++ b/newbrt/Makefile
@ -46,14 +46,15 @@ default: bins libs recover tdb_logprint
 # Put these one-per-line so that if we insert a new one the svn diff can understand it better.
 # Also keep them sorted.
 REGRESSION_TESTS = \
-        ybt-test \
-        pma-test \
-        brt-serialize-test \
-        cachetable-test \
-        cachetable-test2 \
-        fifo-test \
+	ybt-test \
+	pma-test \
+	brt-serialize-test \
+	cachetable-test \
+	cachetable-test2 \
+	fifo-test \
 	test-brt-delete-both \
-        brt-test \
+	brt-test \
+	brt-test3 \
 	brt-test4 \
 	brt-test-cursor \
 	test_oexcl \
@ -99,6 +100,7 @@ CHECKS = \
 test-brt-delete-both \
 brt-test \
 brt-test-cursor \
+ brt-test3 \
 brt-test4 \
 fifo-test \
 test_toku_malloc_plain_free \
@ -106,7 +108,8 @@ CHECKS = \
 list-test \
 # This line intentionally kept commented so I can have a \ on the previous line

-check: bins $(patsubst %,check_%,$(CHECKS)) check_benchmarktest_256
+# Put check_benchmarktest_256 first because it is long-running (and therefore on the critical path, so get it started)
+check: bins check_benchmarktest_256 $(patsubst %,check_%,$(CHECKS))
 check_benchmarktest_256: benchmark-test
 	$(DTOOL) ./benchmark-test $(VERBVERBOSE) --valsize 256 --verify 1

@ -121,7 +124,7 @@ check_test-assert: test-assert
 	@# one argument, "ok" should not error
 	$(DTOOL) ./test-assert ok
 check_%: %
-	$(DTOOL)  ./$< $(VERBVERBOSE) 
+	time $(DTOOL)  ./$< $(VERBVERBOSE) 

 check-fanout:
 	let BRT_FANOUT=4; \
@ -130,7 +133,7 @@ check-fanout:
 		let BRT_FANOUT=BRT_FANOUT+1; \
 	done

-pma-test benchmark-test brt-test brt-test4 brt-test-cursor test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz
+pma-test benchmark-test brt-test brt-test3 brt-test4 brt-test-cursor test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz
 # pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage

 BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h pma.h brt.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h
@ -146,10 +149,10 @@ ybt.o: ybt.h brttypes.h ../include/db.h
 ybt-test: ybt-test.o ybt.o memory.o toku_assert.o
 ybt-test.o: ybt.h ../include/db.h
 cachetable.o: cachetable.h hashfun.h memory.h
-brt-test4 brt-test-cursor brt-test: ybt.o brt.o fifo.o pma.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
+brt-test3 brt-test4 brt-test-cursor brt-test: ybt.o brt.o fifo.o pma.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
 log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES)
 logformat: logformat.o toku_assert.o
-brt-test4.o brt-test-cursor.o brt-test.o brt.o: brt.h ../include/db.h fifo.h pma.h brttypes.h cachetable.h memory.h
+brt-test3.o brt-test4.o brt-test-cursor.o brt-test.o brt.o: brt.h ../include/db.h fifo.h pma.h brttypes.h cachetable.h memory.h
 brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES)
 brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h
 fifo.o: fifo.h brttypes.h 
--- a/newbrt/brt-internal.h
+++ b/newbrt/brt-internal.h
@ -64,14 +64,14 @@ struct brtnode {
 	    unsigned int    totalchildkeylens;
 	    unsigned int    n_bytes_in_buffers;

-	    struct brtnode_nonleaf_childinfo childinfos[TREE_FANOUT+1]; /* One extra so we can grow */
+	    struct brtnode_nonleaf_childinfo *childinfos; /* One extra so we can grow */

 #define BNC_SUBTREE_FINGERPRINT(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint)
 #define BNC_DISKOFF(node,i) ((node)->u.n.childinfos[i].diskoff)
 #define BNC_BUFFER(node,i) ((node)->u.n.childinfos[i].buffer)
 #define BNC_NBYTESINBUF(node,i) ((node)->u.n.childinfos[i].n_bytes_in_buffer)

-	    struct kv_pair *childkeys[TREE_FANOUT];   /* Pivot keys.  Child 0's keys are <= childkeys[0].  Child 1's keys are <= childkeys[1].
+	    struct kv_pair **childkeys;   /* Pivot keys.  Child 0's keys are <= childkeys[0].  Child 1's keys are <= childkeys[1].
 							 Note: It is possible that Child 1's keys are == to child 0's key's, so it is
 							 not necessarily true that child 1's keys are > childkeys[0].
 						         However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */
@ -175,7 +175,7 @@ struct brt_cursor {
    DBT val;
 };

-void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger);
+int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger);
 int toku_unpin_brtnode (BRT brt, BRTNODE node) ;
 unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t);

--- a/newbrt/brt-serialize-test.c
+++ b/newbrt/brt-serialize-test.c
@ -32,6 +32,8 @@ static void test_serialize(void) {
    sn.local_fingerprint = 0;
    sn.u.n.n_children = 2;
    hello_string = toku_strdup("hello");
+    MALLOC_N(2, sn.u.n.childinfos);
+    MALLOC_N(1, sn.u.n.childkeys);
    sn.u.n.childkeys[0] = kv_pair_malloc(hello_string, 6, 0, 0); 
    sn.u.n.totalchildkeylens = 6;
    BNC_DISKOFF(&sn, 0) = sn.nodesize*30;
@ -45,11 +47,6 @@ static void test_serialize(void) {
    r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, (TXNID)234); assert(r==0);    sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, (TXNID)234, "x", 2, "xval", 5);
    BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
    BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
-    {
-	int i;
-	for (i=2; i<TREE_FANOUT+1; i++)
-	    BNC_NBYTESINBUF(&sn, i)=0;
-    }
    sn.u.n.n_bytes_in_buffers = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);

    toku_serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn);  assert(r==0);
--- a/newbrt/brt-serialize.c
+++ b/newbrt/brt-serialize.c
@ -278,18 +278,11 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int fl
    //printf("height==%d\n", result->height);
    if (result->height>0) {
 	result->u.n.totalchildkeylens=0;
-	for (i=0; i<TREE_FANOUT; i++) { 
-            result->u.n.childkeys[i]=0; 
-        }
-	for (i=0; i<TREE_FANOUT+1; i++) { 
-	    BNC_SUBTREE_FINGERPRINT(result, i)=0;
-            BNC_DISKOFF(result,i)=0; 
-            BNC_BUFFER(result,i)=0; 
-            BNC_NBYTESINBUF(result,i)=0;
-        }
 	u_int32_t subtree_fingerprint = rbuf_int(&rc);
 	u_int32_t check_subtree_fingerprint = 0;
 	result->u.n.n_children = rbuf_int(&rc);
+	MALLOC_N(result->u.n.n_children,   result->u.n.childinfos);
+	MALLOC_N(result->u.n.n_children-1, result->u.n.childkeys);
 	//printf("n_children=%d\n", result->n_children);
 	assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT);
 	for (i=0; i<result->u.n.n_children; i++) {
@ -315,11 +308,9 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int fl
 	}
 	for (i=0; i<result->u.n.n_children; i++) {
 	    BNC_DISKOFF(result,i) = rbuf_diskoff(&rc);
+	    BNC_NBYTESINBUF(result,i) = 0;
 	    //printf("Child %d at %lld\n", i, result->children[i]);
 	}
-	for (i=0; i<TREE_FANOUT+1; i++) {
-	    BNC_NBYTESINBUF(result,i)=0;
-	}
 	result->u.n.n_bytes_in_buffers = 0; 
 	for (i=0; i<result->u.n.n_children; i++) {
 	    r=toku_fifo_create(&BNC_BUFFER(result,i));
@ -472,9 +463,6 @@ void toku_verify_counts (BRTNODE node) {
 	    sum += BNC_NBYTESINBUF(node,i);
 	// We don't really care if the later buffers have garbage in them.  Valgrind would do a better job noticing if we leave it uninitialized.
 	// But for now the code always initializes the later tables so they are 0.
-	for (; i<TREE_FANOUT+1; i++) {
-	    assert(BNC_NBYTESINBUF(node,i)==0);
-        }
 	assert(sum==node->u.n.n_bytes_in_buffers);
    }
 }
--- a/newbrt/brt-test.c
+++ b/newbrt/brt-test.c
@ -108,37 +108,6 @@ static void test2 (int memcheck) {
    if (verbose) printf("test2 ok\n");
 }

-static void test3 (int nodesize, int count, int memcheck) {
-    BRT t;
-    int r;
-    struct timeval t0,t1;
-    int i;
-    CACHETABLE ct;
-    char fname[]="testbrt.brt";
-    toku_memory_check=memcheck;
-    toku_memory_check_all_free();
-    r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
-    gettimeofday(&t0, 0);
-    unlink(fname);
-    r = toku_open_brt(fname, 0, 1, &t, nodesize, ct, null_txn, toku_default_compare_fun, null_db);
-    assert(r==0);
-    for (i=0; i<count; i++) {
-	char key[100],val[100];
-	DBT k,v;
-	snprintf(key,100,"hello%d",i);
-	snprintf(val,100,"there%d",i);
-	toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
-    }
-    r = toku_close_brt(t);              assert(r==0);
-    r = toku_cachetable_close(&ct);     assert(r==0);
-    toku_memory_check_all_free();
-    gettimeofday(&t1, 0);
-    {
-	double tdiff = (t1.tv_sec-t0.tv_sec)+1e-6*(t1.tv_usec-t0.tv_usec);
-	if (verbose) printf("serial insertions: blocksize=%d %d insertions in %.3f seconds, %.2f insertions/second\n", nodesize, count, tdiff, count/tdiff);
-    }
-}
-
 static void test5 (void) {
    int r;
    BRT t;
@ -1689,17 +1658,6 @@ static void brt_blackbox_test (void) {
    test5();
    if (verbose) printf("test_multiple_files\n");
    test_multiple_files();
-    if (verbose) printf("test3 slow\n");
-    toku_memory_check=0;
-    test3(2048, 1<<15, 1);
-    if (verbose) printf("test3 fast\n");
-
-    if (verbose) toku_pma_show_stats();
-
-    test3(1<<15, 1024, 1);
-    if (verbose) printf("test3 fast\n");
-
-    test3(1<<18, 1<<20, 0);

    toku_memory_check = 1;

--- a/newbrt/brt-test3.c
+++ b/newbrt/brt-test3.c
@ -0,0 +1,88 @@
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+#include "brt.h"
+#include "key.h"
+#include "pma.h"
+#include "brt-internal.h"
+#include "memory.h"
+#include "toku_assert.h"
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+
+#include "test.h"
+
+static const char fname[]= __FILE__ ".brt";
+
+static TOKUTXN const null_txn = 0;
+static DB * const null_db = 0;
+
+/* Serial-insertion benchmark.
+ *
+ * Creates a cachetable, removes any stale file named `fname`, opens a BRT on it with the given
+ * NODESIZE, and inserts COUNT pairs ("hello<i>" -> "there<i>", lengths include the trailing NUL).
+ * The tree and cachetable are then closed, and when `verbose` is set the elapsed wall-clock time
+ * and insertion rate are printed.
+ *
+ * MEMCHECK is stored into the global toku_memory_check before anything else runs.
+ * The timed interval covers open, all insertions, and both close calls.
+ * Every step is checked with assert(); a failure aborts the test.
+ */
+static void test3 (int nodesize, int count, int memcheck) {
+    BRT t;
+    int r;
+    struct timeval t0,t1;
+    int i;
+    CACHETABLE ct;
+    toku_memory_check=memcheck;
+    toku_memory_check_all_free();
+    r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
+    gettimeofday(&t0, 0);
+    unlink(fname); // start from an empty file; a failure (e.g. file did not exist) is deliberately ignored
+    r = toku_open_brt(fname, 0, 1, &t, nodesize, ct, null_txn, toku_default_compare_fun, null_db);
+    assert(r==0);
+    for (i=0; i<count; i++) {
+	char key[100],val[100];
+	DBT k,v;
+	snprintf(key,sizeof(key),"hello%d",i);
+	snprintf(val,sizeof(val),"there%d",i);
+	// Check the insertion like every other call here, so a failed insert cannot go unnoticed.
+	r = toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
+	assert(r==0);
+    }
+    r = toku_close_brt(t);              assert(r==0);
+    r = toku_cachetable_close(&ct);     assert(r==0);
+    toku_memory_check_all_free();
+    gettimeofday(&t1, 0);
+    {
+	// Seconds plus microseconds, as a double.
+	double tdiff = (t1.tv_sec-t0.tv_sec)+1e-6*(t1.tv_usec-t0.tv_usec);
+	if (verbose) printf("serial insertions: blocksize=%d %d insertions in %.3f seconds, %.2f insertions/second\n", nodesize, count, tdiff, count/tdiff);
+    }
+}
+
+/* Runs test3() three times with different (nodesize, count, memcheck) settings.
+ * Progress lines and the PMA statistics are emitted only when `verbose` is set.
+ * On return toku_memory_check is left at 1.
+ */
+static void brt_blackbox_test (void) {
+    if (verbose) printf("test3 slow\n");
+    // NOTE(review): test3() assigns toku_memory_check from its memcheck argument,
+    // so this 0 is overwritten by the very next call.
+    toku_memory_check=0;
+    test3(2048, 1<<15, 1);    // nodesize 2048, 32768 insertions, memcheck on
+    if (verbose) printf("test3 fast\n");
+
+    if (verbose) toku_pma_show_stats();
+
+    test3(1<<15, 1024, 1);    // nodesize 32768, 1024 insertions, memcheck on
+    if (verbose) printf("test3 fast\n");
+
+    test3(1<<18, 1<<20, 0);   // nodesize 262144, 1048576 insertions, memcheck off
+
+    // test3() left toku_memory_check at 0 above; set it back to 1 before returning.
+    toku_memory_check = 1;
+
+    // Larger configurations, currently disabled:
+//    test3(1<<19, 1<<20, 0);
+
+//    test3(1<<20, 1<<20, 0);
+
+//    test3(1<<20, 1<<21, 0);
+
+//    test3(1<<20, 1<<22, 0);
+
+}
+
+/* Test driver: hand the command line to default_parse_args(), run the test3 series,
+ * call toku_malloc_cleanup(), and report success when `verbose` is set. */
+int main (int argc , const char *argv[]) {
+    default_parse_args(argc, argv);
+    brt_blackbox_test();
+    toku_malloc_cleanup();
+
+    if (verbose) {
+	printf("test ok\n");
+    }
+    return 0;
+}
--- a/newbrt/brt.c
+++ b/newbrt/brt.c
@ -49,13 +49,15 @@ void toku_brtnode_free (BRTNODE *nodep) {
    //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, node, node->mdicts[0]);
    if (node->height>0) {
 	for (i=0; i<node->u.n.n_children-1; i++) {
-	    toku_free((void*)node->u.n.childkeys[i]);
+	    toku_free(node->u.n.childkeys[i]);
 	}
 	for (i=0; i<node->u.n.n_children; i++) {
 	    if (BNC_BUFFER(node,i)) {
 		toku_fifo_free(&BNC_BUFFER(node,i));
 	    }
 	}
+	toku_free(node->u.n.childkeys);
+	toku_free(node->u.n.childinfos);
    } else {
 	if (node->u.l.buffer) // The buffer may have been freed already, in some cases.
 	    toku_pma_free(&node->u.l.buffer);
@ -246,7 +248,6 @@ int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER logger) {
 }

 static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) {
-    int i;
    n->tag = TYP_BRTNODE;
    n->nodesize = t->h->nodesize;
    n->flags = t->h->flags;
@ -261,18 +262,10 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height)
    assert(height>=0);
    if (height>0) {
 	n->u.n.n_children   = 0;
-	for (i=0; i<TREE_FANOUT; i++) {
-//	    n->u.n.childkeys[i] = 0;
-//	    n->u.n.childkeylens[i] = 0;
-	}
 	n->u.n.totalchildkeylens = 0;
-	for (i=0; i<TREE_FANOUT+1; i++) {
-	    BNC_SUBTREE_FINGERPRINT(n, i) = 0;
-//	    n->u.n.children[i] = 0;
-//	    n->u.n.buffers[i] = 0;
-	    BNC_NBYTESINBUF(n,i) = 0;
-	}
 	n->u.n.n_bytes_in_buffers = 0;
+	n->u.n.childinfos=0;
+	n->u.n.childkeys=0;
    } else {
 	int r = toku_pma_create(&n->u.l.buffer, t->compare_fun, t->db, toku_cachefile_filenum(t->cf), n->nodesize);
        assert(r==0);
@ -285,7 +278,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height)
    }
 }

-void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger) {
+int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger) {
    TAGMALLOC(BRTNODE, n);
    int r;
    DISKOFF name;
@ -305,6 +298,7 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER log
    r=toku_log_newbrtnode(logger, toku_cachefile_filenum(t->cf), n->thisnodename, height, n->nodesize, (t->flags&TOKU_DB_DUPSORT)!=0, n->rand4fingerprint);
    assert(r==0);
    toku_update_brtnode_loggerlsn(n, logger);
+    return 0;
 }

 static int insert_to_buffer_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v, int type, TXNID xid) {
@ -360,6 +354,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
    assert(node->u.n.n_children>=2); // Otherwise, how do we split?  We need at least two children to split. */
    assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
    toku_create_new_brtnode(t, &B, node->height, logger);
+    MALLOC_N(n_children_in_b+1, B->u.n.childinfos);
+    MALLOC_N(n_children_in_b, B->u.n.childkeys);
    B->u.n.n_children   =n_children_in_b;
    //printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B);
    //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
@ -372,6 +368,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
 	for (i=0; i<n_children_in_b; i++) {
 	    int r = toku_fifo_create(&BNC_BUFFER(B,i));
 	    if (r!=0) return r;
+	    BNC_NBYTESINBUF(B,i)=0;
+	    BNC_SUBTREE_FINGERPRINT(B,i)=0;
 	}

 	for (i=n_children_in_a; i<old_n_children; i++) {
@ -453,7 +451,9 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
 	splitk->data = (void*)(node->u.n.childkeys[n_children_in_a-1]);
 	splitk->size = toku_brt_pivot_key_len(t, node->u.n.childkeys[n_children_in_a-1]);
 	node->u.n.totalchildkeylens -= toku_brt_pivot_key_len(t, node->u.n.childkeys[n_children_in_a-1]);
-	node->u.n.childkeys[n_children_in_a-1]=0;
+
+	REALLOC_N(n_children_in_a+1,   node->u.n.childinfos);
+	REALLOC_N(n_children_in_a, node->u.n.childkeys);

 	verify_local_fingerprint_nonleaf(node);
 	verify_local_fingerprint_nonleaf(B);
@ -618,6 +618,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,

    //verify_local_fingerprint_nonleaf(node);

+    REALLOC_N(node->u.n.n_children+2, node->u.n.childinfos);
+    REALLOC_N(node->u.n.n_children+1, node->u.n.childkeys);
    // Slide the children over.
    for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) {
 	node->u.n.childinfos[cnum] = node->u.n.childinfos[cnum-1];
@ -625,6 +627,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
    r = toku_log_addchild(logger, toku_cachefile_filenum(t->cf), node->thisnodename, childnum+1, childb->thisnodename, 0);
    assert(BNC_DISKOFF(node, childnum)==childa->thisnodename);
    BNC_DISKOFF(node, childnum+1) = childb->thisnodename;
+    BNC_SUBTREE_FINGERPRINT(node, childnum)=0;
+    BNC_SUBTREE_FINGERPRINT(node, childnum+1)=0;
    fixup_child_fingerprint(node, childnum,   childa, t, logger);
    fixup_child_fingerprint(node, childnum+1, childb, t, logger);
    r=toku_fifo_create(&BNC_BUFFER(node,childnum));   assert(r==0); // ??? SHould handle this error case
@ -1625,6 +1629,8 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk,
    initialize_brtnode (brt, newroot, newroot_diskoff, new_height);
    //printf("new_root %lld %d %lld %lld\n", newroot_diskoff, newroot->height, nodea->thisnodename, nodeb->thisnodename);
    newroot->u.n.n_children=2;
+    MALLOC_N(3, newroot->u.n.childinfos);
+    MALLOC_N(2, newroot->u.n.childkeys);
    //printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey);
    newroot->u.n.childkeys[0] = splitk.data;
    newroot->u.n.totalchildkeylens=splitk.size;
@ -1632,6 +1638,10 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk,
    BNC_DISKOFF(newroot,1)=nodeb->thisnodename;
    r=toku_fifo_create(&BNC_BUFFER(newroot,0)); if (r!=0) return r;
    r=toku_fifo_create(&BNC_BUFFER(newroot,1)); if (r!=0) return r;
+    BNC_NBYTESINBUF(newroot, 0)=0;
+    BNC_NBYTESINBUF(newroot, 1)=0;
+    BNC_SUBTREE_FINGERPRINT(newroot, 0)=0; 
+    BNC_SUBTREE_FINGERPRINT(newroot, 1)=0; 
    toku_verify_counts(newroot);
    //verify_local_fingerprint_nonleaf(nodea);
    //verify_local_fingerprint_nonleaf(nodeb);
--- a/newbrt/brt2.c
+++ b/newbrt/brt2.c
@ -33,6 +33,8 @@ extern long long n_items_malloced;
 static int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER);
 static void verify_local_fingerprint_nonleaf (BRTNODE node);

+#ifdef FOO
+
 /* Frees a node, including all the stuff in the hash table. */
 void toku_brtnode_free (BRTNODE *nodep) {
    BRTNODE node=*nodep;
@ -55,9 +57,11 @@ void toku_brtnode_free (BRTNODE *nodep) {
    *nodep=0;
 }

+#endif
 static long brtnode_size(BRTNODE node) {
    return toku_serialize_brtnode_size(node);
 }
+#ifdef FOO

 static void toku_update_brtnode_loggerlsn(BRTNODE node, TOKULOGGER logger) {
    if (logger) {
@ -82,6 +86,8 @@ static void fixup_child_fingerprint(BRTNODE node, int childnum_of_node, BRTNODE
    toku_update_brtnode_loggerlsn(node, logger);
 }

+#endif
+
 // If you pass in data==0 then it only compares the key, not the data (even if is a DUPSORT database)
 static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) {
    int cmp;
@ -97,6 +103,7 @@ static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) {
    return cmp;
 }

+#ifdef FOO

 void toku_brtnode_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *brtnode_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn __attribute__((__unused__)) , BOOL rename_p __attribute__((__unused__))) {
    BRTNODE brtnode = brtnode_v;
@ -170,9 +177,11 @@ int toku_unpin_brt_header (BRT brt) {
    brt->h=0;
    return r;
 }
+#endif
 static int unpin_brtnode (BRT brt, BRTNODE node) {
    return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
 }
+#ifdef FOO

 typedef struct kvpair {
    bytevec key;
@ -293,7 +302,7 @@ static int split_leaf_node (BRT t, TOKULOGGER logger, BRTNODE node, int *n_new_n
    while (toku_serialize_brtnode_size(node)>node->nodesize) {
 	BRTNODE B;
 	DBT splitk;
-	if ((r = create_new_brtnode(t, &B, 0, logger))) return r;
+	if ((r = toku_create_new_brtnode(t, &B, 0, logger))) return r;
 	// Split so that B is at least 1/2 full
 	// The stuff in B goes *before* node
 	if ((r = toku_pma_split(logger, toku_cachefile_filenum(t->cf),
@ -324,7 +333,7 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
    assert(node->height>0);
    assert(node->u.n.n_children>=2); // Otherwise, how do we split?  We need at least two children to split. */
    assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
-    create_new_brtnode(t, &B, node->height, logger);
+    toku_create_new_brtnode(t, &B, node->height, logger);
    B->u.n.n_children   =n_children_in_b;
    //printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B);
    //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
@ -432,6 +441,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
    return 0;
 }

+#endif
+
 static void find_heaviest_child (BRTNODE node, int *childnum) {
    int max_child = 0;
    int max_weight = BNC_NBYTESINBUF(node, 0);
@ -465,8 +476,140 @@ static unsigned int brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) {
 }

 static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS weak_p);
-static int maybe_fixup_fat_child(BRT t, BRTNODE node, int childnum, BRTNODE child, TOKULOGGER logger); // If the node is too big then deal with it.  Unpin the child (or children if it splits)  NODE may be too big at the end

+// If CHILD is too wide, split it, and create a new node with the new children.  Unpin CHILD or the new children (even if something goes wrong).
+// If it does split, unpin the new root node also.
+static int maybe_split_root(BRT brt, BRTNODE child, CACHEKEY *rootp, TOKULOGGER logger);
+// if CHILD is too wide, split it, and fix up NODE.  Either way, unpin the child or resulting children (even if it fails do the unpin)
+static int maybe_split_nonroot (BRT brt, BRTNODE node, int childnum, BRTNODE child, int *n_children_replacing_child, TOKULOGGER logger);
+
+// Weakly move commands from NODE's buffer for child CHILDNUM into that child.
+// "Weakly" means every put uses WEAK, i.e. it should not cause I/O or make the child too big.
+//  - If toku_cachetable_maybe_get_and_pin() does not succeed, nothing is pushed and 0 is returned.
+//  - Commands are taken from the head of the fifo one at a time; a command is dequeued only after
+//    the child accepted it.  EAGAIN from the child simply stops the draining.
+//  - Any other error unpins the child and is returned.  Otherwise the result of the unpin is returned.
+static int weak_push_to_child (BRT brt, BRTNODE node, int childnum, TOKULOGGER logger) {
+    void *pinned_v;
+    if (toku_cachetable_maybe_get_and_pin(brt->cf, BNC_DISKOFF(node, childnum), &pinned_v) != 0) {
+	return 0;
+    }
+    BRTNODE child = pinned_v;
+    for (;;) {
+	BRT_CMD_S cmd;
+	DBT key, val;
+	int r = toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val);
+	if (r != 0) break;           // nothing left to peek at
+	r = brtnode_put(brt, child, &cmd, logger, WEAK);
+	if (r == EAGAIN) break;      // child cannot take more weakly; leave the command queued
+	if (r == 0) {
+	    r = toku_fifo_deq(BNC_BUFFER(node, childnum));
+	}
+	if (r != 0) {
+	    unpin_brtnode(brt, child);
+	    return r;
+	}
+    }
+    return unpin_brtnode(brt, child);
+}
+
+// If the buffers are too big, push stuff down.  The subchild may need to be split, in which case our fanout may get too large.
+// When we are done, this node has little enough stuff in its buffers (but the fanout may be too large), and all the descendant
+// nodes are properly sized (the buffer sizes and fanouts are all small enough).
+// Returns 0 on success (always 0 for a leaf), otherwise the first nonzero error code encountered.
+static int push_down_if_buffers_too_full(BRT brt, BRTNODE node, TOKULOGGER logger) {
+    if (node->height==0) return 0; // can't push down for leaf nodes
+
+    // Keep going while there is buffered data and the serialized node would not fit in nodesize.
+    while (node->u.n.n_bytes_in_buffers > 0 && toku_serialize_brtnode_size(node)>node->nodesize) {
+	int childnum;
+	find_heaviest_child(node, &childnum);
+	void *child_v;
+	int r = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node, childnum), &child_v, NULL,
+					    toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
+	if (r!=0) return r;
+	BRTNODE child=child_v;
+	// Error exit that is only reachable through "goto died": unpin the child and return r.
+	if (0) { died: unpin_brtnode(brt, child); return r; }
+	BRT_CMD_S cmd;
+	DBT key,val;
+	// NOTE(review): no unpin of `child` is visible in this function when this inner loop ends
+	// because the fifo is empty -- confirm where the child is unpinned on that path.
+	while (0==toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
+	    // NOTE(review): the command is dequeued *before* the weak put is attempted.  If the put
+	    // returns EAGAIN, nothing here re-enqueues it, and the peek below looks at whatever is
+	    // then at the head.  Also confirm cmd/key/val stay valid after the deq.
+	    r=toku_fifo_deq(BNC_BUFFER(node, childnum));
+	    assert(r==0); // we just did a peek, so the buffer must be nonempty
+	    r=brtnode_put(brt, child, &cmd, logger, WEAK);
+	    if (r!=EAGAIN && r!=0) goto died;
+	    if (r==EAGAIN) {
+		// Weak pushes ran out of steam.  Now do a strong push if there is still something in the buffer.
+		if (0==toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
+		    r=brtnode_put(brt, child, &cmd, logger, STRONG);
+		    if (r!=0) goto died;
+		    r=toku_fifo_deq(BNC_BUFFER(node, childnum));
+		    if (r!=0) goto died;
+		    // Now it's possible that the child must be split.  (Or maybe the child managed to flush stuff to our grandchildren)
+		    int n_children_replacing_child;
+		    r=maybe_split_nonroot(brt, node, childnum, child, &n_children_replacing_child, logger);
+		    if (r!=0) return r; // don't go to died since that unpins
+		    // NOTE(review): per its declaration comment, maybe_split_nonroot unpins the child (or its
+		    // replacements) either way, yet the enclosing loop keeps using `child` -- confirm.
+		    int i;
+		    // Weakly push into each node that now stands where the child used to be.
+		    for (i=0; i<n_children_replacing_child; i++) {
+			r=weak_push_to_child(brt, node, childnum+i, logger);
+			if (r!=0) return r;
+		    }
+		    // we basically pushed as much as we could to that child
+		}
+	    } 
+	}
+    }
+    return 0;
+}
+
+static int nonleaf_node_is_too_wide (BRT, BRTNODE);
+
+// Deal with a CHILD (child number CHILDNUM of NODE) that may have become too big:
+//  1. push CHILD's buffers down via push_down_if_buffers_too_full();
+//  2. if CHILD is a nonleaf that is now too wide, split it, make room in NODE for the new nodes,
+//     and redistribute the commands from NODE's old fifo for CHILD among the new children's fifos.
+// A leaf CHILD is not handled yet (abort()).
+// Returns 0 on success or a nonzero error code.
+// NOTE(review): this function looks like work in progress; see the notes below before enabling it.
+// NOTE(review): the signature comment promises to unpin the child, but no unpin is visible in this body.
+static int maybe_fixup_fat_child(BRT brt, BRTNODE node, int childnum, BRTNODE child, TOKULOGGER logger) // If the node is too big then deal with it.  Unpin the child (or children if it splits)  NODE may be too big at the end
+{
+    int r = push_down_if_buffers_too_full(brt, child, logger);
+    if (r!=0) return r;
+    // now the child may have too much fanout.
+    if (child->height>0) {
+	if (nonleaf_node_is_too_wide(brt, child)) {
+	    int n_new_nodes; BRTNODE *new_nodes; DBT *splitks;
+	    // NOTE(review): called with five arguments; the split_nonleaf_node prototype visible further
+	    // down this file takes four and is declared after this use.
+	    // NOTE(review): splitks is neither used nor freed below, so the new pivot keys are never stored in NODE.
+	    if ((r=split_nonleaf_node(brt, child,  &n_new_nodes, &new_nodes, &splitks))) return r;
+	    int i;
+	    int old_n_children = node->u.n.n_children;
+	    FIFO old_fifo = BNC_BUFFER(node, childnum);
+	    // Grow childinfos so that one child can be replaced by n_new_nodes children.
+	    // NOTE(review): the sizeof names "struct brt_nonleaf_childinfo", while the element type used
+	    // everywhere else is "struct brtnode_nonleaf_childinfo".  A failed realloc is not handled.
+	    node->u.n.childinfos = toku_realloc(node->u.n.childinfos, (old_n_children+n_new_nodes-1) * sizeof(struct brt_nonleaf_childinfo));
+	    // slide the children over
+	    for (i=old_n_children-1; i>childnum; i--)
+		node->u.n.childinfos[i+n_new_nodes-1] = node->u.n.childinfos[i];
+	    // fill in the new children
+	    // (i==childnum here.)  NOTE(review): this fills n_new_nodes-1 slots, so slot
+	    // childnum+n_new_nodes-1 is never assigned -- possible off-by-one, confirm.
+	    for (; i<childnum+n_new_nodes-1; i++) {
+		node->u.n.childinfos[i] = (struct brtnode_nonleaf_childinfo) { .subtree_fingerprint = 0,
+									       .diskoff = new_nodes[i-childnum]->thisnodename,
+									       .n_bytes_in_buffer = 0 };
+		r=toku_fifo_create(&BNC_BUFFER(node, i)); // NOTE(review): r is not checked here
+	    }
+	    // slide the keys over
+	    node->u.n.childkeys = toku_realloc(node->u.n.childkeys, (old_n_children+n_new_nodes-2 ) * sizeof(node->u.n.childkeys[0]));
+	    // NOTE(review): `cnum` is not declared in this function; the loop initialises `i` but tests
+	    // and decrements `cnum`.  The intended index range also needs checking.
+	    for (i=node->u.n.n_children; cnum>=childnum; cnum--) {
+		node->u.n.childkeys[cnum+n_new_nodes-1] = node->u.n.childkeys[cnum];
+	    }
+	    // NOTE(review): node->u.n.n_children is never updated to the new child count in this body.
+	    // fix up fingerprints
+	    for (i=0; i<n_new_nodes; i++) {
+		fixup_child_fingerprint(node, childnum+i, new_nodes[i], brt, logger);
+	    }
+	    toku_free(new_nodes);
+	    // now everything in the fifos must be put again
+	    BRT_CMD_S cmd;
+	    DBT key,val;
+	    // NOTE(review): "0=" assigns to a constant and will not compile; "0==" was presumably intended.
+	    while (0=toku_fifo_peek_deq_cmdstruct(old_fifo, &cmd, &key, &val)) {
+		// Route the command to the first new child whose pivot is >= the command's key.
+		for (i=childnum; i<childnum+n_new_nodes-1; i++) {
+		    // NOTE(review): `t` is not a parameter here (the BRT is named `brt`), and `cmd` is a
+		    // BRT_CMD_S value, so "cmd->" does not match its declaration.
+		    int cmp = brt_compare_pivot(t, cmd->u.id.key, 0, node->u.n.childkeys[i]);
+		    if (cmp<=0) {
+			r=toku_fifo_enq_cmdstruct(BNC_BUFFER(node, i), cmd);
+			if (r!=0) return r;
+			if (cmd->type!=DELETE || 0==(t->flags&TOKU_DB_DUPSORT)) goto filled; // we only need to put one in
+		    }
+		}
+		// No earlier child took it (or it is a DELETE in a DUPSORT tree that must go on): give it to child i.
+		r=toku_fifo_enq_cmdstruct(BNC_BUFFER(node, i), cmd);
+		if (r!=0) return r;
+	    filled: /*nothing*/;
+	    }
+	    r=toku_fifo_free(&old_fifo);
+	    if (r!=0) return r;
+	}
+    } else {
+	abort(); // if a leaf is too fat need to split it.
+    }
+    return 0;
+}

 // There are two kinds of puts:  
 //  A "weak" put that is guaranteed to trigger no I/O, and will not leave the node overfull.
@ -507,40 +650,6 @@ static int brt_leaf_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS
    return EINVAL; //  if none of the cases match, then the command is messed up.
 }

-static int brt_leaf_strong_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger) {
-    FILENUM filenum = toku_cachefile_filenum(t->cf);
-    switch (cmd->type) {
-    case BRT_INSERT: {
-        int r = toku_pma_strong_insert_or_replace(node->u.l.buffer,
-						  cmd->u.id.key, cmd->u.id.val,
-						  logger, cmd->xid,
-						  filenum, node->thisnodename, node->rand4fingerprint, &node->local_fingerprint,
-						  &node->log_lsn, &node->u.l.n_bytes_in_buffer);
-	assert(r==0);
-	node->dirty=1;
-	return 0;
-    }
-    case BRT_DELETE: {
-        int r = toku_pma_delete_fixupsize(node->u.l.buffer, cmd->u.id.key, (DBT*)0,
-					  logger, cmd->xid, node->thisnodename,
-					  node->rand4fingerprint, &node->local_fingerprint, &node->log_lsn, &node->u.l.n_bytes_in_buffer);
-	if (r==0) node->dirty=1;
-	if (r==DB_NOTFOUND) r=0;
-	return r;
-    }
-    case BRT_DELETE_BOTH: {
-        int r = toku_pma_delete_fixupsize(node->u.l.buffer, cmd->u.id.key, cmd->u.id.val,
-					  logger, cmd->xid, node->thisnodename,
-					  node->rand4fingerprint, &node->local_fingerprint, &node->log_lsn,&node->u.l.n_bytes_in_buffer);
-        if (r == 0) node->dirty = 1;
-	if (r == DB_NOTFOUND) r=0;
-        return r;
-    }
-    case BRT_NONE: return 0;
-    }
-    return EINVAL; //  if none of the cases match, then the command is messed up.
-}
-
 // Put an command in a particular child's fifo.
 // If weak_p then do it without doing I/O or overfilling the child.
 //   If the child is in main memory and we can do a weak put on the child, then push into the child.
@ -561,7 +670,7 @@ static int brt_nonleaf_put_cmd_to_child (BRT t, BRTNODE node, int childnum, BRT_
 		r = unpin_brtnode(t, child);
 		if (r!=0) return r; // node is still OK
 	    } else if (r==0) {
-		return maybe_fixup_fat_child(t, node, childnum, child, logger); // If the node is too big then deal with it.  Unpin the child.  NODE may be too big
+		return maybe_fixup_fat_child(t, node, childnum, child, logger); // If the node is too big then deal with it.  Unpin the child.  NODE may be too big.  I think the only way a node can get fat is if weak_p==STRONG.
 	    } else {
 		unpin_brtnode(t, child);
 		return r; // node is still OK
@ -650,11 +759,9 @@ static int brt_nonleaf_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
        return EINVAL;
 }

-// Put the command into the node.   For leaf nodes, that means execute the command.
-// For internal nodes, just put it into the fifo, unless the appropriate child is in main memory and has a place to put the command without getting too big.
-// The node could end up overfull (but the children cannot get too big)
-// However, if you precalculate that the node is big enough, then the node will not get too big.
-//  (This implies that none of the children will overflow since we precalculate before calling this function on a child.)
+// Put the command into the node.
+// If weak_p is set then neither the node nor any descendants will get too big, and no I/O will occur.
+// if !weak_p then I/O could occur and the node could end up with too much fanout.  (But the children will all be properly sized)
 static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS weak_p) {
    if (node->height==0) {
 	return brt_leaf_put(t, node, cmd, logger, weak_p);
@ -662,6 +769,7 @@ static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS
 	return brt_nonleaf_put(t, node, cmd, logger, weak_p);
    }
 }
+#ifdef FOO

 static void verify_local_fingerprint_nonleaf (BRTNODE node) {
    u_int32_t fp=0;
@ -1083,55 +1191,12 @@ static int brt_init_new_root(BRT brt, int n_new_nodes, BRTNODE *new_nodes, DBT *
    return 0;
 }

-static int nonleaf_node_is_too_wide (BRT, BRTNODE);
 static int split_nonleaf_node(BRT, int *n_new_nodes, BRTNODE **new_nodes, DBT **splitks);
 static int leaf_node_is_too_full (BRT, BRTNODE);
-// If CHILD is too wide, split it, and create a new node with the new children.  Unpin CHILD or the new children (even if something goes wrong).
-// If it does split, unpin the new root node also.
-static int maybe_split_root(BRT brt, BRTNODE child, CACHEKEY *rootp, TOKULOGGER logger);
-// if CHILD is too wide, split it, and fix up NODE.  Either way, unpin the child or resulting children (even if it fails do the unpin)
-static int maybe_split_nonroot (BRT brt, BRTNODE node, int childnum, BRTNODE child, TOKULOGGER logger);
+
 // push things down into node's children (and into their children and so forth) but don't make any descendant too big.
 static int push_down_without_overfilling (BRT brt, BRTNODE node, TOKULOGGER logger);

-// If the buffers are too big, push stuff down.  The subchild may need to be split, in which case our fanout may get too large.
-// When we are done, this node has little enough stuff in its buffers (but the fanout may be too large), and all the descendant
-// nodes are properly sized (the buffer sizes and fanouts are all small enough).
-static int push_down_if_buffers_too_full(BRT brt, BRTNODE node, TOKULOGGER logger) {
-    if (node->height==0) return 0; // can't push down for leaf nodes
-
-    while (node->u.n.n_bytes_in_buffers > 0 && toku_serialize_brtnode_size(node)>node->nodesize) {
-	int childnum;
-	find_heaviest_child(node, &childnum);
-	void *child_v;
-	int r = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node, childnum), &child_v, NULL,
-					    toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
-	if (r!=0) return r;
-	BRTNODE child=child_v;
-	if (0) { died: unpin_brtnode(brt, child); return r; }
-	BRT_CMD_S cmd;
-	DBT key,val;
-	while (0==toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
-	    r=toku_fifo_deq(BNC_BUFFER(node, childnum));
-	    assert(r==0); // we just did a peek, so the buffer must be nonempty
-	    r=brtnode_put_cmd_no_io(brt, child, &cmd, logger); if (r!=0) goto died;
-	    if (toku_serialize_brtnode_size(child)>child->nodesize) {
-		// The child got too big, so do the fixup on the child
-		r = push_down_if_buffers_too_full(brt, child, logger); if (r!=0) goto died;
-		// After the split_nonroot call, the children are all unpinned... 
-		r = maybe_split_nonroot(brt, node, childnum, child, logger);
-		if (r!=0) return r; // so on error just return r instead of going to died.
-		r =push_down_without_overfilling(brt, node, logger);
-		if (r!=0) return r;
-		// We hope that NODE is now not too full.  One can imagine cases where it is too full, however, so we 
-		// stop popping from this fifo, and go around the outer while loop to look at the node to see if it is too big again.
-		break;
-	    }
-	}
-    }
-    return 0;
-}
-
 // Push data toward a child.  If the child gets too big then the child will push down or split.
 // If a split happens, then return immediately so that we can check to see if NODE needs to be split
 static int flush_toward_child (BRT brt, BRTNODE node, int childnum, TOKULOGGER logger);
@ -1143,7 +1208,7 @@ static int maybe_fixup_root (BRT brt, BRTNODE node, CACHEKEY *rootp, TOKULOGGER
    maybe_reshape_internal_node:
 	while (nonleaf_node_is_too_wide(brt, node)) {
 	    int n_new_nodes; BRTNODE *new_nodes; DBT *splitks;
-	    if ((r=split_nonleaf_node(brt, &n_new_nodes, &new_nodes, &splitks))) return r;
+	    if ((r=split_nonleaf_node(brt, node, &n_new_nodes, &new_nodes, &splitks))) return r;
 	    if ((r=brt_init_new_root(brt, n_new_nodes, new_nodes, splitks, rootp, logger, &node))) return r; // unpins all the new nodes, which are all small enough
 	    // now node is still possibly too wide, hence the loop
 	}
@ -1160,6 +1225,8 @@ static int maybe_fixup_root (BRT brt, BRTNODE node, CACHEKEY *rootp, TOKULOGGER
    return 0;
 }

+#endif
+
 static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
    void *node_v;
    BRTNODE node;
@ -1344,6 +1411,7 @@ int toku_brt_dbt_set_value(BRT brt, DBT *ybt, bytevec val, ITEMLEN vallen) {
    return r;
 }

+#ifdef FOO
 /* search in a node's child */
 static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, TOKULOGGER logger) {
    int r, rr;
@ -1834,3 +1902,4 @@ int toku_brt_height_of_root(BRT brt, int *height) {
    r = toku_unpin_brt_header(brt); assert(r==0);
    return 0;
 }
+#endif
--- a/newbrt/memory.h
+++ b/newbrt/memory.h
@ -42,6 +42,8 @@ void *toku_realloc(void *, size_t size);
 */
 #define MALLOC_N(n,v) v = toku_malloc((n)*sizeof(*v))

+#define REALLOC_N(n,v) v = toku_realloc(v, (n)*sizeof(*v)) /* Resize v to hold n elements of size sizeof(*v) each, storing toku_realloc's result back into v.  v is expanded more than once, so pass a side-effect-free lvalue. */
+
 /* If you have a type such as 
 *    struct pma *PMA;
 * and you define a corresponding int constant, such as