diff --git a/buildheader/db.h_4_1 b/buildheader/db.h_4_1 index 5a118a5de19..082c0c7aa65 100644 --- a/buildheader/db.h_4_1 +++ b/buildheader/db.h_4_1 @@ -421,12 +421,14 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[13]; + void* __toku_dummy0[11]; char __toku_dummy1[96]; void *api_internal; /* 32-bit offset=236 size=4, 64=bit offset=376 size=8 */ void* __toku_dummy2[5]; diff --git a/buildheader/db.h_4_3 b/buildheader/db.h_4_3 index bc38627a4ee..c297040e065 100644 --- a/buildheader/db.h_4_3 +++ b/buildheader/db.h_4_3 @@ -431,12 +431,14 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[16]; + void* __toku_dummy0[14]; char __toku_dummy1[96]; void *api_internal; /* 32-bit offset=248 size=4, 64=bit offset=400 size=8 */ void* __toku_dummy2[5]; diff --git a/buildheader/db.h_4_4 b/buildheader/db.h_4_4 index 0c24f47aca4..9ce7f568e43 100644 --- a/buildheader/db.h_4_4 +++ b/buildheader/db.h_4_4 @@ -433,12 +433,14 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. 
Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[18]; + void* __toku_dummy0[16]; char __toku_dummy1[96]; void *api_internal; /* 32-bit offset=256 size=4, 64=bit offset=416 size=8 */ void* __toku_dummy2[5]; diff --git a/buildheader/db.h_4_5 b/buildheader/db.h_4_5 index a1f6f89ea5f..2fbb02960eb 100644 --- a/buildheader/db.h_4_5 +++ b/buildheader/db.h_4_5 @@ -433,12 +433,14 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[21]; + void* __toku_dummy0[19]; char __toku_dummy1[96]; void *api_internal; /* 32-bit offset=268 size=4, 64=bit offset=440 size=8 */ void* __toku_dummy2[5]; diff --git a/buildheader/db.h_4_6 b/buildheader/db.h_4_6 index 44b560e2fec..32836e23654 100644 --- a/buildheader/db.h_4_6 +++ b/buildheader/db.h_4_6 @@ -436,12 +436,14 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy1[25]; + void* __toku_dummy1[23]; char __toku_dummy2[80]; void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */ void* __toku_dummy3[5]; diff --git a/buildheader/make_db_h.c b/buildheader/make_db_h.c index 621b7c65f69..4f19ef7ad5c 100644 --- a/buildheader/make_db_h.c +++ b/buildheader/make_db_h.c @@ -684,6 +684,8 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__ "int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */", "int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. 
Amortized (happens during flattening) */", "int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION)", + "int (*get_readpagesize)(DB*,u_int32_t*)", + "int (*set_readpagesize)(DB*,u_int32_t)", "int (*set_indexer)(DB*, DB_INDEXER*)", "void (*get_indexer)(DB*, DB_INDEXER**)", "int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going)", diff --git a/buildheader/tdb.h b/buildheader/tdb.h index abba5c52e35..3439c7793f5 100644 --- a/buildheader/tdb.h +++ b/buildheader/tdb.h @@ -405,6 +405,8 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); diff --git a/include/db.h b/include/db.h index abba5c52e35..3439c7793f5 100644 --- a/include/db.h +++ b/include/db.h @@ -405,6 +405,8 @@ struct __toku_db { int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); + int (*get_readpagesize)(DB*,u_int32_t*); + int (*set_readpagesize)(DB*,u_int32_t); int (*set_indexer)(DB*, DB_INDEXER*); void (*get_indexer)(DB*, DB_INDEXER**); int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index d7e64643aa0..c7b4e14e589 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -41,6 +41,7 @@ enum { BRT_CMD_OVERHEAD = (1 + sizeof(MSN)) // the type plus MSN }; enum { BRT_DEFAULT_NODE_SIZE = 1 << 22 }; +enum { BRT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 }; struct nodeheader_in_file { int n_in_buffer; @@ -389,6 +390,7 @@ struct brt_header { BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been) int64_t num_blocks_to_upgrade; // Number of v13 blocks still not newest version. When we release layout 15 we may need to turn this to an array or add more variables. unsigned int nodesize; + unsigned int basementnodesize; BLOCKNUM root; // roots of the dictionary struct remembered_hash root_hash; // hash of the root offset. 
unsigned int flags; @@ -419,6 +421,7 @@ struct brt { struct toku_list cursors; unsigned int nodesize; + unsigned int basementnodesize; unsigned int flags; BOOL did_set_flags; int (*compare_fun)(DB*,const DBT*,const DBT*); @@ -442,6 +445,7 @@ struct brt { /* serialization code */ int toku_serialize_brtnode_to_memory (BRTNODE node, + unsigned int basementnodesize, /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write); int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint); diff --git a/newbrt/brt-serialize.c b/newbrt/brt-serialize.c index 854ac6d481a..e47684928e0 100644 --- a/newbrt/brt-serialize.c +++ b/newbrt/brt-serialize.c @@ -455,8 +455,6 @@ array_item (OMTVALUE lev, u_int32_t idx, void *vsi) { return 0; } -#define BN_MAX_SIZE 128*1024 - struct sum_info { unsigned int dsum; unsigned int msum; @@ -475,7 +473,7 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) { // There must still be at least one child static void -rebalance_brtnode_leaf(BRTNODE node) +rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) { assert(node->height == 0); assert(node->dirty); @@ -499,14 +497,14 @@ rebalance_brtnode_leaf(BRTNODE node) toku_omt_iterate(curr_omt, array_item, &ai); curr_le += toku_omt_size(curr_omt); } - + // figure out the new pivots u_int32_t curr_pivot = 0; u_int32_t num_le_in_curr_bn = 0; u_int32_t bn_size_so_far = 0; for (u_int32_t i = 0; i < num_le; i++) { u_int32_t curr_size = leafentry_disksize(array[i]); - if ((bn_size_so_far + curr_size > BN_MAX_SIZE) && (num_le_in_curr_bn != 0)) { + if ((bn_size_so_far + curr_size > basementnodesize) && (num_le_in_curr_bn != 0)) { // cap off the current basement node to end with the element before i new_pivots[curr_pivot] = i-1; curr_pivot++; @@ -514,7 +512,7 @@ rebalance_brtnode_leaf(BRTNODE node) bn_size_so_far = 0; } num_le_in_curr_bn++; - bn_size_so_far += curr_size; + bn_size_so_far += curr_size; } // now we need to fill in the new basement nodes and pivots @@ -534,7 +532,6 @@ rebalance_brtnode_leaf(BRTNODE node) min_dsn = (curr_dsn.dsn < min_dsn.dsn) ? curr_dsn : min_dsn; max_msn = (curr_msn.msn > max_msn.msn) ? 
curr_msn : max_msn; } - // Now destroy the old stuff; toku_destroy_brtnode_internals(node); @@ -546,7 +543,7 @@ rebalance_brtnode_leaf(BRTNODE node) XMALLOC_N(num_children-1, node->childkeys); node->n_children = num_children; - XMALLOC_N(num_children, node->bp); + XMALLOC_N(num_children, node->bp); for (int i = 0; i < num_children; i++) { set_BLB(node, i, toku_create_empty_bn()); } @@ -581,9 +578,9 @@ rebalance_brtnode_leaf(BRTNODE node) memcpy(bn_array, &array[curr_start], num_in_bn*(sizeof(array[0]))); toku_omt_destroy(&BLB_BUFFER(node, i)); int r = toku_omt_create_steal_sorted_array( - &BLB_BUFFER(node, i), - &bn_array, - num_in_bn, + &BLB_BUFFER(node, i), + &bn_array, + num_in_bn, num_in_bn ); lazy_assert_zero(r); @@ -597,7 +594,7 @@ rebalance_brtnode_leaf(BRTNODE node) BLB_MAX_MSN_APPLIED(node,i) = max_msn; } node->max_msn_applied_to_node_on_disk = max_msn; - + // now the subtree estimates toku_brt_leaf_reset_calc_leaf_stats(node); @@ -611,16 +608,17 @@ rebalance_brtnode_leaf(BRTNODE node) // int toku_serialize_brtnode_to_memory (BRTNODE node, + unsigned int basementnodesize, /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write) { toku_assert_entire_node_in_memory(node); if (node->height == 0) { - rebalance_brtnode_leaf(node); + rebalance_brtnode_leaf(node, basementnodesize); } const int npartitions = node->n_children; - + // Each partition represents a compressed sub block // For internal nodes, a sub block is a message buffer // For leaf nodes, a sub block is a basement node @@ -704,7 +702,7 @@ toku_serialize_brtnode_to_memory (BRTNODE node, toku_free(sb[i].compressed_ptr); toku_free(sb[i].uncompressed_ptr); } - + return 0; } @@ -714,7 +712,8 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h size_t n_to_write; char *compressed_buf = NULL; { - int r = toku_serialize_brtnode_to_memory (node, &n_to_write, &compressed_buf); + int r = toku_serialize_brtnode_to_memory(node, h->basementnodesize, + &n_to_write, &compressed_buf); if (r!=0) return r; } @@ -1413,6 +1412,7 @@ serialize_brt_header_min_size (u_int32_t version) { switch(version) { case BRT_LAYOUT_VERSION_15: + size += 4; // basement node size case BRT_LAYOUT_VERSION_14: size += 8; //TXNID that created case BRT_LAYOUT_VERSION_13: @@ -1420,7 +1420,7 @@ serialize_brt_header_min_size (u_int32_t version) { +4 // build_id_original +8 // time_of_creation +8 // time_of_last_modification - ); + ); // fall through case BRT_LAYOUT_VERSION_12: size += (+8 // "tokudata" @@ -1479,6 +1479,7 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h, } wbuf_ulonglong(wbuf, h->num_blocks_to_upgrade); wbuf_TXNID(wbuf, h->root_xid_that_created); + wbuf_int(wbuf, h->basementnodesize); u_int32_t checksum = x1764_finish(&wbuf->checksum); wbuf_int(wbuf, checksum); lazy_assert(wbuf->ndone == wbuf->size); @@ -1745,6 +1746,9 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) { // at this layer, this new field is the only difference between versions 13 and 14 rbuf_TXNID(&rc, &h->root_xid_that_created); } + if (h->layout_version >= BRT_LAYOUT_VERSION_15) { + h->basementnodesize = rbuf_int(&rc); + } (void)rbuf_int(&rc); //Read in checksum and ignore (already verified). 
if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;} toku_free(rc.buf); @@ -1795,6 +1799,8 @@ deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **br upgrade++; //Fall through on purpose case BRT_LAYOUT_VERSION_14: + h->basementnodesize = 128*1024; // basement nodes added in v15 + //fall through on purpose case BRT_LAYOUT_VERSION_15: invariant(h->layout_version == BRT_LAYOUT_VERSION); h->upgrade_brt_performed = FALSE; diff --git a/newbrt/brt.c b/newbrt/brt.c index 2fc31eab739..5d8d2bca515 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -1024,6 +1024,7 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children) { assert(t->h->nodesize > 0); + assert(t->h->basementnodesize > 0); if (height == 0) assert(n_children > 0); @@ -3493,7 +3494,7 @@ toku_brt_maybe_delete(BRT brt, DBT *key, TOKUTXN txn, BOOL oplsn_valid, LSN opls r = toku_log_enq_delete_any(logger, (LSN*)0, 0, toku_cachefile_filenum(brt->cf), xid, keybs); if (r!=0) return r; } - + LSN treelsn; if (oplsn_valid && oplsn.lsn <= (treelsn = toku_brt_checkpoint_lsn(brt)).lsn) { r = 0; @@ -3514,7 +3515,7 @@ toku_brt_send_delete(BRT brt, DBT *key, XIDS xids) { /* ******************** open,close and create ********************** */ // Test only function (not used in running system). This one has no env -int toku_open_brt (const char *fname, int is_create, BRT *newbrt, int nodesize, CACHETABLE cachetable, TOKUTXN txn, +int toku_open_brt (const char *fname, int is_create, BRT *newbrt, int nodesize, int basementnodesize, CACHETABLE cachetable, TOKUTXN txn, int (*compare_fun)(DB*,const DBT*,const DBT*), DB *db) { BRT brt; int r; @@ -3524,6 +3525,7 @@ int toku_open_brt (const char *fname, int is_create, BRT *newbrt, int nodesize, if (r != 0) return r; r = toku_brt_set_nodesize(brt, nodesize); assert_zero(r); + r = toku_brt_set_basementnodesize(brt, basementnodesize); assert_zero(r); r = toku_brt_set_bt_compare(brt, compare_fun); assert_zero(r); r = toku_brt_open(brt, fname, is_create, only_create, cachetable, txn, db); @@ -3624,20 +3626,21 @@ brtheader_log_suppress_rollback_during_checkpoint (CACHEFILE cf, void *header_v) static int brtheader_note_pin_by_checkpoint (CACHEFILE cachefile, void *header_v); static int brtheader_note_unpin_by_checkpoint (CACHEFILE cachefile, void *header_v); -static int +static int brt_init_header_partial (BRT t, TOKUTXN txn) { int r; t->h->flags = t->flags; if (t->h->cf!=NULL) assert(t->h->cf == t->cf); t->h->cf = t->cf; t->h->nodesize=t->nodesize; + t->h->basementnodesize=t->basementnodesize; t->h->num_blocks_to_upgrade = 0; t->h->root_xid_that_created = txn ? 
txn->ancestor_txnid64 : TXNID_NONE; compute_and_fill_remembered_hash(t); - t->h->root_put_counter = global_root_put_counter++; - + t->h->root_put_counter = global_root_put_counter++; + BLOCKNUM root = t->h->root; if ((r=setup_initial_brt_root_node(t, root))!=0) { return r; } //printf("%s:%d putting %p (%d)\n", __FILE__, __LINE__, t->h, 0); @@ -3890,7 +3893,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET if (r==ENOENT && is_create) { toku_cachetable_reserve_filenum(cachetable, &reserved_filenum, use_reserved_filenum, reserved_filenum); if (0) { - died1: + died1: if (did_create) toku_cachetable_unreserve_filenum(cachetable, reserved_filenum); goto died0; @@ -3904,7 +3907,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET if (r != 0) goto died1; } txn_created = (BOOL)(txn!=NULL); - r = toku_logger_log_fcreate(txn, fname_in_env, reserved_filenum, mode, t->flags, t->nodesize); + r = toku_logger_log_fcreate(txn, fname_in_env, reserved_filenum, mode, t->flags, t->nodesize, t->basementnodesize); if (r!=0) goto died1; r = brt_create_file(t, fname_in_cwd, &fd); } @@ -3912,13 +3915,13 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET fname_in_cwd = NULL; if (r != 0) goto died1; // TODO: #2090 - r=toku_cachetable_openfd_with_filenum(&t->cf, cachetable, fd, + r=toku_cachetable_openfd_with_filenum(&t->cf, cachetable, fd, fname_in_env, use_reserved_filenum||did_create, reserved_filenum, did_create); if (r != 0) goto died1; } if (r!=0) { - died_after_open: + died_after_open: toku_cachefile_close(&t->cf, 0, FALSE, ZERO_LSN); goto died1; } @@ -3948,6 +3951,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET if ((r = toku_read_brt_header_and_store_in_cachefile(t->cf, max_acceptable_lsn, &t->h, &was_already_open))!=0) goto died_after_open; found_it: t->nodesize = t->h->nodesize; /* inherit the pagesize from the file */ + t->basementnodesize = t->h->basementnodesize; if (!t->did_set_flags) { r = verify_builtin_comparisons_consistent(t, t->flags); if (r!=0) goto died_after_read_and_pin; @@ -4044,6 +4048,8 @@ brt_open_for_redirect(BRT *new_brtp, const char *fname_in_env, TOKUTXN txn, BRT assert_zero(r); r = toku_brt_set_nodesize(t, old_brt->nodesize); assert_zero(r); + r = toku_brt_set_basementnodesize(t, old_brt->basementnodesize); + assert_zero(r); CACHETABLE ct = toku_cachefile_get_cachetable(old_brt->cf); r = brt_open(t, fname_in_env, 0, 0, ct, txn, old_brt->db, FILENUM_NONE, old_h->dict_id, MAX_LSN); assert_zero(r); @@ -4339,6 +4345,16 @@ int toku_brt_get_nodesize(BRT brt, unsigned int *nodesize) { return 0; } +int toku_brt_set_basementnodesize(BRT brt, unsigned int basementnodesize) { + brt->basementnodesize = basementnodesize; + return 0; +} + +int toku_brt_get_basementnodesize(BRT brt, unsigned int *basementnodesize) { + *basementnodesize = brt->basementnodesize; + return 0; +} + int toku_brt_set_bt_compare(BRT brt, int (*bt_compare)(DB *, const DBT*, const DBT*)) { brt->compare_fun = bt_compare; return 0; @@ -4683,6 +4699,7 @@ int toku_brt_create(BRT *brt_ptr) { brt->flags = 0; brt->did_set_flags = FALSE; brt->nodesize = BRT_DEFAULT_NODE_SIZE; + brt->basementnodesize = BRT_DEFAULT_BASEMENT_NODE_SIZE; brt->compare_fun = toku_builtin_compare_fun; brt->update_fun = NULL; int r = toku_omt_create(&brt->txns); diff --git a/newbrt/brt.h b/newbrt/brt.h index 720306f0266..ba5fb80cedf 100644 --- a/newbrt/brt.h +++ b/newbrt/brt.h @@ -28,7 +28,7 @@ C_BEGIN //-infinity 
depending on direction) typedef int(*BRT_GET_CALLBACK_FUNCTION)(ITEMLEN, bytevec, ITEMLEN, bytevec, void*); -int toku_open_brt (const char *fname, int is_create, BRT *, int nodesize, CACHETABLE, TOKUTXN, int(*)(DB*,const DBT*,const DBT*), DB*) __attribute__ ((warn_unused_result)); +int toku_open_brt (const char *fname, int is_create, BRT *, int nodesize, int basementnodesize, CACHETABLE, TOKUTXN, int(*)(DB*,const DBT*,const DBT*), DB*) __attribute__ ((warn_unused_result)); int toku_brt_change_descriptor(BRT t, const DBT* old_descriptor, const DBT* new_descriptor, BOOL do_log, TOKUTXN txn); int toku_update_descriptor(struct brt_header * h, DESCRIPTOR d, int fd); @@ -44,6 +44,8 @@ int toku_brt_set_flags(BRT, unsigned int flags) __attribute__ ((warn_unused_res int toku_brt_get_flags(BRT, unsigned int *flags) __attribute__ ((warn_unused_result)); int toku_brt_set_nodesize(BRT, unsigned int nodesize) __attribute__ ((warn_unused_result)); int toku_brt_get_nodesize(BRT, unsigned int *nodesize) __attribute__ ((warn_unused_result)); +int toku_brt_set_basementnodesize(BRT, unsigned int basementnodesize) __attribute__ ((warn_unused_result)); +int toku_brt_get_basementnodesize(BRT, unsigned int *basementnodesize) __attribute__ ((warn_unused_result)); int toku_brt_set_bt_compare(BRT, brt_compare_func) __attribute__ ((warn_unused_result)); brt_compare_func toku_brt_get_bt_compare (BRT brt); diff --git a/newbrt/brt_layout_version.h b/newbrt/brt_layout_version.h index bebcc2e5aed..a4e9659329f 100644 --- a/newbrt/brt_layout_version.h +++ b/newbrt/brt_layout_version.h @@ -18,7 +18,7 @@ enum brt_layout_version_e { BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Fixed loader pivot bug, added build_id to every node, timestamps to brtheader BRT_LAYOUT_VERSION_14 = 14, // Diff from 13 to 14: Added MVCC; deprecated TOKU_DB_VALCMP_BUILTIN(_13); Remove fingerprints; Support QUICKLZ; add end-to-end checksum on uncompressed data. - BRT_LAYOUT_VERSION_15 = 15, // Diff from 14 to 15: TODO + BRT_LAYOUT_VERSION_15 = 15, // Diff from 14 to 15: basement nodes, TODO BRT_NEXT_VERSION, // the version after the current version BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line. BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_13, // Minimum version supported diff --git a/newbrt/brtdump.c b/newbrt/brtdump.c index e72c0ec6f90..450a8590495 100644 --- a/newbrt/brtdump.c +++ b/newbrt/brtdump.c @@ -101,6 +101,7 @@ dump_header (int f, struct brt_header **header, CACHEFILE cf) { printf(" dirty=%d\n", h->dirty); printf(" nodesize=%u\n", h->nodesize); + printf(" basementnodesize=%u\n", h->basementnodesize); printf(" unnamed_root=%" PRId64 "\n", h->root.b); printf(" flags=%u\n", h->flags); dump_descriptor(&h->descriptor); diff --git a/newbrt/brtloader-internal.h b/newbrt/brtloader-internal.h index 430ecfb419b..325057cc368 100644 --- a/newbrt/brtloader-internal.h +++ b/newbrt/brtloader-internal.h @@ -188,6 +188,7 @@ struct fractal_thread_args { int errno_result; // the final result. 
int which_db; uint32_t target_nodesize; + uint32_t target_basementnodesize; }; void toku_brt_loader_set_n_rows(BRTLOADER bl, u_int64_t n_rows); @@ -220,7 +221,8 @@ int toku_loader_write_brt_from_q_in_C (BRTLOADER bl, QUEUE q, uint64_t total_disksize_estimate, int which_db, - uint32_t target_nodesize); + uint32_t target_nodesize, + uint32_t target_basementnodesize); int brt_loader_mergesort_row_array (struct row rows[/*n*/], int n, int which_db, DB *dest_db, brt_compare_func, BRTLOADER, struct rowset *); diff --git a/newbrt/brtloader.c b/newbrt/brtloader.c index 5707c9f02c0..cf34d1e3d3b 100644 --- a/newbrt/brtloader.c +++ b/newbrt/brtloader.c @@ -71,6 +71,7 @@ static size_t do_fwrite (const void *ptr, size_t size, size_t nmemb, FILE *strea // Different values for these sizes may be used for testing. static uint32_t size_factor = 1024; static uint32_t default_loader_nodesize = BRT_DEFAULT_NODE_SIZE; +static uint32_t default_loader_basementnodesize = BRT_DEFAULT_BASEMENT_NODE_SIZE; enum { EXTRACTOR_QUEUE_DEPTH = 2, FILE_BUFFER_SIZE = 1<<24, @@ -2203,12 +2204,12 @@ static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc) } CILK_BEGIN -static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progress_allocation, BRTLOADER bl); -static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const DESCRIPTOR descriptor, uint32_t target_nodesize); +static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progress_allocation, BRTLOADER bl, uint32_t target_basementnodesize); +static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const DESCRIPTOR descriptor, uint32_t target_nodesize, uint32_t target_basementnodesize); CILK_END static void add_pair_to_leafnode (struct leaf_buf *lbuf, unsigned char *key, int keylen, unsigned char *val, int vallen, int this_leafentry_size); static int write_translation_table (struct dbout *out, long long *off_of_translation_p); -static int write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk, BLOCKNUM root_blocknum_on_disk, LSN load_lsn, TXNID root_xid, uint32_t target_nodesize); +static int write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk, BLOCKNUM root_blocknum_on_disk, LSN load_lsn, TXNID root_xid, uint32_t target_nodesize, uint32_t target_basementnodesize); static void drain_writer_q(QUEUE q) { void *item; @@ -2253,7 +2254,8 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, QUEUE q, uint64_t total_disksize_estimate, int which_db, - uint32_t target_nodesize) + uint32_t target_nodesize, + uint32_t target_basementnodesize) // Effect: Consume a sequence of rowsets work from a queue, creating a fractal tree. Closes fd. { // set the number of fractal tree writer threads so that we can partition memory in the merger @@ -2354,7 +2356,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, uint64_t used_here = lbuf->off + 1000; // leave 1000 for various overheads. uint64_t target_size = (target_nodesize*7L)/8; // use only 7/8 of the node. 
uint64_t used_here_with_next_key = used_here + this_leafentry_size; - if (lbuf->nkeys > 0 && + if (lbuf->nkeys > 0 && ((used_here_with_next_key >= target_size) || (used_here + remaining_amount >= target_size && lbuf->off > remaining_amount))) { int progress_this_node = progress_allocation * (double)(old_n_rows_remaining - n_rows_remaining)/(double)old_n_rows_remaining; @@ -2372,8 +2374,8 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, if (result == 0) result = r; break; } - - cilk_spawn finish_leafnode(&out, lbuf, progress_this_node, bl); + + cilk_spawn finish_leafnode(&out, lbuf, progress_this_node, bl, target_basementnodesize); lbuf = NULL; r = allocate_block(&out, &lblock); @@ -2384,7 +2386,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, } lbuf = start_leaf(&out, descriptor, lblock, le_xid, target_nodesize); } - + add_pair_to_leafnode(lbuf, (unsigned char *) key.data, key.size, (unsigned char *) val.data, val.size, this_leafentry_size); n_rows_remaining--; @@ -2408,7 +2410,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, allocate_node(&sts, lblock, est); { int p = progress_allocation/2; - finish_leafnode(&out, lbuf, p, bl); + finish_leafnode(&out, lbuf, p, bl, target_basementnodesize); progress_allocation -= p; } } @@ -2434,7 +2436,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, } } - r = write_nonleaves(bl, pivots_file, &out, &sts, descriptor, target_nodesize); + r = write_nonleaves(bl, pivots_file, &out, &sts, descriptor, target_nodesize, target_basementnodesize); if (r) { result = r; goto error; } @@ -2466,10 +2468,10 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, if (r) { result = r; goto error; } - } - - long long off_of_translation; - r = write_translation_table(&out, &off_of_translation); + } + + long long off_of_translation; + r = write_translation_table(&out, &off_of_translation); if (r) { result = r; goto error; } @@ -2478,28 +2480,28 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, if (bl->root_xids_that_created) { root_xid_that_created = bl->root_xids_that_created[which_db]; } - r = write_header(&out, off_of_translation, (out.n_translations+1)*16+4, root_block, bl->load_lsn, root_xid_that_created, target_nodesize); + r = write_header(&out, off_of_translation, (out.n_translations+1)*16+4, root_block, bl->load_lsn, root_xid_that_created, target_nodesize, target_basementnodesize); if (r) { result = r; goto error; } - r = update_progress(progress_allocation, bl, "wrote tdb file"); + r = update_progress(progress_allocation, bl, "wrote tdb file"); if (r) { result = r; goto error; } } r = fsync(out.fd); - if (r) { - result = errno; goto error; + if (r) { + result = errno; goto error; } // Do we need to pay attention to user_said_stop? Or should the guy at the other end of the queue pay attention and send in an EOF. - error: + error: { int rr = toku_os_close(fd); - if (rr) + if (rr) result = errno; } out.fd = -1; @@ -2520,14 +2522,16 @@ int toku_loader_write_brt_from_q_in_C (BRTLOADER bl, QUEUE q, uint64_t total_disksize_estimate, int which_db, - uint32_t target_nodesize) + uint32_t target_nodesize, + uint32_t target_basementnodesize) // This is probably only for testing. { target_nodesize = target_nodesize == 0 ? default_loader_nodesize : target_nodesize; + target_basementnodesize = target_basementnodesize == 0 ? 
default_loader_basementnodesize : target_basementnodesize; #if defined(__cilkplusplus) - return cilk::run(toku_loader_write_brt_from_q, bl, descriptor, fd, progress_allocation, q, total_disksize_estimate, which_db, target_nodesize); + return cilk::run(toku_loader_write_brt_from_q, bl, descriptor, fd, progress_allocation, q, total_disksize_estimate, which_db, target_nodesize, target_basementnodesize); #else - return toku_loader_write_brt_from_q (bl, descriptor, fd, progress_allocation, q, total_disksize_estimate, which_db, target_nodesize); + return toku_loader_write_brt_from_q (bl, descriptor, fd, progress_allocation, q, total_disksize_estimate, which_db, target_nodesize, target_basementnodesize); #endif } @@ -2536,9 +2540,9 @@ static void* fractal_thread (void *ftav) { BL_TRACE(blt_start_fractal_thread); struct fractal_thread_args *fta = (struct fractal_thread_args *)ftav; #if defined(__cilkplusplus) - int r = cilk::run(toku_loader_write_brt_from_q, fta->bl, fta->descriptor, fta->fd, fta->progress_allocation, fta->q, fta->total_disksize_estimate, fta->which_db, fta->target_nodesize); + int r = cilk::run(toku_loader_write_brt_from_q, fta->bl, fta->descriptor, fta->fd, fta->progress_allocation, fta->q, fta->total_disksize_estimate, fta->which_db, fta->target_nodesize, fta->target_basementnodesize); #else - int r = toku_loader_write_brt_from_q (fta->bl, fta->descriptor, fta->fd, fta->progress_allocation, fta->q, fta->total_disksize_estimate, fta->which_db, fta->target_nodesize); + int r = toku_loader_write_brt_from_q (fta->bl, fta->descriptor, fta->fd, fta->progress_allocation, fta->q, fta->total_disksize_estimate, fta->which_db, fta->target_nodesize, fta->target_basementnodesize); #endif fta->errno_result = r; return NULL; @@ -2575,9 +2579,11 @@ static int loader_do_i (BRTLOADER bl, r = errno; goto error; } - uint32_t target_nodesize; + uint32_t target_nodesize, target_basementnodesize; r = dest_db->get_pagesize(dest_db, &target_nodesize); invariant_zero(r); + r = dest_db->get_readpagesize(dest_db, &target_basementnodesize); + invariant_zero(r); // This structure must stay live until the join below. 
struct fractal_thread_args fta = { bl, @@ -2589,6 +2595,7 @@ static int loader_do_i (BRTLOADER bl, 0, which_db, target_nodesize, + target_basementnodesize, }; r = toku_pthread_create(bl->fractal_threads+which_db, NULL, fractal_thread, (void*)&fta); @@ -2781,13 +2788,13 @@ static int write_literal(struct dbout *out, void*data, size_t len) { } CILK_BEGIN -static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progress_allocation, BRTLOADER bl) { +static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progress_allocation, BRTLOADER bl, uint32_t target_basementnodesize) { int result = 0; // serialize leaf to buffer size_t serialized_leaf_size = 0; char *serialized_leaf = NULL; - result = toku_serialize_brtnode_to_memory(lbuf->node, &serialized_leaf_size, &serialized_leaf); + result = toku_serialize_brtnode_to_memory(lbuf->node, target_basementnodesize, &serialized_leaf_size, &serialized_leaf); // write it out if (result == 0) { @@ -2846,9 +2853,9 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla } -static int -write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk, BLOCKNUM root_blocknum_on_disk, - LSN load_lsn, TXNID root_xid_that_created, uint32_t target_nodesize) { +static int +write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk, BLOCKNUM root_blocknum_on_disk, + LSN load_lsn, TXNID root_xid_that_created, uint32_t target_nodesize, uint32_t target_basementnodesize) { int result = 0; struct brt_header h; memset(&h, 0, sizeof h); @@ -2860,6 +2867,7 @@ write_header (struct dbout *out, long long translation_location_on_disk, long lo h.checkpoint_count = 1; h.checkpoint_lsn = load_lsn; h.nodesize = target_nodesize; + h.basementnodesize = target_basementnodesize; h.root = root_blocknum_on_disk; h.flags = 0; h.root_xid_that_created = root_xid_that_created; @@ -2984,7 +2992,7 @@ CILK_BEGIN static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknum_of_new_node, int n_children, DBT *pivots, /* must free this array, as well as the things it points t */ - struct subtree_info *subtree_info, int height, const DESCRIPTOR UU(desc), uint32_t target_nodesize) + struct subtree_info *subtree_info, int height, const DESCRIPTOR UU(desc), uint32_t target_nodesize, uint32_t target_basementnodesize) { //Nodes do not currently touch descriptors invariant(height > 0); @@ -3016,7 +3024,7 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu size_t n_bytes; char *bytes; int r; - r = toku_serialize_brtnode_to_memory(node, &n_bytes, &bytes); + r = toku_serialize_brtnode_to_memory(node, target_basementnodesize, &n_bytes, &bytes); if (r) { result = r; } else { @@ -3054,7 +3062,7 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu brt_loader_set_panic(bl, result, TRUE); } -static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const DESCRIPTOR descriptor, uint32_t target_nodesize) { +static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const DESCRIPTOR descriptor, uint32_t target_nodesize, uint32_t target_basementnodesize) { int result = 0; int height = 1; @@ -3102,7 +3110,7 @@ static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, s result = r; break; } else { - cilk_spawn write_nonleaf_node(bl, out, blocknum_of_new_node, n_per_block, 
pivots, subtree_info, height, descriptor, target_nodesize); // frees all the data structures that go into making the node. + cilk_spawn write_nonleaf_node(bl, out, blocknum_of_new_node, n_per_block, pivots, subtree_info, height, descriptor, target_nodesize, target_basementnodesize); // frees all the data structures that go into making the node. n_subtrees_used += n_per_block; } } @@ -3125,7 +3133,7 @@ static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, s if (r) { result = r; } else { - cilk_spawn write_nonleaf_node(bl, out, blocknum_of_new_node, n_first, pivots, subtree_info, height, descriptor, target_nodesize); + cilk_spawn write_nonleaf_node(bl, out, blocknum_of_new_node, n_first, pivots, subtree_info, height, descriptor, target_nodesize, target_basementnodesize); n_blocks_left -= n_first; n_subtrees_used += n_first; } @@ -3144,7 +3152,7 @@ static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, s if (r) { result = r; } else { - cilk_spawn write_nonleaf_node(bl, out, blocknum_of_new_node, n_blocks_left, pivots, subtree_info, height, descriptor, target_nodesize); + cilk_spawn write_nonleaf_node(bl, out, blocknum_of_new_node, n_blocks_left, pivots, subtree_info, height, descriptor, target_nodesize, target_basementnodesize); n_subtrees_used += n_blocks_left; } } diff --git a/newbrt/logformat.c b/newbrt/logformat.c index 33a8d2cc070..cc82f60494e 100644 --- a/newbrt/logformat.c +++ b/newbrt/logformat.c @@ -129,6 +129,7 @@ const struct logtype logtypes[] = { {"u_int32_t", "mode", "0%o"}, {"u_int32_t", "treeflags", 0}, {"u_int32_t", "nodesize", 0}, + {"u_int32_t", "basementnodesize", 0}, NULLFIELD}}, //TODO: #2037 Add dname {"fopen", 'O', FA{{"BYTESTRING", "iname", 0}, diff --git a/newbrt/logger.c b/newbrt/logger.c index 0f41a9587ca..6d953131d11 100644 --- a/newbrt/logger.c +++ b/newbrt/logger.c @@ -846,12 +846,12 @@ int toku_logger_restart(TOKULOGGER logger, LSN lastlsn) } // fname is the iname -int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, FILENUM filenum, u_int32_t mode, u_int32_t treeflags, u_int32_t nodesize) { +int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, FILENUM filenum, u_int32_t mode, u_int32_t treeflags, u_int32_t nodesize, u_int32_t basementnodesize) { if (txn==0) return 0; if (txn->logger->is_panicked) return EINVAL; BYTESTRING bs_fname = { .len=strlen(fname), .data = (char *) fname }; // fsync log on fcreate - int r = toku_log_fcreate (txn->logger, (LSN*)0, 1, toku_txn_get_txnid(txn), filenum, bs_fname, mode, treeflags, nodesize); + int r = toku_log_fcreate (txn->logger, (LSN*)0, 1, toku_txn_get_txnid(txn), filenum, bs_fname, mode, treeflags, nodesize, basementnodesize); return r; } diff --git a/newbrt/logger.h b/newbrt/logger.h index bd31ec4a0dc..f2143dd13ef 100644 --- a/newbrt/logger.h +++ b/newbrt/logger.h @@ -57,7 +57,7 @@ int toku_logger_restart(TOKULOGGER logger, LSN lastlsn); // Returns: 0 if success int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN oldest_open_lsn); -int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, FILENUM filenum, u_int32_t mode, u_int32_t flags, u_int32_t nodesize); +int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, FILENUM filenum, u_int32_t mode, u_int32_t flags, u_int32_t nodesize, u_int32_t basementnodesize); int toku_logger_log_fdelete (TOKUTXN txn, const char *fname); int toku_logger_log_fopen (TOKUTXN txn, const char * fname, FILENUM filenum, uint32_t treeflags); diff --git a/newbrt/recover.c b/newbrt/recover.c index 237c27d872c..57e654d96ef 
100644 --- a/newbrt/recover.c +++ b/newbrt/recover.c @@ -221,7 +221,7 @@ static void recover_env_cleanup (RECOVER_ENV renv, BOOL recovery_succeeded) { assert(r==0); r = toku_logger_close(&renv->logger); assert(r == 0); - + r = toku_cachetable_close(&renv->ct); assert(r == 0); @@ -240,8 +240,8 @@ static void recover_yield(voidfp f, void *fpthunk, void *UU(yieldthunk)) { } // Open the file if it is not already open. If it is already open, then do nothing. -static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create, int mode, BYTESTRING *bs_iname, FILENUM filenum, u_int32_t treeflags, - TOKUTXN txn, uint32_t nodesize, LSN max_acceptable_lsn) { +static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create, int mode, BYTESTRING *bs_iname, FILENUM filenum, u_int32_t treeflags, + TOKUTXN txn, uint32_t nodesize, uint32_t basementnodesize, LSN max_acceptable_lsn) { int r; char *iname = fixup_fname(bs_iname); @@ -257,6 +257,11 @@ static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create assert(r == 0); } + if (basementnodesize != 0) { + r = toku_brt_set_basementnodesize(brt, basementnodesize); + assert(r == 0); + } + // set the key compare functions if (!(treeflags & TOKU_DB_KEYCMP_BUILTIN) && renv->bt_compare) { r = toku_brt_set_bt_compare(brt, renv->bt_compare); @@ -412,19 +417,19 @@ static int toku_recover_fassociate (struct logtype_fassociate *l, RECOVER_ENV re // Open it if it exists. // If rollback file, specify which checkpointed version of file we need (not just the latest) // because we cannot use a rollback log that is later than the last complete checkpoint. See #3113. - { - BOOL rollback_file = !strcmp(fname, ROLLBACK_CACHEFILE_NAME); - LSN max_acceptable_lsn = MAX_LSN; - if (rollback_file) - max_acceptable_lsn = renv->ss.checkpoint_begin_lsn; - r = internal_recover_fopen_or_fcreate(renv, FALSE, 0, &l->iname, l->filenum, l->treeflags, NULL, 0, max_acceptable_lsn); - if (r==0 && rollback_file) { - //Load rollback cachefile - r = file_map_find(&renv->fmap, l->filenum, &tuple); - assert(r==0); - renv->logger->rollback_cachefile = tuple->brt->cf; - } - } + { + BOOL rollback_file = !strcmp(fname, ROLLBACK_CACHEFILE_NAME); + LSN max_acceptable_lsn = MAX_LSN; + if (rollback_file) + max_acceptable_lsn = renv->ss.checkpoint_begin_lsn; + r = internal_recover_fopen_or_fcreate(renv, FALSE, 0, &l->iname, l->filenum, l->treeflags, NULL, 0, 0, max_acceptable_lsn); + if (r==0 && rollback_file) { + //Load rollback cachefile + r = file_map_find(&renv->fmap, l->filenum, &tuple); + assert(r==0); + renv->logger->rollback_cachefile = tuple->brt->cf; + } + } break; case FORWARD_NEWER_CHECKPOINT_END: if (r == 0) { //IF it is open @@ -646,7 +651,7 @@ static int toku_recover_fcreate (struct logtype_fcreate *l, RECOVER_ENV renv) { toku_free(iname); BOOL must_create = TRUE; - r = internal_recover_fopen_or_fcreate(renv, must_create, l->mode, &l->iname, l->filenum, l->treeflags, txn, l->nodesize, MAX_LSN); + r = internal_recover_fopen_or_fcreate(renv, must_create, l->mode, &l->iname, l->filenum, l->treeflags, txn, l->nodesize, l->basementnodesize, MAX_LSN); return r; } @@ -671,7 +676,7 @@ static int toku_recover_fopen (struct logtype_fopen *l, RECOVER_ENV renv) { if (strcmp(fname, ROLLBACK_CACHEFILE_NAME)) { //Rollback cachefile can only be opened via fassociate. 
- r = internal_recover_fopen_or_fcreate(renv, must_create, 0, &l->iname, l->filenum, l->treeflags, txn, 0, MAX_LSN); + r = internal_recover_fopen_or_fcreate(renv, must_create, 0, &l->iname, l->filenum, l->treeflags, txn, 0, 0, MAX_LSN); } toku_free(fname); return r; diff --git a/newbrt/tests/benchmark-test.c b/newbrt/tests/benchmark-test.c index d12db0f815b..6431cbc79c3 100644 --- a/newbrt/tests/benchmark-test.c +++ b/newbrt/tests/benchmark-test.c @@ -16,8 +16,10 @@ enum { ITEMS_TO_INSERT_PER_ITERATION = 1<<20 }; enum { BOUND_INCREASE_PER_ITERATION = SERIAL_SPACING*ITEMS_TO_INSERT_PER_ITERATION }; enum { NODE_SIZE = 1<<20 }; +enum { BASEMENT_NODE_SIZE = 128 * 1024 }; static int nodesize = NODE_SIZE; +static int basementnodesize = BASEMENT_NODE_SIZE; static int keysize = sizeof (long long); static int valsize = sizeof (long long); static int do_verify =0; /* Do a slow verify after every insert. */ @@ -32,7 +34,7 @@ static void setup (void) { int r; unlink(fname); r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &t, nodesize, ct, NULL_TXN, toku_builtin_compare_fun, (DB*)0); assert(r==0); + r = toku_open_brt(fname, 1, &t, nodesize, basementnodesize, ct, NULL_TXN, toku_builtin_compare_fun, (DB*)0); assert(r==0); } static void toku_shutdown (void) { diff --git a/newbrt/tests/brt-serialize-sub-block-test.c b/newbrt/tests/brt-serialize-sub-block-test.c index 08b35b53715..596c1691210 100644 --- a/newbrt/tests/brt-serialize-sub-block-test.c +++ b/newbrt/tests/brt-serialize-sub-block-test.c @@ -14,6 +14,7 @@ static void test_sub_block(int n) { const char fname[]= __FILE__ ".brt"; const int nodesize = 4*1024*1024; + const int basementnodesize = 128*1024; TOKUTXN const null_txn = 0; DB * const null_db = 0; @@ -28,7 +29,7 @@ static void test_sub_block(int n) { error = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(error == 0); - error = toku_open_brt(fname, TRUE, &brt, nodesize, ct, null_txn, toku_builtin_compare_fun, null_db); + error = toku_open_brt(fname, TRUE, &brt, nodesize, basementnodesize, ct, null_txn, toku_builtin_compare_fun, null_db); assert(error == 0); // insert keys 0, 1, 2, .. 
(n-1) @@ -47,7 +48,7 @@ static void test_sub_block(int n) { assert(error == 0); // verify the brt by walking a cursor through the rows - error = toku_open_brt(fname, FALSE, &brt, nodesize, ct, null_txn, toku_builtin_compare_fun, null_db); + error = toku_open_brt(fname, FALSE, &brt, nodesize, basementnodesize, ct, null_txn, toku_builtin_compare_fun, null_db); assert(error == 0); BRT_CURSOR cursor; diff --git a/newbrt/tests/brt-serialize-test.c b/newbrt/tests/brt-serialize-test.c index 5f5607a9eba..6b2d5ddf064 100644 --- a/newbrt/tests/brt-serialize-test.c +++ b/newbrt/tests/brt-serialize-test.c @@ -193,6 +193,7 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); @@ -307,6 +308,7 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); @@ -427,6 +429,7 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); @@ -551,6 +554,7 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); @@ -662,6 +666,7 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); @@ -780,6 +785,7 @@ test_serialize_leaf(enum brtnode_verify_type bft) { brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); @@ -917,6 +923,7 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { brt->h = brt_h; brt_h->type = BRTHEADER_CURRENT; brt_h->panic = 0; brt_h->panic_string = 0; + brt_h->basementnodesize = 128*1024; toku_blocktable_create_new(&brt_h->blocktable); //Want to use block #20 BLOCKNUM b = make_blocknum(0); diff --git a/newbrt/tests/brt-test-cursor-2.c b/newbrt/tests/brt-test-cursor-2.c index 5e3d8c546ea..d4af974a84b 100644 --- a/newbrt/tests/brt-test-cursor-2.c +++ b/newbrt/tests/brt-test-cursor-2.c @@ -35,7 +35,7 @@ static void test_multiple_brt_cursor_dbts(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, toku_builtin_compare_fun, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, toku_builtin_compare_fun, db); assert(r==0); int i; diff --git a/newbrt/tests/brt-test-cursor.c b/newbrt/tests/brt-test-cursor.c index 3bf5a368529..3c3c805dff7 100644 --- a/newbrt/tests/brt-test-cursor.c +++ 
b/newbrt/tests/brt-test-cursor.c @@ -90,7 +90,7 @@ static void test_brt_cursor_first(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert a bunch of kv pairs */ @@ -131,7 +131,7 @@ static void test_brt_cursor_last(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert keys 0, 1, .. (n-1) */ @@ -172,7 +172,7 @@ static void test_brt_cursor_first_last(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert a bunch of kv pairs */ @@ -217,7 +217,7 @@ static void test_brt_cursor_rfirst(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert keys n-1, n-2, ... , 0 */ @@ -285,7 +285,7 @@ static void test_brt_cursor_walk(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert a bunch of kv pairs */ @@ -351,7 +351,7 @@ static void test_brt_cursor_rwalk(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert a bunch of kv pairs */ @@ -435,7 +435,7 @@ static void test_brt_cursor_rand(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert a bunch of kv pairs */ @@ -489,7 +489,7 @@ static void test_brt_cursor_split(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); /* insert a bunch of kv pairs */ @@ -565,7 +565,7 @@ static void test_multiple_brt_cursors(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); int i; @@ -614,7 +614,7 @@ static void test_multiple_brt_cursor_walk(int n, DB *db) { r = 
toku_brt_create_cachetable(&ct, cachesize, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); int c; @@ -691,7 +691,7 @@ static void test_brt_cursor_set(int n, int cursor_op, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); int i; @@ -763,7 +763,7 @@ static void test_brt_cursor_set_range(int n, DB *db) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + r = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(r==0); int i; @@ -827,7 +827,7 @@ static void test_brt_cursor_delete(int n, DB *db) { error = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(error == 0); - error = toku_open_brt(fname, 1, &brt, 1<<12, ct, null_txn, test_brt_cursor_keycompare, db); + error = toku_open_brt(fname, 1, &brt, 1<<12, 1<<9, ct, null_txn, test_brt_cursor_keycompare, db); assert(error == 0); error = toku_brt_cursor(brt, &cursor, NULL, FALSE); diff --git a/newbrt/tests/brt-test.c b/newbrt/tests/brt-test.c index 50bce781d15..1a2df6bae50 100644 --- a/newbrt/tests/brt-test.c +++ b/newbrt/tests/brt-test.c @@ -18,7 +18,7 @@ static void test_dump_empty_db (void) { r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); unlink(fname); - r = toku_open_brt(fname, 1, &t, 1024, ct, null_txn, toku_builtin_compare_fun, null_db); + r = toku_open_brt(fname, 1, &t, 1024, 256, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); if (verbose) { r=toku_dump_brt(stdout, t); assert(r==0); } r = toku_close_brt(t, 0); assert(r==0); @@ -38,8 +38,8 @@ static void test_multiple_files_of_size (int size) { unlink(n1); r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(n0, 1, &t0, size, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); - r = toku_open_brt(n1, 1, &t1, size, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); + r = toku_open_brt(n0, 1, &t0, size, size / 4, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); + r = toku_open_brt(n1, 1, &t1, size, size / 4, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); for (i=0; i<10000; i++) { char key[100],val[100]; DBT k,v; @@ -62,10 +62,10 @@ static void test_multiple_files_of_size (int size) { /* Now see if the data is all there. 
*/ r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); - r = toku_open_brt(n0, 0, &t0, 1<<12, ct, null_txn, toku_builtin_compare_fun, null_db); + r = toku_open_brt(n0, 0, &t0, 1<<12, 1<<9, ct, null_txn, toku_builtin_compare_fun, null_db); if (verbose) printf("%s:%d r=%d\n", __FILE__, __LINE__,r); assert(r==0); - r = toku_open_brt(n1, 0, &t1, 1<<12, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); + r = toku_open_brt(n1, 0, &t1, 1<<12, 1<<9, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); for (i=0; i<10000; i++) { char key[100],val[100]; @@ -98,7 +98,7 @@ static void test_multiple_brts_one_db_one_file (void) { unlink(fname); r = toku_brt_create_cachetable(&ct, 32, ZERO_LSN, NULL_LOGGER); assert(r==0); for (i=0; i=0); if (verbose) traceit("write to file"); - r = toku_loader_write_brt_from_q_in_C(&bl, &desc, fd, 1000, q2, size_est, 0, 0); + r = toku_loader_write_brt_from_q_in_C(&bl, &desc, fd, 1000, q2, size_est, 0, 0, 0); assert(r==0); r = queue_destroy(q2); diff --git a/newbrt/tests/brtloader-test.c b/newbrt/tests/brtloader-test.c index ae3c8b41983..aa1b1c24e5f 100644 --- a/newbrt/tests/brtloader-test.c +++ b/newbrt/tests/brtloader-test.c @@ -323,7 +323,7 @@ static void verify_dbfile(int n, int sorted_keys[], const char *sorted_vals[], c int fd = open(output_name, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd>=0); - r = toku_loader_write_brt_from_q_in_C(&bl, &desc, fd, 1000, q, size_est, 0, 0); + r = toku_loader_write_brt_from_q_in_C(&bl, &desc, fd, 1000, q, size_est, 0, 0, 0); assert(r==0); destroy_merge_fileset(&fs); diff --git a/newbrt/tests/is_empty.c b/newbrt/tests/is_empty.c index ab5c8a3b695..1876c3554b7 100644 --- a/newbrt/tests/is_empty.c +++ b/newbrt/tests/is_empty.c @@ -39,7 +39,7 @@ static void test_it (int N) { TOKUTXN txn; r = toku_txn_begin_txn((DB_TXN*)NULL, (TOKUTXN)0, &txn, logger, TXN_SNAPSHOT_ROOT); CKERR(r); - r = toku_open_brt(FILENAME, 1, &brt, 1024, ct, txn, toku_builtin_compare_fun, NULL); CKERR(r); + r = toku_open_brt(FILENAME, 1, &brt, 1024, 256, ct, txn, toku_builtin_compare_fun, NULL); CKERR(r); r = toku_txn_commit_txn(txn, FALSE, do_yield, NULL, NULL, NULL); CKERR(r); toku_txn_close_txn(txn); @@ -50,7 +50,7 @@ static void test_it (int N) { unsigned int rands[N]; for (int i=0; i +#include "test.h" + + +static const int envflags = DB_INIT_MPOOL|DB_CREATE|DB_THREAD |DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN|DB_PRIVATE; + +static char *namea="a.db"; uint32_t nodesizea = 0; +static char *nameb="b.db"; uint32_t nodesizeb = 32*1024; + +static void do_remove(DB_ENV *env, const char *filename) { + int r; +#if TOKUDB + DBT dname; + DBT iname; + dbt_init(&dname, filename, strlen(filename)+1); + dbt_init(&iname, NULL, 0); + iname.flags |= DB_DBT_MALLOC; + r = env->get_iname(env, &dname, &iname); CKERR(r); + if (verbose) printf("%s -> %s\n", filename, (char *) iname.data); + char rmcmd[32 + strlen(ENVDIR) + strlen(iname.data)]; + sprintf(rmcmd, "rm %s/%s", ENVDIR, (char *) iname.data); + r = system(rmcmd); CKERR(r); + toku_free(iname.data); +#else + env = env; + char rmcmd[32 + strlen(ENVDIR) + strlen(filename)]; + sprintf(rmcmd, "rm %s/%s", ENVDIR, filename); + r = system(rmcmd); CKERR(r); +#endif +} + +static void run_test (void) { + int r; + + r = system("rm -rf " ENVDIR); + CKERR(r); + toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + r = env->txn_checkpoint(env, 0, 0, 
0); CKERR(r); + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + + // create a db with the default nodesize + DB *dba; + r = db_create(&dba, env, 0); CKERR(r); + r = dba->get_readpagesize(dba, &nodesizea); CKERR(r); + if (verbose) printf("nodesizea=%u", nodesizea); + r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + r = dba->close(dba, 0); CKERR(r); + + // create a db with a small nodesize + DB *dbb; + r = db_create(&dbb, env, 0); CKERR(r); + r = dbb->set_readpagesize(dbb, nodesizeb); CKERR(r); + r = dbb->open(dbb, NULL, nameb, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + r = dbb->close(dbb, 0); CKERR(r); + + r = txn->commit(txn, 0); CKERR(r); + + // remove the inames to force recovery to recreate them + do_remove(env, namea); + do_remove(env, nameb); + + toku_hard_crash_on_purpose(); +} + +static void run_recover (void) { + int r; + + // run recovery + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags + DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + // verify that the trees have the correct nodesizes + uint32_t pagesize; + DB *dba; + r = db_create(&dba, env, 0); CKERR(r); + r = dba->open(dba, NULL, namea, NULL, DB_UNKNOWN, DB_AUTO_COMMIT, 0666); CKERR(r); + r = dba->get_readpagesize(dba, &pagesize); CKERR(r); + if (verbose) printf("%u\n", pagesize); + // assert(pagesize == nodesizea); + r = dba->close(dba, 0); CKERR(r); + + DB *dbb; + r = db_create(&dbb, env, 0); CKERR(r); + r = dbb->open(dbb, NULL, nameb, NULL, DB_UNKNOWN, DB_AUTO_COMMIT, 0666); CKERR(r); + r = dbb->get_readpagesize(dbb, &pagesize); CKERR(r); + if (verbose) printf("%u\n", pagesize); + assert(pagesize == nodesizeb); + r = dbb->close(dbb, 0); CKERR(r); + + r = env->close(env, 0); CKERR(r); + exit(0); +} + +static void run_no_recover (void) { + int r; + + DB_ENV *env; + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags & ~DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = env->close(env, 0); CKERR(r); + exit(0); +} + +static const char *cmd; + +static BOOL do_test=FALSE, do_recover=FALSE, do_recover_only=FALSE, do_no_recover = FALSE; + +static void test_parse_args (int argc, char * const argv[]) { + int resultcode; + cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-v") == 0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "--test")==0) { + do_test=TRUE; + } else if (strcmp(argv[0], "--recover") == 0) { + do_recover=TRUE; + } else if (strcmp(argv[0], "--recover-only") == 0) { + do_recover_only=TRUE; + } else if (strcmp(argv[0], "--no-recover") == 0) { + do_no_recover=TRUE; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage:\n%s [-v|-q]* [-h] {--test | --recover } \n", cmd); + exit(resultcode); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} + +int test_main (int argc, char * const argv[]) { + test_parse_args(argc, argv); + if (do_test) { + run_test(); + } else if (do_recover) { + run_recover(); + } else if (do_recover_only) { + run_recover(); + } else if (do_no_recover) { + run_no_recover(); + } + return 0; +} diff --git a/src/ydb.c b/src/ydb.c index d6cc5cef566..d98ff7d48bf 100644 --- a/src/ydb.c +++ b/src/ydb.c @@ -5429,6 +5429,20 @@ toku_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) { return r; } +static int +toku_db_set_readpagesize(DB *db, 
u_int32_t readpagesize) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_set_basementnodesize(db->i->brt, readpagesize); + return r; +} + +static int +toku_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_get_basementnodesize(db->i->brt, readpagesize_ptr); + return r; +} + static int toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) { HANDLE_PANICKED_DB(db); @@ -5860,6 +5874,16 @@ locked_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) { toku_ydb_lock(); int r = toku_db_get_pagesize(db, pagesize_ptr); toku_ydb_unlock(); return r; } +static int +locked_db_set_readpagesize(DB *db, u_int32_t readpagesize) { + toku_ydb_lock(); int r = toku_db_set_readpagesize(db, readpagesize); toku_ydb_unlock(); return r; +} + +static int +locked_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) { + toku_ydb_lock(); int r = toku_db_get_readpagesize(db, readpagesize_ptr); toku_ydb_unlock(); return r; +} + // TODO 2216 delete this static int locked_db_fd(DB * UU(db), int * UU(fdp)) { @@ -6040,6 +6064,8 @@ toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) { SDB(set_errfile); SDB(set_pagesize); SDB(get_pagesize); + SDB(set_readpagesize); + SDB(get_readpagesize); SDB(set_flags); SDB(get_flags); SDB(stat64);
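

Note on the new handle methods: DB->set_readpagesize and DB->get_readpagesize (added in src/ydb.c above) are thin wrappers around toku_brt_set_basementnodesize/toku_brt_get_basementnodesize, and a freshly created handle reports BRT_DEFAULT_BASEMENT_NODE_SIZE (128*1024). A minimal usage sketch follows, modeled on the recovery test added in this patch; the file name, the function name, and the assumption that the size is set before DB->open (mirroring set_pagesize) are illustrative, and an already-opened transactional DB_ENV is assumed.

/* Hedged usage sketch -- not part of the patch.  Assumes an open DB_ENV
 * and picks a 32KB basement node size before DB->open, as in the test. */
#include <assert.h>
#include <db.h>

static void open_with_small_basements(DB_ENV *env) {
    DB *db;
    u_int32_t bnsize;
    int r = db_create(&db, env, 0);                        assert(r == 0);
    r = db->set_readpagesize(db, 32 * 1024);               assert(r == 0);  /* before open */
    r = db->open(db, NULL, "small.db", NULL, DB_BTREE,
                 DB_AUTO_COMMIT | DB_CREATE, 0666);        assert(r == 0);
    r = db->get_readpagesize(db, &bnsize);                 assert(r == 0);
    assert(bnsize == 32 * 1024);
    r = db->close(db, 0);                                  assert(r == 0);
}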
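

The leaf serializer change is the core of the patch: rebalance_brtnode_leaf() now takes basementnodesize and packs the node's leaf entries greedily, starting a new basement node whenever the next entry would push the running byte count over the limit, except that a basement node always keeps at least one entry (so a single oversized entry still lands somewhere). The toy program below illustrates just that pivot-choosing rule with plain integers; it is not the tree code and the names are made up.

/* Toy illustration of the greedy pivot rule in rebalance_brtnode_leaf(). */
#include <stdint.h>
#include <stdio.h>

static uint32_t choose_pivots(const uint32_t *le_sizes, uint32_t num_le,
                              uint32_t basementnodesize,
                              uint32_t *pivots /* index of last entry in each capped bn */) {
    uint32_t curr_pivot = 0, in_curr_bn = 0, bn_size_so_far = 0;
    for (uint32_t i = 0; i < num_le; i++) {
        if (bn_size_so_far + le_sizes[i] > basementnodesize && in_curr_bn != 0) {
            pivots[curr_pivot++] = i - 1;   /* cap the current basement node at entry i-1 */
            in_curr_bn = 0;
            bn_size_so_far = 0;
        }
        in_curr_bn++;
        bn_size_so_far += le_sizes[i];
    }
    return curr_pivot + 1;                  /* number of basement nodes (num_le > 0) */
}

int main(void) {
    uint32_t sizes[] = { 60, 60, 60, 200, 10 };
    uint32_t pivots[4];
    uint32_t n = choose_pivots(sizes, 5, 128, pivots);
    printf("%u basement nodes, first ends at index %u\n", n, pivots[0]);
    return 0;
}

With a 128-byte cap this prints 4 basement nodes. The same loop previously ran against the hard-coded BN_MAX_SIZE of 128*1024 that this patch removes.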
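

On-disk format and recovery: serialize_brt_header_min_size() grows by 4 bytes for BRT_LAYOUT_VERSION_15, the header writer emits basementnodesize right after root_xid_that_created, and the fcreate log entry now carries the value so recovery can recreate a file with the chosen setting (fopen/fassociate pass 0, which internal_recover_fopen_or_fcreate treats as "leave the header or default value alone"). Files written by layout 13/14 simply get the 128KB default filled in during the versioned-deserialize upgrade path. A stand-in snippet of that gate, using local names rather than the real brt_header/rbuf types:

/* Stand-in for the version gate added to the header deserializer. */
#include <stdint.h>

enum { LAYOUT_VERSION_15 = 15, DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };

static uint32_t effective_basementnodesize(int layout_version, uint32_t on_disk_value) {
    /* on_disk_value is only present (and meaningful) in v15+ headers */
    return layout_version >= LAYOUT_VERSION_15 ? on_disk_value
                                               : DEFAULT_BASEMENT_NODE_SIZE;
}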