[t:4875], [t:4887], merge from tokudb.4875 to main

git-svn-id: file:///svn/toku/tokudb@43896 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
Zardosht Kasheff 2013-04-17 00:00:37 -04:00 committed by Yoni Fogel
parent 939721e749
commit f2c4fe13e8
26 changed files with 932 additions and 676 deletions

View file

@ -84,15 +84,15 @@ static inline void unlock_for_blocktable (BLOCK_TABLE bt);
static void static void
ft_set_dirty(FT h, BOOL for_checkpoint){ ft_set_dirty(FT ft, BOOL for_checkpoint){
assert(toku_mutex_is_locked(&h->blocktable->mutex)); assert(toku_mutex_is_locked(&ft->blocktable->mutex));
assert(h->type == FT_CURRENT); assert(ft->h->type == FT_CURRENT);
if (for_checkpoint) { if (for_checkpoint) {
assert(h->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS); assert(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
h->checkpoint_header->dirty = 1; ft->checkpoint_header->dirty = 1;
} }
else { else {
h->dirty = 1; ft->h->dirty = 1;
} }
} }
@ -449,9 +449,9 @@ PRNTF("blokAllokator", 1L, size, offset, bt);
//Fills wbuf with bt //Fills wbuf with bt
//A clean shutdown runs checkpoint start so that current and inprogress are copies. //A clean shutdown runs checkpoint start so that current and inprogress are copies.
void void
toku_serialize_translation_to_wbuf_unlocked(BLOCK_TABLE bt, struct wbuf *w, toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, struct wbuf *w,
int64_t *address, int64_t *size) { int64_t *address, int64_t *size) {
assert(toku_mutex_is_locked(&bt->mutex)); lock_for_blocktable(bt);
struct translation *t = &bt->inprogress; struct translation *t = &bt->inprogress;
BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
@ -478,6 +478,7 @@ toku_serialize_translation_to_wbuf_unlocked(BLOCK_TABLE bt, struct wbuf *w,
wbuf_int(w, checksum); wbuf_int(w, checksum);
*address = t->block_translation[b.b].u.diskoff; *address = t->block_translation[b.b].u.diskoff;
*size = t->block_translation[b.b].size; *size = t->block_translation[b.b].size;
unlock_for_blocktable(bt);
} }

View file

@ -52,7 +52,7 @@ void toku_blocknum_realloc_on_disk(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF size, DIS
void toku_translate_blocknum_to_offset_size(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF *offset, DISKOFF *size); void toku_translate_blocknum_to_offset_size(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
//Serialization //Serialization
void toku_serialize_translation_to_wbuf_unlocked(BLOCK_TABLE bt, struct wbuf *w, int64_t *address, int64_t *size); void toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, struct wbuf *w, int64_t *address, int64_t *size);
void toku_block_table_swap_for_redirect(BLOCK_TABLE old_bt, BLOCK_TABLE new_bt); void toku_block_table_swap_for_redirect(BLOCK_TABLE old_bt, BLOCK_TABLE new_bt);

View file

@ -67,9 +67,9 @@
static CHECKPOINT_STATUS_S cp_status; static CHECKPOINT_STATUS_S cp_status;
#define STATUS_INIT(k,t,l) { \ #define STATUS_INIT(k,t,l) { \
cp_status.status[k].keyname = #k; \ cp_status.status[k].keyname = #k; \
cp_status.status[k].type = t; \ cp_status.status[k].type = t; \
cp_status.status[k].legend = "checkpoint: " l; \ cp_status.status[k].legend = "checkpoint: " l; \
} }
static void static void
@ -106,7 +106,7 @@ status_init(void) {
void void
toku_checkpoint_get_status(CACHETABLE ct, CHECKPOINT_STATUS statp) { toku_checkpoint_get_status(CACHETABLE ct, CHECKPOINT_STATUS statp) {
if (!cp_status.initialized) if (!cp_status.initialized)
status_init(); status_init();
STATUS_VALUE(CP_PERIOD) = toku_get_checkpoint_period_unlocked(ct); STATUS_VALUE(CP_PERIOD) = toku_get_checkpoint_period_unlocked(ct);
*statp = cp_status; *statp = cp_status;
} }
@ -193,7 +193,7 @@ checkpoint_safe_checkpoint_unlock(void) {
void void
toku_multi_operation_client_lock(void) { toku_multi_operation_client_lock(void) {
if (locked_mo) if (locked_mo)
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_MO), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_MO), 1);
toku_pthread_rwlock_rdlock(&multi_operation_lock); toku_pthread_rwlock_rdlock(&multi_operation_lock);
} }
@ -205,7 +205,7 @@ toku_multi_operation_client_unlock(void) {
void void
toku_checkpoint_safe_client_lock(void) { toku_checkpoint_safe_client_lock(void) {
if (locked_cs) if (locked_cs)
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_CS), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_CLIENT_WAIT_ON_CS), 1);
toku_pthread_rwlock_rdlock(&checkpoint_safe_lock); toku_pthread_rwlock_rdlock(&checkpoint_safe_lock);
toku_multi_operation_client_lock(); toku_multi_operation_client_lock();
} }
@ -241,23 +241,23 @@ toku_checkpoint_destroy(void) {
// Take a checkpoint of all currently open dictionaries // Take a checkpoint of all currently open dictionaries
int int
toku_checkpoint(CACHETABLE ct, TOKULOGGER logger, toku_checkpoint(CACHETABLE ct, TOKULOGGER logger,
void (*callback_f)(void*), void * extra, void (*callback_f)(void*), void * extra,
void (*callback2_f)(void*), void * extra2, void (*callback2_f)(void*), void * extra2,
checkpoint_caller_t caller_id) { checkpoint_caller_t caller_id) {
int r; int r;
int footprint_offset = (int) caller_id * 1000; int footprint_offset = (int) caller_id * 1000;
assert(initialized); assert(initialized);
if (locked_cs) { if (locked_cs) {
if (caller_id == SCHEDULED_CHECKPOINT) if (caller_id == SCHEDULED_CHECKPOINT)
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_SCHED_CS), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_SCHED_CS), 1);
else if (caller_id == CLIENT_CHECKPOINT) else if (caller_id == CLIENT_CHECKPOINT)
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_CLIENT_CS), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_CLIENT_CS), 1);
else if (caller_id == TXN_COMMIT_CHECKPOINT) else if (caller_id == TXN_COMMIT_CHECKPOINT)
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_TXN_CS), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_TXN_CS), 1);
else else
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_OTHER_CS), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAIT_OTHER_CS), 1);
} }
(void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAITERS_NOW), 1); (void) __sync_fetch_and_add(&STATUS_VALUE(CP_WAITERS_NOW), 1);
@ -265,27 +265,29 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger,
(void) __sync_fetch_and_sub(&STATUS_VALUE(CP_WAITERS_NOW), 1); (void) __sync_fetch_and_sub(&STATUS_VALUE(CP_WAITERS_NOW), 1);
if (STATUS_VALUE(CP_WAITERS_NOW) > STATUS_VALUE(CP_WAITERS_MAX)) if (STATUS_VALUE(CP_WAITERS_NOW) > STATUS_VALUE(CP_WAITERS_MAX))
STATUS_VALUE(CP_WAITERS_MAX) = STATUS_VALUE(CP_WAITERS_NOW); // threadsafe, within checkpoint_safe lock STATUS_VALUE(CP_WAITERS_MAX) = STATUS_VALUE(CP_WAITERS_NOW); // threadsafe, within checkpoint_safe lock
SET_CHECKPOINT_FOOTPRINT(10); SET_CHECKPOINT_FOOTPRINT(10);
if (locked_mo) { if (locked_mo) {
if (caller_id == SCHEDULED_CHECKPOINT) if (caller_id == SCHEDULED_CHECKPOINT)
STATUS_VALUE(CP_WAIT_SCHED_MO)++; // threadsafe, within checkpoint_safe lock STATUS_VALUE(CP_WAIT_SCHED_MO)++; // threadsafe, within checkpoint_safe lock
else if (caller_id == CLIENT_CHECKPOINT) else if (caller_id == CLIENT_CHECKPOINT)
STATUS_VALUE(CP_WAIT_CLIENT_MO)++; STATUS_VALUE(CP_WAIT_CLIENT_MO)++;
else if (caller_id == TXN_COMMIT_CHECKPOINT) else if (caller_id == TXN_COMMIT_CHECKPOINT)
STATUS_VALUE(CP_WAIT_TXN_MO)++; STATUS_VALUE(CP_WAIT_TXN_MO)++;
else else
STATUS_VALUE(CP_WAIT_OTHER_MO)++; STATUS_VALUE(CP_WAIT_OTHER_MO)++;
} }
multi_operation_checkpoint_lock(); multi_operation_checkpoint_lock();
SET_CHECKPOINT_FOOTPRINT(20); SET_CHECKPOINT_FOOTPRINT(20);
ydb_lock(); ydb_lock();
toku_ft_open_close_lock();
SET_CHECKPOINT_FOOTPRINT(30); SET_CHECKPOINT_FOOTPRINT(30);
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN) = time(NULL); STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN) = time(NULL);
r = toku_cachetable_begin_checkpoint(ct, logger); r = toku_cachetable_begin_checkpoint(ct, logger);
toku_ft_open_close_unlock();
multi_operation_checkpoint_unlock(); multi_operation_checkpoint_unlock();
ydb_unlock(); ydb_unlock();
@ -299,7 +301,7 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger,
if (r==0 && logger) { if (r==0 && logger) {
last_completed_checkpoint_lsn = logger->last_completed_checkpoint_lsn; last_completed_checkpoint_lsn = logger->last_completed_checkpoint_lsn;
r = toku_logger_maybe_trim_log(logger, last_completed_checkpoint_lsn); r = toku_logger_maybe_trim_log(logger, last_completed_checkpoint_lsn);
STATUS_VALUE(CP_LAST_LSN) = last_completed_checkpoint_lsn.lsn; STATUS_VALUE(CP_LAST_LSN) = last_completed_checkpoint_lsn.lsn;
} }
SET_CHECKPOINT_FOOTPRINT(60); SET_CHECKPOINT_FOOTPRINT(60);
@ -307,9 +309,9 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger,
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE) = STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN); STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE) = STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN);
if (r == 0) if (r == 0)
STATUS_VALUE(CP_CHECKPOINT_COUNT)++; STATUS_VALUE(CP_CHECKPOINT_COUNT)++;
else else
STATUS_VALUE(CP_CHECKPOINT_COUNT_FAIL)++; STATUS_VALUE(CP_CHECKPOINT_COUNT_FAIL)++;
STATUS_VALUE(CP_FOOTPRINT) = 0; STATUS_VALUE(CP_FOOTPRINT) = 0;
checkpoint_safe_checkpoint_unlock(); checkpoint_safe_checkpoint_unlock();

View file

@ -65,7 +65,7 @@ cachetable_put_empty_node_with_dep_nodes(
void void
create_new_ftnode_with_dep_nodes( create_new_ftnode_with_dep_nodes(
FT h, FT ft,
FTNODE *result, FTNODE *result,
int height, int height,
int n_children, int n_children,
@ -76,15 +76,15 @@ create_new_ftnode_with_dep_nodes(
BLOCKNUM name; BLOCKNUM name;
cachetable_put_empty_node_with_dep_nodes( cachetable_put_empty_node_with_dep_nodes(
h, ft,
num_dependent_nodes, num_dependent_nodes,
dependent_nodes, dependent_nodes,
&name, &name,
&fullhash, &fullhash,
result); result);
assert(h->nodesize > 0); assert(ft->h->nodesize > 0);
assert(h->basementnodesize > 0); assert(ft->h->basementnodesize > 0);
if (height == 0) { if (height == 0) {
assert(n_children > 0); assert(n_children > 0);
} }
@ -94,9 +94,9 @@ create_new_ftnode_with_dep_nodes(
name, name,
height, height,
n_children, n_children,
h->layout_version, ft->h->layout_version,
h->nodesize, ft->h->nodesize,
h->flags); ft->h->flags);
assert((*result)->nodesize > 0); assert((*result)->nodesize > 0);
(*result)->fullhash = fullhash; (*result)->fullhash = fullhash;
@ -208,10 +208,10 @@ toku_pin_ftnode_off_client_thread(
} }
void void
toku_unpin_ftnode_off_client_thread(FT h, FTNODE node) toku_unpin_ftnode_off_client_thread(FT ft, FTNODE node)
{ {
int r = toku_cachetable_unpin( int r = toku_cachetable_unpin(
h->cf, ft->cf,
node->thisnodename, node->thisnodename,
node->fullhash, node->fullhash,
(enum cachetable_dirty) node->dirty, (enum cachetable_dirty) node->dirty,
@ -221,11 +221,11 @@ toku_unpin_ftnode_off_client_thread(FT h, FTNODE node)
} }
void void
toku_unpin_ftnode(FT h, FTNODE node) toku_unpin_ftnode(FT ft, FTNODE node)
{ {
// printf("%*sUnpin %ld\n", 8-node->height, "", node->thisnodename.b); // printf("%*sUnpin %ld\n", 8-node->height, "", node->thisnodename.b);
//VERIFY_NODE(brt,node); //VERIFY_NODE(brt,node);
toku_unpin_ftnode_off_client_thread(h, node); toku_unpin_ftnode_off_client_thread(ft, node);
} }
void void

View file

@ -718,15 +718,15 @@ ftleaf_split(
invariant(node->height == 0); invariant(node->height == 0);
STATUS_VALUE(FT_FLUSHER_SPLIT_LEAF)++; STATUS_VALUE(FT_FLUSHER_SPLIT_LEAF)++;
if (node->n_children) { if (node->n_children) {
// First move all the accumulated stat64info deltas into the first basement. // First move all the accumulated stat64info deltas into the first basement.
// After the split, either both nodes or neither node will be included in the next checkpoint. // After the split, either both nodes or neither node will be included in the next checkpoint.
// The accumulated stats in the dictionary will be correct in either case. // The accumulated stats in the dictionary will be correct in either case.
// By moving all the deltas into one (arbitrary) basement, we avoid the need to maintain // By moving all the deltas into one (arbitrary) basement, we avoid the need to maintain
// correct information for a basement that is divided between two leafnodes (i.e. when split is // correct information for a basement that is divided between two leafnodes (i.e. when split is
// not on a basement boundary). // not on a basement boundary).
STAT64INFO_S delta_for_leafnode = toku_get_and_clear_basement_stats(node); STAT64INFO_S delta_for_leafnode = toku_get_and_clear_basement_stats(node);
BASEMENTNODE bn = BLB(node,0); BASEMENTNODE bn = BLB(node,0);
bn->stat64_delta = delta_for_leafnode; bn->stat64_delta = delta_for_leafnode;
} }
@ -807,9 +807,9 @@ ftleaf_split(
name, name,
0, 0,
num_children_in_b, num_children_in_b,
h->layout_version, h->h->layout_version,
h->nodesize, h->h->nodesize,
h->flags); h->h->flags);
assert(B->nodesize > 0); assert(B->nodesize > 0);
B->fullhash = fullhash; B->fullhash = fullhash;
} }
@ -1002,7 +1002,7 @@ ft_split_child(
FTNODE nodea, nodeb; FTNODE nodea, nodeb;
DBT splitk; DBT splitk;
// printf("%s:%d node %" PRIu64 "->u.n.n_children=%d height=%d\n", __FILE__, __LINE__, node->thisnodename.b, node->u.n.n_children, node->height); // printf("%s:%d node %" PRIu64 "->u.n.n_children=%d height=%d\n", __FILE__, __LINE__, node->thisnodename.b, node->u.n.n_children, node->height);
assert(h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */ assert(h->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
// for test // for test
call_flusher_thread_callback(flt_flush_before_split); call_flusher_thread_callback(flt_flush_before_split);

View file

@ -335,67 +335,129 @@ u_int32_t compute_child_fullhash (CACHEFILE cf, FTNODE node, int childnum);
enum ft_type {FT_CURRENT=1, FT_CHECKPOINT_INPROGRESS}; enum ft_type {FT_CURRENT=1, FT_CHECKPOINT_INPROGRESS};
// Header state for one dictionary (tree). struct ft refers to two of
// these through its FT_HEADER members: `h` and `checkpoint_header`.
// NOTE(review): FT_HEADER is presumably a pointer typedef for this struct
// (callers write ft->h->field) -- confirm against the typedef.
// Fields declared const cannot be assigned once the header is initialized.
struct ft_header {
// FT_CURRENT or FT_CHECKPOINT_INPROGRESS (see enum ft_type).
enum ft_type type;
// Set to 1 by ft_set_dirty() on either the current header or the
// checkpoint header, depending on its for_checkpoint argument.
int dirty;
// Free-running counter incremented once per checkpoint (toggling LSB).
// LSB indicates which header location is used on disk so this
// counter is effectively a boolean which alternates with each checkpoint.
uint64_t checkpoint_count;
// LSN of creation of "checkpoint-begin" record in log.
LSN checkpoint_lsn;
// see brt_layout_version.h. maybe don't need this if we assume
// it's always the current version after deserializing
const int layout_version;
// different (<) from layout_version if upgraded from a previous
// version (useful for debugging)
const int layout_version_original;
// build_id (svn rev number) of software that wrote this node to
// disk. (read from disk, overwritten when written to disk, I
// think).
const uint32_t build_id;
// build_id of software that created this tree
const uint32_t build_id_original;
// time this tree was created
const uint64_t time_of_creation;
// and the root transaction id that created it
TXNID root_xid_that_created;
// last time this header was serialized to disk (read from disk,
// overwritten when written to disk)
uint64_t time_of_last_modification;
// last time that this tree was verified
uint64_t time_of_last_verification;
// this field is protected by tree_lock, see comment for tree_lock
// (tree_lock lives in struct ft, not in this struct)
BLOCKNUM root_blocknum;
// Node flags; ft_verify_flags() asserts each node's flags equal this value.
const unsigned int flags;
// Passed to toku_initialize_empty_ftnode() for new nodes; split code
// asserts that it is >= the nodesize of the node being split.
const unsigned int nodesize;
// Passed to rebalance_ftnode_leaf() when a leaf node is rebalanced.
const unsigned int basementnodesize;
// Compression method passed to compress_internal_node_partition()
// during partial eviction.
const enum toku_compression_method compression_method;
// Current Minimum MSN to be used when upgrading pre-MSN BRT's.
// This is decremented from our current MIN_MSN so as not to clash
// with any existing 'normal' MSN's.
MSN highest_unused_msn_for_upgrade;
// last time that a hot optimize operation was begun
uint64_t time_of_last_optimize_begin;
// last time that a hot optimize operation was successfully completed
uint64_t time_of_last_optimize_end;
// the number of hot optimize operations currently in progress on this tree
uint32_t count_of_optimize_in_progress;
// the number of hot optimize operations in progress on this tree at the time of the last crash (this field is in-memory only)
uint32_t count_of_optimize_in_progress_read_from_disk;
// all messages before this msn have been applied to leaf nodes
MSN msn_at_start_of_last_completed_optimize;
// Accumulated stats; ftnode_update_disk_stats() folds basement-node
// deltas into this field (and into the checkpoint header's copy when
// called with for_checkpoint set).
STAT64INFO_S on_disk_stats;
};
// brt_header is always the current version. // brt_header is always the current version.
struct ft { struct ft {
enum ft_type type; FT_HEADER h;
FT checkpoint_header; FT_HEADER checkpoint_header;
// These are (mostly) read-only.
CACHEFILE cf; CACHEFILE cf;
// unique id for dictionary
DICTIONARY_ID dict_id;
ft_compare_func compare_fun;
ft_update_func update_fun;
// protected by locktree
DESCRIPTOR_S descriptor;
// protected by locktree and user. User
// makes sure this is only changed
// when no activity on tree
DESCRIPTOR_S cmp_descriptor;
// These are not read-only:
// lock used by a thread to pin the root node to start a descent into // lock used by a thread to pin the root node to start a descent into
// the tree. This lock protects the blocknum of the root node (root_blocknum). Any // the tree. This lock protects the blocknum of the root node (root_blocknum). Any
// thread that wants to descend down the tree starting at the root // thread that wants to descend down the tree starting at the root
// must grab this lock before pinning the root. // must grab this lock before pinning the root.
toku_mutex_t tree_lock; toku_mutex_t tree_lock;
u_int64_t checkpoint_count; // Free-running counter incremented once per checkpoint (toggling LSB).
// LSB indicates which header location is used on disk so this
// counter is effectively a boolean which alternates with each checkpoint.
LSN checkpoint_lsn; // LSN of creation of "checkpoint-begin" record in log.
int dirty;
DICTIONARY_ID dict_id; // unique id for dictionary
int panic; // If nonzero there was a write error. Don't write any more, because it probably only gets worse. This is the error code.
char *panic_string; // A malloced string that can indicate what went wrong.
int layout_version;
int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging)
int layout_version_read_from_disk; // transient, not serialized to disk
uint32_t build_id; // build_id (svn rev number) of software that wrote this node to disk
uint32_t build_id_original; // build_id of software that created this tree (read from disk, overwritten when written to disk)
uint64_t time_of_creation; // time this tree was created
uint64_t time_of_last_modification; // last time this header was serialized to disk (read from disk, overwritten when written to disk)
uint64_t time_of_last_verification; // last time that this tree was verified
unsigned int nodesize;
unsigned int basementnodesize;
// this field is protected by tree_lock, see comment for tree_lock
BLOCKNUM root_blocknum; // roots of the dictionary
unsigned int flags;
DESCRIPTOR_S descriptor;
DESCRIPTOR_S cmp_descriptor;
// protected by blocktable lock
BLOCK_TABLE blocktable; BLOCK_TABLE blocktable;
// protected by atomic builtins
STAT64INFO_S in_memory_stats;
// transient, not serialized to disk. updated when we do write to
// disk. tells us whether we can do partial eviction (we can't if
// the on-disk layout version is from before basement nodes)
int layout_version_read_from_disk;
// If a transaction created this BRT, which one? // If a transaction created this BRT, which one?
// If a transaction locked the BRT when it was empty, which transaction? (Only the latest one matters) // If a transaction locked the BRT when it was empty, which transaction? (Only the latest one matters)
// 0 if no such transaction // 0 if no such transaction
// only one thread can write to these at once, this is enforced by
// the lock tree
TXNID txnid_that_created_or_locked_when_empty; TXNID txnid_that_created_or_locked_when_empty;
TXNID root_that_created_or_locked_when_empty;
TXNID txnid_that_suppressed_recovery_logs; TXNID txnid_that_suppressed_recovery_logs;
TXNID root_xid_that_created;
struct toku_list live_ft_handles;
OMT txns; // transactions that are using this header
bool pinned_by_checkpoint; //Keep this header around for checkpoint, like a transaction
ft_compare_func compare_fun; // protects modifying live_ft_handles, txns, and pinned_by_checkpoint
ft_update_func update_fun; toku_mutex_t ft_ref_lock;
STAT64INFO_S in_memory_stats; struct toku_list live_ft_handles;
STAT64INFO_S on_disk_stats; // transactions that are using this header. you should only be able
STAT64INFO_S checkpoint_staging_stats; // to modify this if you have a valid handle in the list of live brts
uint64_t time_of_last_optimize_begin; // last time that a hot optimize operation was begun OMT txns;
uint64_t time_of_last_optimize_end; // last time that a hot optimize operation was successfully completed // Keep this header around for checkpoint, like a transaction
uint32_t count_of_optimize_in_progress; // the number of hot optimize operations currently in progress on this tree bool pinned_by_checkpoint;
uint32_t count_of_optimize_in_progress_read_from_disk; // the number of hot optimize operations in progress on this tree at the time of the last crash (this field is in-memory only)
MSN msn_at_start_of_last_completed_optimize; // all messages before this msn have been applied to leaf nodes // If nonzero there was a write error. Don't write any more, because it probably only gets worse. This is the error code.
enum toku_compression_method compression_method; int panic;
// Current Minimum MSN to be used when upgrading pre-MSN BRT's. // A malloced string that can indicate what went wrong.
// This is decremented from our currnt MIN_MSN so as not to clash char *panic_string;
// with any existing 'normal' MSN's.
MSN highest_unused_msn_for_upgrade;
}; };
// Copy the descriptor into a temporary variable, and tell DRD that subsequent code happens after reading that pointer. // Copy the descriptor into a temporary variable, and tell DRD that subsequent code happens after reading that pointer.
@ -464,9 +526,14 @@ int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2le
void toku_verify_or_set_counts(FTNODE); void toku_verify_or_set_counts(FTNODE);
int toku_serialize_ft_size (FT h); int toku_serialize_ft_size (FT_HEADER h);
int toku_serialize_ft_to (int fd, FT h); int toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFILE cf);
int toku_serialize_ft_to_wbuf (struct wbuf *, FT h, int64_t address_translation, int64_t size_translation); int toku_serialize_ft_to_wbuf (
struct wbuf *wbuf,
FT_HEADER h,
DISKOFF translation_location_on_disk,
DISKOFF translation_size_on_disk
);
enum deserialize_error_code toku_deserialize_ft_from (int fd, LSN max_acceptable_lsn, FT *ft); enum deserialize_error_code toku_deserialize_ft_from (int fd, LSN max_acceptable_lsn, FT *ft);
int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset); int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset);
void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc); void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc);
@ -579,7 +646,6 @@ struct ft_cursor {
// is required, such as for flushes. // is required, such as for flushes.
// //
static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) { static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h) {
invariant(h->type == FT_CURRENT);
bfe->type = ftnode_fetch_all; bfe->type = ftnode_fetch_all;
bfe->h = h; bfe->h = h;
bfe->search = NULL; bfe->search = NULL;
@ -608,7 +674,7 @@ static inline void fill_bfe_for_subset_read(
BOOL disable_prefetching BOOL disable_prefetching
) )
{ {
invariant(h->type == FT_CURRENT); invariant(h->h->type == FT_CURRENT);
bfe->type = ftnode_fetch_subset; bfe->type = ftnode_fetch_subset;
bfe->h = h; bfe->h = h;
bfe->search = search; bfe->search = search;
@ -627,7 +693,7 @@ static inline void fill_bfe_for_subset_read(
// Currently used for stat64. // Currently used for stat64.
// //
static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) { static inline void fill_bfe_for_min_read(struct ftnode_fetch_extra *bfe, FT h) {
invariant(h->type == FT_CURRENT); invariant(h->h->type == FT_CURRENT);
bfe->type = ftnode_fetch_none; bfe->type = ftnode_fetch_none;
bfe->h = h; bfe->h = h;
bfe->search = NULL; bfe->search = NULL;
@ -659,7 +725,7 @@ static inline void destroy_bfe_for_prefetch(struct ftnode_fetch_extra *bfe) {
static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe, static inline void fill_bfe_for_prefetch(struct ftnode_fetch_extra *bfe,
FT h, FT h,
FT_CURSOR c) { FT_CURSOR c) {
invariant(h->type == FT_CURRENT); invariant(h->h->type == FT_CURRENT);
bfe->type = ftnode_fetch_prefetch; bfe->type = ftnode_fetch_prefetch;
bfe->h = h; bfe->h = h;
bfe->search = NULL; bfe->search = NULL;

View file

@ -150,6 +150,8 @@ static volatile FT_STATUS_S ft_status;
ft_status.status[k].legend = "brt: " l; \ ft_status.status[k].legend = "brt: " l; \
} }
static toku_mutex_t ft_open_close_lock;
static void static void
status_init(void) status_init(void)
{ {
@ -307,8 +309,8 @@ toku_ft_nonleaf_is_gorged (FTNODE node) {
(!buffers_are_empty)); (!buffers_are_empty));
} }
static void ft_verify_flags(FT h, FTNODE node) { static void ft_verify_flags(FT ft, FTNODE node) {
assert(h->flags == node->flags); assert(ft->h->flags == node->flags);
} }
int toku_ft_debug_mode = 0; int toku_ft_debug_mode = 0;
@ -599,16 +601,25 @@ static void ft_status_update_flush_reason(FTNODE node, BOOL for_checkpoint) {
static void ftnode_update_disk_stats( static void ftnode_update_disk_stats(
FTNODE ftnode, FTNODE ftnode,
FT h, FT ft,
BOOL for_checkpoint BOOL for_checkpoint
) )
{ {
STAT64INFO_S deltas = ZEROSTATS; STAT64INFO_S deltas = ZEROSTATS;
// capture deltas before rebalancing basements for serialization // capture deltas before rebalancing basements for serialization
deltas = toku_get_and_clear_basement_stats(ftnode); deltas = toku_get_and_clear_basement_stats(ftnode);
toku_ft_update_stats(&h->on_disk_stats, deltas); // locking not necessary here with respect to checkpointing
// in Clayface (because of the pending lock and cachetable lock
// in toku_cachetable_begin_checkpoint)
// essentially, if we are dealing with a for_checkpoint
// parameter in a function that is called by the flush_callback,
// then the cachetable needs to ensure that this is called in a safe
// manner that does not interfere with the beginning
// of a checkpoint, which it does with the cachetable lock
// and pending lock
toku_ft_update_stats(&ft->h->on_disk_stats, deltas);
if (for_checkpoint) { if (for_checkpoint) {
toku_ft_update_stats(&h->checkpoint_staging_stats, deltas); toku_ft_update_stats(&ft->checkpoint_header->on_disk_stats, deltas);
} }
} }
@ -637,15 +648,15 @@ void toku_ftnode_clone_callback(
{ {
FTNODE node = value_data; FTNODE node = value_data;
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
FT h = write_extraargs; FT ft = write_extraargs;
FTNODE XMALLOC(cloned_node); FTNODE XMALLOC(cloned_node);
//FTNODE cloned_node = (FTNODE)toku_xmalloc(sizeof(*FTNODE)); //FTNODE cloned_node = (FTNODE)toku_xmalloc(sizeof(*FTNODE));
memset(cloned_node, 0, sizeof(*cloned_node)); memset(cloned_node, 0, sizeof(*cloned_node));
if (node->height == 0) { if (node->height == 0) {
// set header stats, must be done before rebalancing // set header stats, must be done before rebalancing
ftnode_update_disk_stats(node, h, for_checkpoint); ftnode_update_disk_stats(node, ft, for_checkpoint);
// rebalance the leaf node // rebalance the leaf node
rebalance_ftnode_leaf(node, h->basementnodesize); rebalance_ftnode_leaf(node, ft->h->basementnodesize);
} }
cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk; cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk;
@ -870,7 +881,7 @@ void toku_evict_bn_from_memory(FTNODE node, int childnum, FT h) {
// callback for partially evicting a node // callback for partially evicting a node
int toku_ftnode_pe_callback (void *ftnode_pv, PAIR_ATTR UU(old_attr), PAIR_ATTR* new_attr, void* extraargs) { int toku_ftnode_pe_callback (void *ftnode_pv, PAIR_ATTR UU(old_attr), PAIR_ATTR* new_attr, void* extraargs) {
FTNODE node = (FTNODE)ftnode_pv; FTNODE node = (FTNODE)ftnode_pv;
FT h = extraargs; FT ft = extraargs;
// Don't partially evict dirty nodes // Don't partially evict dirty nodes
if (node->dirty) { if (node->dirty) {
goto exit; goto exit;
@ -888,7 +899,7 @@ int toku_ftnode_pe_callback (void *ftnode_pv, PAIR_ATTR UU(old_attr), PAIR_ATTR*
if (BP_STATE(node,i) == PT_AVAIL) { if (BP_STATE(node,i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node,i)) { if (BP_SHOULD_EVICT(node,i)) {
STATUS_VALUE(FT_PARTIAL_EVICTIONS_NONLEAF)++; STATUS_VALUE(FT_PARTIAL_EVICTIONS_NONLEAF)++;
cilk_spawn compress_internal_node_partition(node, i, h->compression_method); cilk_spawn compress_internal_node_partition(node, i, ft->h->compression_method);
} }
else { else {
BP_SWEEP_CLOCK(node,i); BP_SWEEP_CLOCK(node,i);
@ -919,7 +930,7 @@ int toku_ftnode_pe_callback (void *ftnode_pv, PAIR_ATTR UU(old_attr), PAIR_ATTR*
else if (BP_STATE(node,i) == PT_AVAIL) { else if (BP_STATE(node,i) == PT_AVAIL) {
if (BP_SHOULD_EVICT(node,i)) { if (BP_SHOULD_EVICT(node,i)) {
STATUS_VALUE(FT_PARTIAL_EVICTIONS_LEAF)++; STATUS_VALUE(FT_PARTIAL_EVICTIONS_LEAF)++;
toku_evict_bn_from_memory(node, i, h); toku_evict_bn_from_memory(node, i, ft);
} }
else { else {
BP_SWEEP_CLOCK(node,i); BP_SWEEP_CLOCK(node,i);
@ -1272,7 +1283,7 @@ toku_initialize_empty_ftnode (FTNODE n, BLOCKNUM nodename, int height, int num_c
} }
static void static void
ft_init_new_root(FT h, FTNODE nodea, FTNODE nodeb, DBT splitk, CACHEKEY *rootp, FTNODE *newrootp) ft_init_new_root(FT ft, FTNODE nodea, FTNODE nodeb, DBT splitk, CACHEKEY *rootp, FTNODE *newrootp)
// Effect: Create a new root node whose two children are NODEA and NODEB, and the pivotkey is SPLITK. // Effect: Create a new root node whose two children are NODEA and NODEB, and the pivotkey is SPLITK.
// Store the new root's identity in *ROOTP, and the node in *NEWROOTP. // Store the new root's identity in *ROOTP, and the node in *NEWROOTP.
// Unpin nodea and nodeb. // Unpin nodea and nodeb.
@ -1281,11 +1292,11 @@ ft_init_new_root(FT h, FTNODE nodea, FTNODE nodeb, DBT splitk, CACHEKEY *rootp,
FTNODE XMALLOC(newroot); FTNODE XMALLOC(newroot);
int new_height = nodea->height+1; int new_height = nodea->height+1;
BLOCKNUM newroot_diskoff; BLOCKNUM newroot_diskoff;
toku_allocate_blocknum(h->blocktable, &newroot_diskoff, h); toku_allocate_blocknum(ft->blocktable, &newroot_diskoff, ft);
assert(newroot); assert(newroot);
*rootp=newroot_diskoff; *rootp=newroot_diskoff;
assert(new_height > 0); assert(new_height > 0);
toku_initialize_empty_ftnode (newroot, newroot_diskoff, new_height, 2, h->layout_version, h->nodesize, h->flags); toku_initialize_empty_ftnode (newroot, newroot_diskoff, new_height, 2, ft->h->layout_version, ft->h->nodesize, ft->h->flags);
//printf("new_root %lld %d %lld %lld\n", newroot_diskoff, newroot->height, nodea->thisnodename, nodeb->thisnodename); //printf("new_root %lld %d %lld %lld\n", newroot_diskoff, newroot->height, nodea->thisnodename, nodeb->thisnodename);
//printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey); //printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey);
toku_copyref_dbt(&newroot->childkeys[0], splitk); toku_copyref_dbt(&newroot->childkeys[0], splitk);
@ -1301,12 +1312,19 @@ ft_init_new_root(FT h, FTNODE nodea, FTNODE nodeb, DBT splitk, CACHEKEY *rootp,
BP_STATE(newroot,0) = PT_AVAIL; BP_STATE(newroot,0) = PT_AVAIL;
BP_STATE(newroot,1) = PT_AVAIL; BP_STATE(newroot,1) = PT_AVAIL;
newroot->dirty = 1; newroot->dirty = 1;
toku_unpin_ftnode(h, nodea);
toku_unpin_ftnode(h, nodeb);
//printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff); //printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff);
u_int32_t fullhash = toku_cachetable_hash(h->cf, newroot_diskoff); u_int32_t fullhash = toku_cachetable_hash(ft->cf, newroot_diskoff);
newroot->fullhash = fullhash; newroot->fullhash = fullhash;
toku_cachetable_put(h->cf, newroot_diskoff, fullhash, newroot, make_ftnode_pair_attr(newroot), get_write_callbacks_for_node(h)); toku_cachetable_put(ft->cf, newroot_diskoff, fullhash, newroot, make_ftnode_pair_attr(newroot), get_write_callbacks_for_node(ft));
//at this point, newroot is associated with newroot_diskoff, nodea is associated with root_blocknum
// make newroot_diskoff point to nodea
// make root_blocknum point to newroot
// also modify the blocknum and fullhash of nodea and newroot
// before doing this, assert(nodea->blocknum == ft->root_blocknum)
toku_unpin_ftnode(ft, nodea);
toku_unpin_ftnode(ft, nodeb);
*newrootp = newroot; *newrootp = newroot;
} }
@ -2048,7 +2066,7 @@ ft_nonleaf_put_cmd (ft_compare_func compare_fun, DESCRIPTOR desc, FTNODE node, F
// return TRUE if root changed, FALSE otherwise // return TRUE if root changed, FALSE otherwise
static BOOL static BOOL
ft_process_maybe_reactive_root (FT h, CACHEKEY *rootp, FTNODE *nodep) { ft_process_maybe_reactive_root (FT ft, CACHEKEY *rootp, FTNODE *nodep) {
FTNODE node = *nodep; FTNODE node = *nodep;
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
enum reactivity re = get_node_reactivity(node); enum reactivity re = get_node_reactivity(node);
@ -2060,18 +2078,18 @@ ft_process_maybe_reactive_root (FT h, CACHEKEY *rootp, FTNODE *nodep) {
// The root node should split, so make a new root. // The root node should split, so make a new root.
FTNODE nodea,nodeb; FTNODE nodea,nodeb;
DBT splitk; DBT splitk;
assert(h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */ assert(ft->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
// //
// This happens on the client thread with the ydb lock, so it is safe to // This happens on the client thread with the ydb lock, so it is safe to
// not pass in dependent nodes. Although if we wanted to, we could pass // not pass in dependent nodes. Although if we wanted to, we could pass
// in just node. That would be correct. // in just node. That would be correct.
// //
if (node->height==0) { if (node->height==0) {
ftleaf_split(h, node, &nodea, &nodeb, &splitk, TRUE, 0, NULL); ftleaf_split(ft, node, &nodea, &nodeb, &splitk, TRUE, 0, NULL);
} else { } else {
ft_nonleaf_split(h, node, &nodea, &nodeb, &splitk, 0, NULL); ft_nonleaf_split(ft, node, &nodea, &nodeb, &splitk, 0, NULL);
} }
ft_init_new_root(h, nodea, nodeb, splitk, rootp, nodep); ft_init_new_root(ft, nodea, nodeb, splitk, rootp, nodep);
return TRUE; return TRUE;
} }
case RE_FUSIBLE: case RE_FUSIBLE:
@ -2695,7 +2713,7 @@ toku_ft_maybe_insert (FT_HANDLE brt, DBT *key, DBT *val, TOKUTXN txn, BOOL oplsn
//We have transactions, and this is not 2440. We must send the full root-to-leaf-path //We have transactions, and this is not 2440. We must send the full root-to-leaf-path
message_xids = toku_txn_get_xids(txn); message_xids = toku_txn_get_xids(txn);
} }
else if (txn->ancestor_txnid64 != brt->ft->root_xid_that_created) { else if (txn->ancestor_txnid64 != brt->ft->h->root_xid_that_created) {
//We have transactions, and this is 2440, however the txn doing 2440 did not create the dictionary. We must send the full root-to-leaf-path //We have transactions, and this is 2440, however the txn doing 2440 did not create the dictionary. We must send the full root-to-leaf-path
message_xids = toku_txn_get_xids(txn); message_xids = toku_txn_get_xids(txn);
} }
@ -2800,6 +2818,7 @@ toku_ft_maybe_update_broadcast(FT_HANDLE brt, const DBT *update_function_extra,
if (r != 0) { goto cleanup; } if (r != 0) { goto cleanup; }
} }
//TODO(yoni): remove treelsn here and similar calls (no longer being used)
LSN treelsn; LSN treelsn;
if (oplsn_valid && if (oplsn_valid &&
oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(brt->ft)).lsn) { oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(brt->ft)).lsn) {
@ -2890,7 +2909,7 @@ toku_ft_maybe_delete(FT_HANDLE brt, DBT *key, TOKUTXN txn, BOOL oplsn_valid, LSN
//We have transactions, and this is not 2440. We must send the full root-to-leaf-path //We have transactions, and this is not 2440. We must send the full root-to-leaf-path
message_xids = toku_txn_get_xids(txn); message_xids = toku_txn_get_xids(txn);
} }
else if (txn->ancestor_txnid64 != brt->ft->root_xid_that_created) { else if (txn->ancestor_txnid64 != brt->ft->h->root_xid_that_created) {
//We have transactions, and this is 2440, however the txn doing 2440 did not create the dictionary. We must send the full root-to-leaf-path //We have transactions, and this is 2440, however the txn doing 2440 did not create the dictionary. We must send the full root-to-leaf-path
message_xids = toku_txn_get_xids(txn); message_xids = toku_txn_get_xids(txn);
} }
@ -3126,10 +3145,10 @@ cleanup:
static void static void
toku_ft_handle_inherit_options(FT_HANDLE t, FT ft) { toku_ft_handle_inherit_options(FT_HANDLE t, FT ft) {
struct ft_options options = { struct ft_options options = {
.nodesize = ft->nodesize, .nodesize = ft->h->nodesize,
.basementnodesize = ft->basementnodesize, .basementnodesize = ft->h->basementnodesize,
.compression_method = ft->compression_method, .compression_method = ft->h->compression_method,
.flags = ft->flags, .flags = ft->h->flags,
.compare_fun = ft->compare_fun, .compare_fun = ft->compare_fun,
.update_fun = ft->update_fun .update_fun = ft->update_fun
}; };
@ -3148,6 +3167,7 @@ ft_handle_open(FT_HANDLE t, const char *fname_in_env, int is_create, int only_cr
CACHEFILE cf = NULL; CACHEFILE cf = NULL;
FT ft = NULL; FT ft = NULL;
BOOL did_create = FALSE; BOOL did_create = FALSE;
toku_ft_open_close_lock();
if (t->did_set_flags) { if (t->did_set_flags) {
r = verify_builtin_comparisons_consistent(t, t->options.flags); r = verify_builtin_comparisons_consistent(t, t->options.flags);
@ -3211,7 +3231,7 @@ ft_handle_open(FT_HANDLE t, const char *fname_in_env, int is_create, int only_cr
if (!t->did_set_flags) { if (!t->did_set_flags) {
r = verify_builtin_comparisons_consistent(t, t->options.flags); r = verify_builtin_comparisons_consistent(t, t->options.flags);
if (r) { goto exit; } if (r) { goto exit; }
} else if (t->options.flags != ft->flags) { /* if flags have been set then flags must match */ } else if (t->options.flags != ft->h->flags) { /* if flags have been set then flags must match */
r = EINVAL; r = EINVAL;
goto exit; goto exit;
} }
@ -3277,7 +3297,10 @@ exit:
// but we have not linked it to this brt. So, // but we have not linked it to this brt. So,
// we can simply try to remove the header. // we can simply try to remove the header.
// We don't need to unlink this brt from the header // We don't need to unlink this brt from the header
if (!toku_ft_needed(ft)) { toku_ft_grab_reflock(ft);
BOOL needed = toku_ft_needed_unlocked(ft);
toku_ft_release_reflock(ft);
if (!needed) {
//Close immediately. //Close immediately.
char *error_string = NULL; char *error_string = NULL;
r = toku_remove_ft(ft, &error_string, false, ZERO_LSN); r = toku_remove_ft(ft, &error_string, false, ZERO_LSN);
@ -3288,6 +3311,7 @@ exit:
toku_cachefile_close(&cf, 0, FALSE, ZERO_LSN); toku_cachefile_close(&cf, 0, FALSE, ZERO_LSN);
} }
} }
toku_ft_open_close_unlock();
return r; return r;
} }
@ -3406,36 +3430,24 @@ ft_compare_func toku_ft_get_bt_compare (FT_HANDLE brt) {
return brt->options.compare_fun; return brt->options.compare_fun;
} }
// Callback that toku_ft_handle_close() passes to toku_ft_remove_reference().
// extra is the FT_HANDLE being closed; this removes it from the list it is
// linked on through live_ft_handle_link. The FT argument is not used here.
static void
ft_remove_handle_ref_callback(FT UU(ft), void *extra) {
FT_HANDLE handle = extra;
toku_list_remove(&handle->live_ft_handle_link);
}
int int
toku_ft_handle_close (FT_HANDLE brt, bool oplsn_valid, LSN oplsn) toku_ft_handle_close (FT_HANDLE brt, bool oplsn_valid, LSN oplsn)
// Effect: See ft-ops.h for the specification of this function. // Effect: See ft-ops.h for the specification of this function.
// NOTE(review): this span interleaves two bodies. The lines that use `h` and `r`
// appear to be the pre-change body: unlink the handle under toku_ft_lock, then
// toku_remove_ft() when the ft is no longer needed, returning r. The lines that
// use `ft` appear to be the post-change body: if brt->ft is set, hand
// ft_remove_handle_ref_callback and brt to toku_ft_remove_reference(), free the
// handle, and return 0. Confirm against the real file.
{ {
int r = 0; FT ft = brt->ft;
FT h = brt->ft; if (ft) {
toku_ft_remove_reference(brt->ft, oplsn_valid, oplsn, ft_remove_handle_ref_callback, brt);
// it is possible that a header was never opened
// for the brt
if (brt->ft) {
// TODO: figure out the proper locking here
// what really protects live_ft_handle_link?
toku_ft_lock(h);
toku_list_remove(&brt->live_ft_handle_link);
toku_ft_unlock(h);
if (!toku_ft_needed(brt->ft)) {
// close header
char *error_string = NULL;
r = toku_remove_ft(h, &error_string, oplsn_valid, oplsn);
assert_zero(r);
assert(error_string == NULL);
}
}
} }
toku_free(brt); toku_free(brt);
return 0;
return r;
} }
// test function
// Closes brt by calling toku_ft_handle_close() with oplsn_valid = FALSE and
// oplsn = ZERO_LSN, and returns its result. error_string is not used.
int toku_close_ft_handle_nolsn (FT_HANDLE brt, char** UU(error_string)) { int toku_close_ft_handle_nolsn (FT_HANDLE brt, char** UU(error_string)) {
return toku_ft_handle_close(brt, FALSE, ZERO_LSN); return toku_ft_handle_close(brt, FALSE, ZERO_LSN);
} }
@ -3530,7 +3542,7 @@ int toku_ft_cursor (
{ {
if (is_snapshot_read) { if (is_snapshot_read) {
invariant(ttxn != NULL); invariant(ttxn != NULL);
int accepted = does_txn_read_entry(brt->ft->root_xid_that_created, ttxn); int accepted = does_txn_read_entry(brt->ft->h->root_xid_that_created, ttxn);
if (accepted!=TOKUDB_ACCEPT) { if (accepted!=TOKUDB_ACCEPT) {
invariant(accepted==0); invariant(accepted==0);
return TOKUDB_MVCC_DICTIONARY_TOO_NEW; return TOKUDB_MVCC_DICTIONARY_TOO_NEW;
@ -5434,17 +5446,23 @@ int toku_ft_layer_init(void (*ydb_lock_callback)(void),
void (*ydb_unlock_callback)(void)) { void (*ydb_unlock_callback)(void)) {
int r = 0; int r = 0;
//Portability must be initialized first //Portability must be initialized first
if (r==0) r = toku_portability_init();
r = toku_portability_init(); if (r) { goto exit; }
if (r==0)
toku_checkpoint_init(ydb_lock_callback, ydb_unlock_callback); toku_checkpoint_init(ydb_lock_callback, ydb_unlock_callback);
if (r == 0)
r = toku_ft_serialize_layer_init(); r = toku_ft_serialize_layer_init();
if (r) { goto exit; }
toku_mutex_init(&ft_open_close_lock, NULL);
exit:
return r; return r;
} }
int toku_ft_layer_destroy(void) { int toku_ft_layer_destroy(void) {
int r = 0; int r = 0;
toku_mutex_destroy(&ft_open_close_lock);
if (r == 0) if (r == 0)
r = toku_ft_serialize_layer_destroy(); r = toku_ft_serialize_layer_destroy();
if (r==0) if (r==0)
@ -5455,6 +5473,14 @@ int toku_ft_layer_destroy(void) {
return r; return r;
} }
// Locks the ft_open_close_lock mutex (the one toku_ft_layer_init() initializes
// and toku_ft_layer_destroy() destroys). ft_handle_open() takes it on entry.
// Release with toku_ft_open_close_unlock().
void toku_ft_open_close_lock(void) {
toku_mutex_lock(&ft_open_close_lock);
}
// Unlocks the ft_open_close_lock mutex taken by toku_ft_open_close_lock().
// ft_handle_open() calls this just before returning.
void toku_ft_open_close_unlock(void) {
toku_mutex_unlock(&ft_open_close_lock);
}
//Suppress both rollback and recovery logs. //Suppress both rollback and recovery logs.
void void
toku_ft_suppress_recovery_logs (FT_HANDLE brt, TOKUTXN txn) { toku_ft_suppress_recovery_logs (FT_HANDLE brt, TOKUTXN txn) {

View file

@ -241,6 +241,8 @@ toku_ft_handle_stat64 (FT_HANDLE, TOKUTXN, struct ftstat64_s *stat) __attribute_
int toku_ft_layer_init(void (*ydb_lock_callback)(void), int toku_ft_layer_init(void (*ydb_lock_callback)(void),
void (*ydb_unlock_callback)(void)) void (*ydb_unlock_callback)(void))
__attribute__ ((warn_unused_result)); __attribute__ ((warn_unused_result));
void toku_ft_open_close_lock(void);
void toku_ft_open_close_unlock(void);
int toku_ft_layer_destroy(void) __attribute__ ((warn_unused_result)); int toku_ft_layer_destroy(void) __attribute__ ((warn_unused_result));
int toku_ft_serialize_layer_init(void) __attribute__ ((warn_unused_result)); int toku_ft_serialize_layer_init(void) __attribute__ ((warn_unused_result));
int toku_ft_serialize_layer_destroy(void) __attribute__ ((warn_unused_result)); int toku_ft_serialize_layer_destroy(void) __attribute__ ((warn_unused_result));
@ -259,10 +261,6 @@ void toku_ft_suppress_recovery_logs (FT_HANDLE brt, TOKUTXN txn);
int toku_ft_get_fragmentation(FT_HANDLE brt, TOKU_DB_FRAGMENTATION report) __attribute__ ((warn_unused_result)); int toku_ft_get_fragmentation(FT_HANDLE brt, TOKU_DB_FRAGMENTATION report) __attribute__ ((warn_unused_result));
BOOL toku_ft_is_empty_fast (FT_HANDLE brt);
// Effect: Return TRUE if there are no messages or leaf entries in the tree. If so, it's empty. If there are messages or leaf entries, we say it's not empty
// even though if we were to optimize the tree it might turn out that they are empty.
BOOL toku_ft_is_empty_fast (FT_HANDLE brt) __attribute__ ((warn_unused_result)); BOOL toku_ft_is_empty_fast (FT_HANDLE brt) __attribute__ ((warn_unused_result));
// Effect: Return TRUE if there are no messages or leaf entries in the tree. If so, it's empty. If there are messages or leaf entries, we say it's not empty // Effect: Return TRUE if there are no messages or leaf entries in the tree. If so, it's empty. If there are messages or leaf entries, we say it's not empty
// even though if we were to optimize the tree it might turn out that they are empty. // even though if we were to optimize the tree it might turn out that they are empty.

View file

@ -139,10 +139,10 @@ exit:
// We only deserialize brt header once and then share everything with all the brts. // We only deserialize brt header once and then share everything with all the brts.
static enum deserialize_error_code static enum deserialize_error_code
deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version) deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
{ {
enum deserialize_error_code e = DS_OK; enum deserialize_error_code e = DS_OK;
FT h = NULL; FT ft = NULL;
invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION); invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(version <= FT_LAYOUT_VERSION); invariant(version <= FT_LAYOUT_VERSION);
// We already know: // We already know:
@ -155,28 +155,25 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version)
rbuf_literal_bytes(rb, &magic, 8); rbuf_literal_bytes(rb, &magic, 8);
lazy_assert(memcmp(magic,"tokudata",8)==0); lazy_assert(memcmp(magic,"tokudata",8)==0);
CALLOC(h); XCALLOC(ft);
if (!h) { if (!ft) {
e = DS_ERRNO; e = DS_ERRNO;
goto exit; goto exit;
} }
h->type = FT_CURRENT; ft->checkpoint_header = NULL;
h->checkpoint_header = NULL; ft->panic = 0;
h->dirty = 0; ft->panic_string = 0;
h->panic = 0; toku_list_init(&ft->live_ft_handles);
h->panic_string = 0; int r = toku_omt_create(&ft->txns);
toku_list_init(&h->live_ft_handles);
int r = toku_omt_create(&h->txns);
assert_zero(r); assert_zero(r);
//version MUST be in network order on disk regardless of disk order //version MUST be in network order on disk regardless of disk order
h->layout_version_read_from_disk = rbuf_network_int(rb); ft->layout_version_read_from_disk = rbuf_network_int(rb);
invariant(h->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION); invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(h->layout_version_read_from_disk <= FT_LAYOUT_VERSION); invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION);
h->layout_version = FT_LAYOUT_VERSION;
//build_id MUST be in network order on disk regardless of disk order //build_id MUST be in network order on disk regardless of disk order
h->build_id = rbuf_network_int(rb); uint32_t build_id = rbuf_network_int(rb);
//Size MUST be in network order regardless of disk order. //Size MUST be in network order regardless of disk order.
u_int32_t size = rbuf_network_int(rb); u_int32_t size = rbuf_network_int(rb);
@ -188,16 +185,17 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version)
int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check; int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check;
lazy_assert(byte_order_stored == toku_byte_order_host); lazy_assert(byte_order_stored == toku_byte_order_host);
h->checkpoint_count = rbuf_ulonglong(rb); uint64_t checkpoint_count = rbuf_ulonglong(rb);
h->checkpoint_lsn = rbuf_lsn(rb); LSN checkpoint_lsn = rbuf_lsn(rb);
h->nodesize = rbuf_int(rb); unsigned nodesize = rbuf_int(rb);
DISKOFF translation_address_on_disk = rbuf_diskoff(rb); DISKOFF translation_address_on_disk = rbuf_diskoff(rb);
DISKOFF translation_size_on_disk = rbuf_diskoff(rb); DISKOFF translation_size_on_disk = rbuf_diskoff(rb);
lazy_assert(translation_address_on_disk > 0); lazy_assert(translation_address_on_disk > 0);
lazy_assert(translation_size_on_disk > 0); lazy_assert(translation_size_on_disk > 0);
// initialize the tree lock // initialize the tree lock
toku_ft_init_treelock(h); toku_ft_init_treelock(ft);
toku_ft_init_reflock(ft);
//Load translation table //Load translation table
{ {
@ -213,7 +211,7 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version)
} }
toku_unlock_for_pwrite(); toku_unlock_for_pwrite();
// Create table and read in data. // Create table and read in data.
e = toku_blocktable_create_from_buffer(&h->blocktable, e = toku_blocktable_create_from_buffer(&ft->blocktable,
translation_address_on_disk, translation_address_on_disk,
translation_size_on_disk, translation_size_on_disk,
tbuf); tbuf);
@ -223,73 +221,69 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version)
} }
} }
h->root_blocknum = rbuf_blocknum(rb); BLOCKNUM root_blocknum = rbuf_blocknum(rb);
h->flags = rbuf_int(rb); unsigned flags = rbuf_int(rb);
if (h->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) { if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) {
// deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag
h->flags &= ~TOKU_DB_VALCMP_BUILTIN_13; flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
} }
h->layout_version_original = rbuf_int(rb); int layout_version_original = rbuf_int(rb);
h->build_id_original = rbuf_int(rb); uint32_t build_id_original = rbuf_int(rb);
h->time_of_creation = rbuf_ulonglong(rb); uint64_t time_of_creation = rbuf_ulonglong(rb);
h->time_of_last_modification = rbuf_ulonglong(rb); uint64_t time_of_last_modification = rbuf_ulonglong(rb);
h->time_of_last_verification = 0;
if (h->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) { if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) {
// 17 was the last version with these fields, we no longer store // 17 was the last version with these fields, we no longer store
// them, so read and discard them // them, so read and discard them
(void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13 (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13
if (h->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
(void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14 (void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14
} }
} }
if (h->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) { // fake creation during the last checkpoint
rbuf_TXNID(rb, &h->root_xid_that_created); TXNID root_xid_that_created = checkpoint_lsn.lsn;
} else { if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) {
// fake creation during the last checkpoint rbuf_TXNID(rb, &root_xid_that_created);
h->root_xid_that_created = h->checkpoint_lsn.lsn;
} }
if (h->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) { // TODO(leif): get this to default to what's specified, not the
h->basementnodesize = rbuf_int(rb); // hard-coded default
h->time_of_last_verification = rbuf_ulonglong(rb); unsigned basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE;
} else { uint64_t time_of_last_verification = 0;
h->basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE; if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
h->time_of_last_verification = 0; basementnodesize = rbuf_int(rb);
time_of_last_verification = rbuf_ulonglong(rb);
} }
if (h->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) { STAT64INFO_S on_disk_stats = ZEROSTATS;
h->on_disk_stats.numrows = rbuf_ulonglong(rb); uint64_t time_of_last_optimize_begin = 0;
h->on_disk_stats.numbytes = rbuf_ulonglong(rb); uint64_t time_of_last_optimize_end = 0;
h->in_memory_stats = h->on_disk_stats; uint32_t count_of_optimize_in_progress = 0;
h->time_of_last_optimize_begin = rbuf_ulonglong(rb); MSN msn_at_start_of_last_completed_optimize = ZERO_MSN;
h->time_of_last_optimize_end = rbuf_ulonglong(rb); if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) {
h->count_of_optimize_in_progress = rbuf_int(rb); on_disk_stats.numrows = rbuf_ulonglong(rb);
h->count_of_optimize_in_progress_read_from_disk = h->count_of_optimize_in_progress; on_disk_stats.numbytes = rbuf_ulonglong(rb);
h->msn_at_start_of_last_completed_optimize = rbuf_msn(rb); ft->in_memory_stats = on_disk_stats;
} else { time_of_last_optimize_begin = rbuf_ulonglong(rb);
e = toku_upgrade_subtree_estimates_to_stat64info(fd, h); time_of_last_optimize_end = rbuf_ulonglong(rb);
if (e != DS_OK) { count_of_optimize_in_progress = rbuf_int(rb);
goto exit; msn_at_start_of_last_completed_optimize = rbuf_msn(rb);
}
h->time_of_last_optimize_begin = 0;
h->time_of_last_optimize_end = 0;
h->count_of_optimize_in_progress = 0;
h->count_of_optimize_in_progress_read_from_disk = 0;
h->msn_at_start_of_last_completed_optimize = ZERO_MSN;
} }
if (h->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) {
enum toku_compression_method compression_method;
MSN highest_unused_msn_for_upgrade = (MSN) { .msn = (MIN_MSN.msn - 1) };
if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) {
unsigned char method = rbuf_char(rb); unsigned char method = rbuf_char(rb);
h->compression_method = (enum toku_compression_method) method; compression_method = (enum toku_compression_method) method;
h->highest_unused_msn_for_upgrade = rbuf_msn(rb); highest_unused_msn_for_upgrade = rbuf_msn(rb);
} else { } else {
// we hard coded zlib until 5.2, then quicklz in 5.2 // we hard coded zlib until 5.2, then quicklz in 5.2
if (h->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) { if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
h->compression_method = TOKU_ZLIB_METHOD; compression_method = TOKU_ZLIB_METHOD;
} else { } else {
h->compression_method = TOKU_QUICKLZ_METHOD; compression_method = TOKU_QUICKLZ_METHOD;
} }
h->highest_unused_msn_for_upgrade.msn = MIN_MSN.msn - 1;
} }
(void) rbuf_int(rb); //Read in checksum and ignore (already verified). (void) rbuf_int(rb); //Read in checksum and ignore (already verified).
@ -300,21 +294,57 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version)
goto exit; goto exit;
} }
invariant(h); struct ft_header h = {
invariant((uint32_t) h->layout_version_read_from_disk == version); .type = FT_CURRENT,
e = deserialize_descriptor_from(fd, h->blocktable, &h->descriptor, version); .dirty = 0,
.checkpoint_count = checkpoint_count,
.checkpoint_lsn = checkpoint_lsn,
.layout_version = FT_LAYOUT_VERSION,
.layout_version_original = layout_version_original,
.build_id = build_id,
.build_id_original = build_id_original,
.time_of_creation = time_of_creation,
.root_xid_that_created = root_xid_that_created,
.time_of_last_modification = time_of_last_modification,
.time_of_last_verification = time_of_last_verification,
.root_blocknum = root_blocknum,
.flags = flags,
.nodesize = nodesize,
.basementnodesize = basementnodesize,
.compression_method = compression_method,
.highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade,
.time_of_last_optimize_begin = time_of_last_optimize_begin,
.time_of_last_optimize_end = time_of_last_optimize_end,
.count_of_optimize_in_progress = count_of_optimize_in_progress,
.count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress,
.msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize,
.on_disk_stats = on_disk_stats
};
ft->h = toku_xmemdup(&h, sizeof h);
if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
// This needs ft->h to be non-null, so we have to do it after we
// read everything else.
e = toku_upgrade_subtree_estimates_to_stat64info(fd, ft);
if (e != DS_OK) {
goto exit;
}
}
invariant((uint32_t) ft->layout_version_read_from_disk == version);
e = deserialize_descriptor_from(fd, ft->blocktable, &ft->descriptor, version);
if (e != DS_OK) { if (e != DS_OK) {
goto exit; goto exit;
} }
// copy descriptor to cmp_descriptor for #4541 // copy descriptor to cmp_descriptor for #4541
h->cmp_descriptor.dbt.size = h->descriptor.dbt.size; ft->cmp_descriptor.dbt.size = ft->descriptor.dbt.size;
h->cmp_descriptor.dbt.data = toku_xmemdup(h->descriptor.dbt.data, h->descriptor.dbt.size); ft->cmp_descriptor.dbt.data = toku_xmemdup(ft->descriptor.dbt.data, ft->descriptor.dbt.size);
// Version 13 descriptors had an extra 4 bytes that we don't read // Version 13 descriptors had an extra 4 bytes that we don't read
// anymore. Since the header is going to think it's the current // anymore. Since the header is going to think it's the current
// version if it gets written out, we need to write the descriptor in // version if it gets written out, we need to write the descriptor in
// the new format (without those bytes) before that happens. // the new format (without those bytes) before that happens.
if (version <= FT_LAYOUT_VERSION_13) { if (version <= FT_LAYOUT_VERSION_13) {
r = toku_update_descriptor(h, &h->cmp_descriptor, fd); r = toku_update_descriptor(ft, &ft->cmp_descriptor, fd);
if (r != 0) { if (r != 0) {
errno = r; errno = r;
e = DS_ERRNO; e = DS_ERRNO;
@ -322,11 +352,11 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ft, uint32_t version)
} }
} }
exit: exit:
if (e != DS_OK && h != NULL) { if (e != DS_OK && ft != NULL) {
toku_free(h); toku_free(ft);
h = NULL; ft = NULL;
} }
*ft = h; *ftp = ft;
return e; return e;
} }
@ -625,7 +655,7 @@ exit:
} }
int toku_serialize_ft_size (FT h) { int toku_serialize_ft_size (FT_HEADER h) {
u_int32_t size = serialize_ft_min_size(h->layout_version); u_int32_t size = serialize_ft_min_size(h->layout_version);
//There is no dynamic data. //There is no dynamic data.
lazy_assert(size <= BLOCK_ALLOCATOR_HEADER_RESERVE); lazy_assert(size <= BLOCK_ALLOCATOR_HEADER_RESERVE);
@ -633,7 +663,13 @@ int toku_serialize_ft_size (FT h) {
} }
int toku_serialize_ft_to_wbuf (struct wbuf *wbuf, FT h, DISKOFF translation_location_on_disk, DISKOFF translation_size_on_disk) { int toku_serialize_ft_to_wbuf (
struct wbuf *wbuf,
FT_HEADER h,
DISKOFF translation_location_on_disk,
DISKOFF translation_size_on_disk
)
{
wbuf_literal_bytes(wbuf, "tokudata", 8); wbuf_literal_bytes(wbuf, "tokudata", 8);
wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order
wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order
@ -643,7 +679,6 @@ int toku_serialize_ft_to_wbuf (struct wbuf *wbuf, FT h, DISKOFF translation_loca
wbuf_LSN (wbuf, h->checkpoint_lsn); wbuf_LSN (wbuf, h->checkpoint_lsn);
wbuf_int (wbuf, h->nodesize); wbuf_int (wbuf, h->nodesize);
//printf("%s:%d bta=%lu size=%lu\n", __FILE__, __LINE__, h->block_translation_address_on_disk, 4 + 16*h->translated_blocknum_limit);
wbuf_DISKOFF(wbuf, translation_location_on_disk); wbuf_DISKOFF(wbuf, translation_location_on_disk);
wbuf_DISKOFF(wbuf, translation_size_on_disk); wbuf_DISKOFF(wbuf, translation_size_on_disk);
wbuf_BLOCKNUM(wbuf, h->root_blocknum); wbuf_BLOCKNUM(wbuf, h->root_blocknum);
@ -655,8 +690,8 @@ int toku_serialize_ft_to_wbuf (struct wbuf *wbuf, FT h, DISKOFF translation_loca
wbuf_TXNID(wbuf, h->root_xid_that_created); wbuf_TXNID(wbuf, h->root_xid_that_created);
wbuf_int(wbuf, h->basementnodesize); wbuf_int(wbuf, h->basementnodesize);
wbuf_ulonglong(wbuf, h->time_of_last_verification); wbuf_ulonglong(wbuf, h->time_of_last_verification);
wbuf_ulonglong(wbuf, h->checkpoint_staging_stats.numrows); wbuf_ulonglong(wbuf, h->on_disk_stats.numrows);
wbuf_ulonglong(wbuf, h->checkpoint_staging_stats.numbytes); wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes);
wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin); wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin);
wbuf_ulonglong(wbuf, h->time_of_last_optimize_end); wbuf_ulonglong(wbuf, h->time_of_last_optimize_end);
wbuf_int(wbuf, h->count_of_optimize_in_progress); wbuf_int(wbuf, h->count_of_optimize_in_progress);
@ -669,23 +704,21 @@ int toku_serialize_ft_to_wbuf (struct wbuf *wbuf, FT h, DISKOFF translation_loca
return 0; return 0;
} }
int toku_serialize_ft_to (int fd, FT h) { int toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFILE cf) {
int rr = 0; int rr = 0;
if (h->panic) return h->panic;
lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS); lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS);
toku_ft_lock(h);
struct wbuf w_translation; struct wbuf w_translation;
int64_t size_translation; int64_t size_translation;
int64_t address_translation; int64_t address_translation;
{ {
//Must serialize translation first, to get address,size for header. //Must serialize translation first, to get address,size for header.
toku_serialize_translation_to_wbuf_unlocked(h->blocktable, &w_translation, toku_serialize_translation_to_wbuf(blocktable, &w_translation,
&address_translation, &address_translation,
&size_translation); &size_translation);
lazy_assert(size_translation==w_translation.size); lazy_assert(size_translation==w_translation.size);
} }
struct wbuf w_main; struct wbuf w_main;
unsigned int size_main = toku_serialize_ft_size (h); unsigned int size_main = toku_serialize_ft_size(h);
{ {
wbuf_init(&w_main, toku_xmalloc(size_main), size_main); wbuf_init(&w_main, toku_xmalloc(size_main), size_main);
{ {
@ -694,7 +727,6 @@ int toku_serialize_ft_to (int fd, FT h) {
} }
lazy_assert(w_main.ndone==size_main); lazy_assert(w_main.ndone==size_main);
} }
toku_ft_unlock(h);
toku_lock_for_pwrite(); toku_lock_for_pwrite();
{ {
//Actual Write translation table //Actual Write translation table
@ -708,8 +740,8 @@ int toku_serialize_ft_to (int fd, FT h) {
//If the header has a cachefile we need to do cachefile fsync (to //If the header has a cachefile we need to do cachefile fsync (to
//prevent crash if we redirected to dev null) //prevent crash if we redirected to dev null)
//If there is no cachefile we still need to do an fsync. //If there is no cachefile we still need to do an fsync.
if (h->cf) { if (cf) {
rr = toku_cachefile_fsync(h->cf); rr = toku_cachefile_fsync(cf);
} }
else { else {
rr = toku_file_fsync(fd); rr = toku_file_fsync(fd);

View file

@ -74,7 +74,7 @@ int toku_testsetup_nonleaf (FT_HANDLE brt, int height, BLOCKNUM *blocknum, int n
// Test-setup helper: makes blocknum the root block number of brt's tree.
// Asserts that testsetup_initialized is set; always returns 0.
// (Side-by-side diff: the left text writes brt->ft->root_blocknum, the right
// text writes the same field through the header, brt->ft->h->root_blocknum.)
int toku_testsetup_root(FT_HANDLE brt, BLOCKNUM blocknum) { int toku_testsetup_root(FT_HANDLE brt, BLOCKNUM blocknum) {
assert(testsetup_initialized); assert(testsetup_initialized);
brt->ft->root_blocknum = blocknum; brt->ft->h->root_blocknum = blocknum;
return 0; return 0;
} }

View file

@ -410,8 +410,8 @@ toku_verify_ft_with_progress (FT_HANDLE brt, int (*progress_callback)(void *extr
int r = toku_verify_ftnode(brt, ZERO_MSN, ZERO_MSN, root_node, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going); int r = toku_verify_ftnode(brt, ZERO_MSN, ZERO_MSN, root_node, -1, NULL, NULL, progress_callback, progress_extra, 1, verbose, keep_on_going);
if (r == 0) { if (r == 0) {
toku_ft_lock(brt->ft); toku_ft_lock(brt->ft);
brt->ft->time_of_last_verification = time(NULL); brt->ft->h->time_of_last_verification = time(NULL);
brt->ft->dirty = 1; brt->ft->h->dirty = 1;
toku_ft_unlock(brt->ft); toku_ft_unlock(brt->ft);
} }
return r; return r;

589
ft/ft.c
View file

@ -14,89 +14,100 @@ toku_ft_suppress_rollbacks(FT h, TOKUTXN txn) {
assert(h->txnid_that_created_or_locked_when_empty == TXNID_NONE || assert(h->txnid_that_created_or_locked_when_empty == TXNID_NONE ||
h->txnid_that_created_or_locked_when_empty == txnid); h->txnid_that_created_or_locked_when_empty == txnid);
h->txnid_that_created_or_locked_when_empty = txnid; h->txnid_that_created_or_locked_when_empty = txnid;
TXNID rootid = toku_txn_get_root_txnid(txn);
assert(h->root_that_created_or_locked_when_empty == TXNID_NONE ||
h->root_that_created_or_locked_when_empty == rootid);
h->root_that_created_or_locked_when_empty = rootid;
} }
void void
toku_reset_root_xid_that_created(FT h, TXNID new_root_xid_that_created) { toku_reset_root_xid_that_created(FT ft, TXNID new_root_xid_that_created) {
// Reset the root_xid_that_created field to the given value. // Reset the root_xid_that_created field to the given value.
// This redefines which xid created the dictionary. // This redefines which xid created the dictionary.
// hold lock around setting and clearing of dirty bit // hold lock around setting and clearing of dirty bit
// (see cooperative use of dirty bit in ft_begin_checkpoint()) // (see cooperative use of dirty bit in ft_begin_checkpoint())
toku_ft_lock (h); toku_ft_lock (ft);
h->root_xid_that_created = new_root_xid_that_created; ft->h->root_xid_that_created = new_root_xid_that_created;
h->dirty = 1; ft->h->dirty = 1;
toku_ft_unlock (h); toku_ft_unlock (ft);
} }
static void static void
ft_destroy(FT h) { ft_destroy(FT ft) {
if (!h->panic) assert(!h->checkpoint_header); if (!ft->panic) assert(!ft->checkpoint_header);
//header and checkpoint_header have same Blocktable pointer //header and checkpoint_header have same Blocktable pointer
//cannot destroy since it is still in use by CURRENT //cannot destroy since it is still in use by CURRENT
if (h->type == FT_CHECKPOINT_INPROGRESS) h->blocktable = NULL; assert(ft->h->type == FT_CURRENT);
else { toku_blocktable_destroy(&ft->blocktable);
assert(h->type == FT_CURRENT); if (ft->descriptor.dbt.data) toku_free(ft->descriptor.dbt.data);
toku_blocktable_destroy(&h->blocktable); if (ft->cmp_descriptor.dbt.data) toku_free(ft->cmp_descriptor.dbt.data);
if (h->descriptor.dbt.data) toku_free(h->descriptor.dbt.data); toku_ft_destroy_treelock(ft);
if (h->cmp_descriptor.dbt.data) toku_free(h->cmp_descriptor.dbt.data); toku_ft_destroy_reflock(ft);
toku_ft_destroy_treelock(h); toku_omt_destroy(&ft->txns);
toku_omt_destroy(&h->txns); toku_free(ft->h);
}
} }
// Make a copy of the header for the purpose of a checkpoint // Make a copy of the header for the purpose of a checkpoint
// Not reentrant for a single FT.
// See ft_checkpoint for explanation of why
// FT lock must be held.
static void static void
ft_copy_for_checkpoint(FT h, LSN checkpoint_lsn) { ft_copy_for_checkpoint_unlocked(FT ft, LSN checkpoint_lsn) {
assert(h->type == FT_CURRENT); assert(ft->h->type == FT_CURRENT);
assert(h->checkpoint_header == NULL); assert(ft->checkpoint_header == NULL);
assert(h->panic==0); assert(ft->panic==0);
FT XMALLOC(ch); FT_HEADER ch = toku_xmemdup(ft->h, sizeof *ft->h);
*ch = *h; //Do a shallow copy
ch->type = FT_CHECKPOINT_INPROGRESS; //Different type ch->type = FT_CHECKPOINT_INPROGRESS; //Different type
//printf("checkpoint_lsn=%" PRIu64 "\n", checkpoint_lsn.lsn); //printf("checkpoint_lsn=%" PRIu64 "\n", checkpoint_lsn.lsn);
ch->checkpoint_lsn = checkpoint_lsn; ch->checkpoint_lsn = checkpoint_lsn;
ch->panic_string = NULL;
//ch->blocktable is SHARED between the two headers //ch->blocktable is SHARED between the two headers
h->checkpoint_header = ch; ft->checkpoint_header = ch;
}
static void
ft_free(FT h) {
ft_destroy(h);
toku_free(h);
} }
void void
toku_ft_free (FT h) { toku_ft_free (FT ft) {
ft_free(h); ft_destroy(ft);
toku_free(ft);
} }
void void
toku_ft_init_treelock(FT h) { toku_ft_init_treelock(FT ft) {
toku_mutex_init(&h->tree_lock, NULL); toku_mutex_init(&ft->tree_lock, NULL);
} }
void void
toku_ft_destroy_treelock(FT h) { toku_ft_destroy_treelock(FT ft) {
toku_mutex_destroy(&h->tree_lock); toku_mutex_destroy(&ft->tree_lock);
} }
void void
toku_ft_grab_treelock(FT h) { toku_ft_grab_treelock(FT ft) {
toku_mutex_lock(&h->tree_lock); toku_mutex_lock(&ft->tree_lock);
} }
void void
toku_ft_release_treelock(FT h) { toku_ft_release_treelock(FT ft) {
toku_mutex_unlock(&h->tree_lock); toku_mutex_unlock(&ft->tree_lock);
}
void
toku_ft_init_reflock(FT ft) {
toku_mutex_init(&ft->ft_ref_lock, NULL);
}
void
toku_ft_destroy_reflock(FT ft) {
toku_mutex_destroy(&ft->ft_ref_lock);
}
void
toku_ft_grab_reflock(FT ft) {
toku_mutex_lock(&ft->ft_ref_lock);
}
void
toku_ft_release_reflock(FT ft) {
toku_mutex_unlock(&ft->ft_ref_lock);
} }
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
@ -106,13 +117,13 @@ toku_ft_release_treelock(FT h) {
// maps to cf->log_fassociate_during_checkpoint // maps to cf->log_fassociate_during_checkpoint
static int static int
ft_log_fassociate_during_checkpoint (CACHEFILE cf, void *header_v) { ft_log_fassociate_during_checkpoint (CACHEFILE cf, void *header_v) {
FT h = header_v; FT ft = header_v;
char* fname_in_env = toku_cachefile_fname_in_env(cf); char* fname_in_env = toku_cachefile_fname_in_env(cf);
BYTESTRING bs = { strlen(fname_in_env), // don't include the NUL BYTESTRING bs = { strlen(fname_in_env), // don't include the NUL
fname_in_env }; fname_in_env };
TOKULOGGER logger = toku_cachefile_logger(cf); TOKULOGGER logger = toku_cachefile_logger(cf);
FILENUM filenum = toku_cachefile_filenum (cf); FILENUM filenum = toku_cachefile_filenum (cf);
int r = toku_log_fassociate(logger, NULL, 0, filenum, h->flags, bs); int r = toku_log_fassociate(logger, NULL, 0, filenum, ft->h->flags, bs);
return r; return r;
} }
@ -133,43 +144,64 @@ ft_log_suppress_rollback_during_checkpoint (CACHEFILE cf, void *header_v) {
// Maps to cf->begin_checkpoint_userdata // Maps to cf->begin_checkpoint_userdata
// Create checkpoint-in-progress versions of header and translation (btt) (and fifo for now...). // Create checkpoint-in-progress versions of header and translation (btt) (and fifo for now...).
//Has access to fd (it is protected). // Has access to fd (it is protected).
//
// Not reentrant for a single FT (see ft_checkpoint)
static int static int
ft_begin_checkpoint (LSN checkpoint_lsn, void *header_v) { ft_begin_checkpoint (LSN checkpoint_lsn, void *header_v) {
FT h = header_v; FT ft = header_v;
int r = h->panic; int r = ft->panic;
if (r==0) { if (r==0) {
// hold lock around copying and clearing of dirty bit // hold lock around copying and clearing of dirty bit
toku_ft_lock (h); toku_ft_lock (ft);
assert(h->type == FT_CURRENT); assert(ft->h->type == FT_CURRENT);
assert(h->checkpoint_header == NULL); assert(ft->checkpoint_header == NULL);
ft_copy_for_checkpoint(h, checkpoint_lsn); ft_copy_for_checkpoint_unlocked(ft, checkpoint_lsn);
h->dirty = 0; // this is only place this bit is cleared (in currentheader) ft->h->dirty = 0; // this is only place this bit is cleared (in currentheader)
// on_disk_stats includes on disk changes since last checkpoint, toku_block_translation_note_start_checkpoint_unlocked(ft->blocktable);
// so checkpoint_staging_stats now includes changes for checkpoint in progress. toku_ft_unlock (ft);
h->checkpoint_staging_stats = h->on_disk_stats;
toku_block_translation_note_start_checkpoint_unlocked(h->blocktable);
toku_ft_unlock (h);
} }
return r; return r;
} }
// #4922: Hack to remove data corruption race condition.
// Reading (and upgrading) a node up to version 19 causes this.
// We COULD skip this if we know that no nodes remained (as of last checkpoint)
// that are below version 19.
// If there are no nodes < version 19 this is harmless (field is unused).
// If there are, this will make certain the value is at least as low as necessary,
// and not much lower. (Too low is good, too high can cause data corruption).
// TODO(yoni): If we ever stop supporting upgrades of nodes < version 19 we can delete this.
// TODO(yoni): If we know no nodes are left to upgrade, we can skip this. (Probably not worth doing).
static void
ft_hack_highest_unused_msn_for_upgrade_for_checkpoint(FT ft) {
if (ft->h->layout_version_original < FT_LAYOUT_VERSION_19) {
ft->checkpoint_header->highest_unused_msn_for_upgrade = ft->h->highest_unused_msn_for_upgrade;
}
}
// maps to cf->checkpoint_userdata // maps to cf->checkpoint_userdata
// Write checkpoint-in-progress versions of header and translation to disk (really to OS internal buffer). // Write checkpoint-in-progress versions of header and translation to disk (really to OS internal buffer).
// Copy current header's version of checkpoint_staging stat64info to checkpoint header. // Copy current header's version of checkpoint_staging stat64info to checkpoint header.
// Must have access to fd (protected). // Must have access to fd (protected).
// Requires: all pending bits are clear. This implies that no thread will modify the checkpoint_staging // Requires: all pending bits are clear. This implies that no thread will modify the checkpoint_staging
// version of the stat64info. // version of the stat64info.
//
// No locks are taken for checkpoint_count/lsn because this is single threaded. Can be called by:
// - ft_close
// - end_checkpoint
// checkpoints hold references to FTs and so they cannot be closed during a checkpoint.
// ft_close is not reentrant for a single FT
// end_checkpoint is not reentrant period
static int static int
ft_checkpoint (CACHEFILE cf, int fd, void *header_v) { ft_checkpoint (CACHEFILE cf, int fd, void *header_v) {
FT h = header_v; FT ft = header_v;
FT ch = h->checkpoint_header; FT_HEADER ch = ft->checkpoint_header;
int r = 0; int r = 0;
if (h->panic!=0) goto handle_error; if (ft->panic!=0) goto handle_error;
//printf("%s:%d allocated_limit=%lu writing queue to %lu\n", __FILE__, __LINE__, //printf("%s:%d allocated_limit=%lu writing queue to %lu\n", __FILE__, __LINE__,
// block_allocator_allocated_limit(h->block_allocator), h->unused_blocks.b*h->nodesize); // block_allocator_allocated_limit(h->block_allocator), h->unused_blocks.b*h->nodesize);
assert(ch); assert(ch);
if (ch->panic!=0) goto handle_error;
assert(ch->type == FT_CHECKPOINT_INPROGRESS); assert(ch->type == FT_CHECKPOINT_INPROGRESS);
if (ch->dirty) { // this is only place this bit is tested (in checkpoint_header) if (ch->dirty) { // this is only place this bit is tested (in checkpoint_header)
TOKULOGGER logger = toku_cachefile_logger(cf); TOKULOGGER logger = toku_cachefile_logger(cf);
@ -178,22 +210,13 @@ ft_checkpoint (CACHEFILE cf, int fd, void *header_v) {
if (r!=0) goto handle_error; if (r!=0) goto handle_error;
} }
uint64_t now = (uint64_t) time(NULL); // 4018; uint64_t now = (uint64_t) time(NULL); // 4018;
h->time_of_last_modification = now; ft->h->time_of_last_modification = now;
ch->time_of_last_modification = now; ch->time_of_last_modification = now;
ch->checkpoint_count++; ch->checkpoint_count++;
// Threadsafety of checkpoint_staging_stats here depends on there being no pending bits set, ft_hack_highest_unused_msn_for_upgrade_for_checkpoint(ft);
// so that all callers to flush callback should have the for_checkpoint argument false,
// and therefore will not modify the checkpoint_staging_stats.
// TODO 4184: If the flush callback is called with the for_checkpoint argument true even when all the pending bits
// are clear, then this is a problem.
ch->checkpoint_staging_stats = h->checkpoint_staging_stats;
// The in_memory_stats and on_disk_stats in the checkpoint header should be ignored, but we set them
// here just in case the serializer looks in the wrong place.
ch->in_memory_stats = ch->checkpoint_staging_stats;
ch->on_disk_stats = ch->checkpoint_staging_stats;
// write translation and header to disk (or at least to OS internal buffer) // write translation and header to disk (or at least to OS internal buffer)
r = toku_serialize_ft_to(fd, ch); r = toku_serialize_ft_to(fd, ch, ft->blocktable, ft->cf);
if (r!=0) goto handle_error; if (r!=0) goto handle_error;
ch->dirty = 0; // this is only place this bit is cleared (in checkpoint_header) ch->dirty = 0; // this is only place this bit is cleared (in checkpoint_header)
@ -202,22 +225,16 @@ ft_checkpoint (CACHEFILE cf, int fd, void *header_v) {
if (r!=0) { if (r!=0) {
goto handle_error; goto handle_error;
} }
h->checkpoint_count++; // checkpoint succeeded, next checkpoint will save to alternate header location ft->h->checkpoint_count++; // checkpoint succeeded, next checkpoint will save to alternate header location
h->checkpoint_lsn = ch->checkpoint_lsn; //Header updated. ft->h->checkpoint_lsn = ch->checkpoint_lsn; //Header updated.
} }
else { else {
toku_block_translation_note_skipped_checkpoint(ch->blocktable); toku_block_translation_note_skipped_checkpoint(ft->blocktable);
} }
if (0) { if (0) {
handle_error: handle_error:
if (h->panic) r = h->panic; if (ft->panic) r = ft->panic;
else if (ch->panic) { else toku_block_translation_note_failed_checkpoint(ft->blocktable);
r = ch->panic;
//Steal panic string. Cannot afford to malloc.
h->panic = ch->panic;
h->panic_string = ch->panic_string;
}
else toku_block_translation_note_failed_checkpoint(ch->blocktable);
} }
return r; return r;
@ -229,15 +246,15 @@ handle_error:
// Must have access to fd (protected) // Must have access to fd (protected)
static int static int
ft_end_checkpoint (CACHEFILE UU(cachefile), int fd, void *header_v) { ft_end_checkpoint (CACHEFILE UU(cachefile), int fd, void *header_v) {
FT h = header_v; FT ft = header_v;
int r = h->panic; int r = ft->panic;
if (r==0) { if (r==0) {
assert(h->type == FT_CURRENT); assert(ft->h->type == FT_CURRENT);
toku_block_translation_note_end_checkpoint(h->blocktable, fd, h); toku_block_translation_note_end_checkpoint(ft->blocktable, fd, ft);
} }
if (h->checkpoint_header) { // could be NULL only if panic was true at begin_checkpoint if (ft->checkpoint_header) { // could be NULL only if panic was true at begin_checkpoint
ft_free(h->checkpoint_header); toku_free(ft->checkpoint_header);
h->checkpoint_header = NULL; ft->checkpoint_header = NULL;
} }
return r; return r;
} }
@ -246,16 +263,16 @@ ft_end_checkpoint (CACHEFILE UU(cachefile), int fd, void *header_v) {
// Has access to fd (it is protected). // Has access to fd (it is protected).
static int static int
ft_close (CACHEFILE cachefile, int fd, void *header_v, char **malloced_error_string, BOOL oplsn_valid, LSN oplsn) { ft_close (CACHEFILE cachefile, int fd, void *header_v, char **malloced_error_string, BOOL oplsn_valid, LSN oplsn) {
FT h = header_v; FT ft = header_v;
assert(h->type == FT_CURRENT); assert(ft->h->type == FT_CURRENT);
toku_ft_lock(h); // We already have exclusive access to this field, so skip the locking.
assert(!toku_ft_needed(h)); // This should already never fail.
toku_ft_unlock(h); invariant(!toku_ft_needed_unlocked(ft));
int r = 0; int r = 0;
if (h->panic) { if (ft->panic) {
r = h->panic; r = ft->panic;
} else { } else {
assert(h->cf == cachefile); assert(ft->cf == cachefile);
TOKULOGGER logger = toku_cachefile_logger(cachefile); TOKULOGGER logger = toku_cachefile_logger(cachefile);
LSN lsn = ZERO_LSN; LSN lsn = ZERO_LSN;
//Get LSN //Get LSN
@ -263,8 +280,8 @@ ft_close (CACHEFILE cachefile, int fd, void *header_v, char **malloced_error_str
//Use recovery-specified lsn //Use recovery-specified lsn
lsn = oplsn; lsn = oplsn;
//Recovery cannot reduce lsn of a header. //Recovery cannot reduce lsn of a header.
if (lsn.lsn < h->checkpoint_lsn.lsn) if (lsn.lsn < ft->h->checkpoint_lsn.lsn)
lsn = h->checkpoint_lsn; lsn = ft->h->checkpoint_lsn;
} }
else { else {
//Get LSN from logger //Get LSN from logger
@ -273,11 +290,11 @@ ft_close (CACHEFILE cachefile, int fd, void *header_v, char **malloced_error_str
char* fname_in_env = toku_cachefile_fname_in_env(cachefile); char* fname_in_env = toku_cachefile_fname_in_env(cachefile);
assert(fname_in_env); assert(fname_in_env);
BYTESTRING bs = {.len=strlen(fname_in_env), .data=fname_in_env}; BYTESTRING bs = {.len=strlen(fname_in_env), .data=fname_in_env};
r = toku_log_fclose(logger, &lsn, h->dirty, bs, toku_cachefile_filenum(cachefile)); // flush the log on close (if new header is being written), otherwise it might not make it out. r = toku_log_fclose(logger, &lsn, ft->h->dirty, bs, toku_cachefile_filenum(cachefile)); // flush the log on close (if new header is being written), otherwise it might not make it out.
if (r!=0) return r; if (r!=0) return r;
} }
} }
if (h->dirty) { // this is the only place this bit is tested (in currentheader) if (ft->h->dirty) { // this is the only place this bit is tested (in currentheader)
if (logger) { //Rollback cachefile MUST NOT BE CLOSED DIRTY if (logger) { //Rollback cachefile MUST NOT BE CLOSED DIRTY
//It can be checkpointed only via 'checkpoint' //It can be checkpointed only via 'checkpoint'
assert(logger->rollback_cachefile != cachefile); assert(logger->rollback_cachefile != cachefile);
@ -286,18 +303,18 @@ ft_close (CACHEFILE cachefile, int fd, void *header_v, char **malloced_error_str
//assert(lsn.lsn!=0); //assert(lsn.lsn!=0);
r2 = ft_begin_checkpoint(lsn, header_v); r2 = ft_begin_checkpoint(lsn, header_v);
if (r==0) r = r2; if (r==0) r = r2;
r2 = ft_checkpoint(cachefile, fd, h); r2 = ft_checkpoint(cachefile, fd, ft);
if (r==0) r = r2; if (r==0) r = r2;
r2 = ft_end_checkpoint(cachefile, fd, header_v); r2 = ft_end_checkpoint(cachefile, fd, header_v);
if (r==0) r = r2; if (r==0) r = r2;
if (!h->panic) assert(!h->dirty); // dirty bit should be cleared by begin_checkpoint and never set again (because we're closing the dictionary) if (!ft->panic) assert(!ft->h->dirty); // dirty bit should be cleared by begin_checkpoint and never set again (because we're closing the dictionary)
} }
} }
if (malloced_error_string) *malloced_error_string = h->panic_string; if (malloced_error_string) *malloced_error_string = ft->panic_string;
if (r == 0) { if (r == 0) {
r = h->panic; r = ft->panic;
} }
toku_ft_free(h); toku_ft_free(ft);
return r; return r;
} }
@ -309,82 +326,75 @@ ft_note_pin_by_checkpoint (CACHEFILE UU(cachefile), void *header_v)
{ {
//Set arbitrary brt (for given header) as pinned by checkpoint. //Set arbitrary brt (for given header) as pinned by checkpoint.
//Only one can be pinned (only one checkpoint at a time), but not worth verifying. //Only one can be pinned (only one checkpoint at a time), but not worth verifying.
FT h = header_v; FT ft = header_v;
assert(!h->pinned_by_checkpoint);
h->pinned_by_checkpoint = true; // Note: open_close lock is held by checkpoint begin
toku_ft_grab_reflock(ft);
assert(!ft->pinned_by_checkpoint);
assert(toku_ft_needed_unlocked(ft));
ft->pinned_by_checkpoint = true;
toku_ft_release_reflock(ft);
return 0; return 0;
} }
static void
unpin_by_checkpoint_callback(FT ft, void *extra) {
invariant(extra == NULL);
invariant(ft->pinned_by_checkpoint);
ft->pinned_by_checkpoint = false; //Unpin
}
// maps to cf->note_unpin_by_checkpoint // maps to cf->note_unpin_by_checkpoint
//Must be protected by ydb lock. //Must be protected by ydb lock.
//Called by end_checkpoint, which grabs ydb lock around note_unpin //Called by end_checkpoint, which grabs ydb lock around note_unpin
static int static int
ft_note_unpin_by_checkpoint (CACHEFILE UU(cachefile), void *header_v) ft_note_unpin_by_checkpoint (CACHEFILE UU(cachefile), void *header_v)
{ {
FT h = header_v; FT ft = header_v;
assert(h->pinned_by_checkpoint); toku_ft_remove_reference(ft, false, ZERO_LSN, unpin_by_checkpoint_callback, NULL);
h->pinned_by_checkpoint = false; //Unpin return 0;
int r = 0;
//Close if necessary
if (!toku_ft_needed(h)) {
//Close immediately.
char *error_string = NULL;
r = toku_remove_ft(h, &error_string, false, ZERO_LSN);
lazy_assert_zero(r);
}
return r;
} }
// //
// End of Functions that are callbacks to the cachefile // End of Functions that are callbacks to the cachefile
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
static int setup_initial_ft_root_node (FT h, BLOCKNUM blocknum) { static int setup_initial_ft_root_node (FT ft, BLOCKNUM blocknum) {
FTNODE XMALLOC(node); FTNODE XMALLOC(node);
toku_initialize_empty_ftnode(node, blocknum, 0, 1, h->layout_version, h->nodesize, h->flags); toku_initialize_empty_ftnode(node, blocknum, 0, 1, ft->h->layout_version, ft->h->nodesize, ft->h->flags);
BP_STATE(node,0) = PT_AVAIL; BP_STATE(node,0) = PT_AVAIL;
u_int32_t fullhash = toku_cachetable_hash(h->cf, blocknum); u_int32_t fullhash = toku_cachetable_hash(ft->cf, blocknum);
node->fullhash = fullhash; node->fullhash = fullhash;
int r = toku_cachetable_put(h->cf, blocknum, fullhash, int r = toku_cachetable_put(ft->cf, blocknum, fullhash,
node, make_ftnode_pair_attr(node), node, make_ftnode_pair_attr(node),
get_write_callbacks_for_node(h)); get_write_callbacks_for_node(ft));
if (r != 0) if (r != 0)
toku_free(node); toku_free(node);
else else
toku_unpin_ftnode(h, node); toku_unpin_ftnode(ft, node);
return r; return r;
} }
static int static int
ft_init (FT ft, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn) { ft_init(FT ft, FT_OPTIONS options, CACHEFILE cf) {
ft->type = FT_CURRENT;
ft->checkpoint_header = NULL; ft->checkpoint_header = NULL;
toku_ft_init_treelock(ft); ft->layout_version_read_from_disk = FT_LAYOUT_VERSION; // fake, prevent unnecessary upgrade logic
toku_blocktable_create_new(&ft->blocktable);
//Assign blocknum for root block, also dirty the header
toku_allocate_blocknum(ft->blocktable, &ft->root_blocknum, ft);
toku_list_init(&ft->live_ft_handles); toku_list_init(&ft->live_ft_handles);
int r = toku_omt_create(&ft->txns); int r = toku_omt_create(&ft->txns);
assert_zero(r); assert_zero(r);
ft->flags = options->flags;
ft->nodesize = options->nodesize;
ft->basementnodesize = options->basementnodesize;
ft->compression_method = options->compression_method;
ft->compare_fun = options->compare_fun; ft->compare_fun = options->compare_fun;
ft->update_fun = options->update_fun; ft->update_fun = options->update_fun;
if (ft->cf!=NULL) assert(ft->cf == cf); if (ft->cf != NULL) {
assert(ft->cf == cf);
}
ft->cf = cf; ft->cf = cf;
ft->root_xid_that_created = txn ? txn->ancestor_txnid64 : TXNID_NONE; ft->in_memory_stats = ZEROSTATS;
ft->in_memory_stats = ZEROSTATS;
ft->on_disk_stats = ZEROSTATS;
ft->checkpoint_staging_stats = ZEROSTATS;
ft->highest_unused_msn_for_upgrade.msn = MIN_MSN.msn - 1;
r = setup_initial_ft_root_node(ft, ft->root_blocknum); r = setup_initial_ft_root_node(ft, ft->h->root_blocknum);
if (r != 0) { if (r != 0) {
goto exit; goto exit;
} }
@ -407,8 +417,41 @@ exit:
} }
// allocate and initialize a brt header. static FT_HEADER
// t->ft->cf is not set to anything. ft_header_new(FT_OPTIONS options, BLOCKNUM root_blocknum, TXNID root_xid_that_created)
{
uint64_t now = (uint64_t) time(NULL);
struct ft_header h = {
.type = FT_CURRENT,
.dirty = 0,
.checkpoint_count = 0,
.checkpoint_lsn = ZERO_LSN,
.layout_version = FT_LAYOUT_VERSION,
.layout_version_original = FT_LAYOUT_VERSION,
.build_id = BUILD_ID,
.build_id_original = BUILD_ID,
.time_of_creation = now,
.root_xid_that_created = root_xid_that_created,
.time_of_last_modification = now,
.time_of_last_verification = 0,
.root_blocknum = root_blocknum,
.flags = options->flags,
.nodesize = options->nodesize,
.basementnodesize = options->basementnodesize,
.compression_method = options->compression_method,
.highest_unused_msn_for_upgrade = { .msn = (MIN_MSN.msn - 1) },
.time_of_last_optimize_begin = 0,
.time_of_last_optimize_end = 0,
.count_of_optimize_in_progress = 0,
.count_of_optimize_in_progress_read_from_disk = 0,
.msn_at_start_of_last_completed_optimize = ZERO_MSN,
.on_disk_stats = ZEROSTATS
};
return toku_xmemdup(&h, sizeof h);
}
// allocate and initialize a fractal tree.
// t->ft->cf is not set to anything. TODO(leif): I don't think that's true
int int
toku_create_new_ft(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn) { toku_create_new_ft(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn) {
int r; int r;
@ -416,23 +459,18 @@ toku_create_new_ft(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn) {
FT XCALLOC(ft); FT XCALLOC(ft);
ft->layout_version = FT_LAYOUT_VERSION;
ft->layout_version_original = FT_LAYOUT_VERSION;
ft->layout_version_read_from_disk = FT_LAYOUT_VERSION; // fake, prevent unnecessary upgrade logic
ft->build_id = BUILD_ID;
ft->build_id_original = BUILD_ID;
uint64_t now = (uint64_t) time(NULL);
ft->time_of_creation = now;
ft->time_of_last_modification = now;
ft->time_of_last_verification = 0;
memset(&ft->descriptor, 0, sizeof(ft->descriptor)); memset(&ft->descriptor, 0, sizeof(ft->descriptor));
memset(&ft->cmp_descriptor, 0, sizeof(ft->cmp_descriptor)); memset(&ft->cmp_descriptor, 0, sizeof(ft->cmp_descriptor));
r = ft_init(ft, options, cf, txn); ft->h = ft_header_new(options, make_blocknum(0), (txn ? txn->ancestor_txnid64 : TXNID_NONE));
toku_ft_init_treelock(ft);
toku_ft_init_reflock(ft);
toku_blocktable_create_new(&ft->blocktable);
//Assign blocknum for root block, also dirty the header
toku_allocate_blocknum(ft->blocktable, &ft->h->root_blocknum, ft);
r = ft_init(ft, options, cf);
if (r != 0) { if (r != 0) {
goto exit; goto exit;
} }
@ -504,22 +542,30 @@ int toku_read_ft_and_store_in_cachefile (FT_HANDLE brt, CACHEFILE cf, LSN max_ac
void void
toku_ft_note_ft_handle_open(FT ft, FT_HANDLE live) { toku_ft_note_ft_handle_open(FT ft, FT_HANDLE live) {
toku_ft_lock(ft); toku_ft_grab_reflock(ft);
live->ft = ft; live->ft = ft;
toku_list_push(&ft->live_ft_handles, &live->live_ft_handle_link); toku_list_push(&ft->live_ft_handles, &live->live_ft_handle_link);
toku_ft_unlock(ft); toku_ft_release_reflock(ft);
} }
int int
toku_ft_needed(FT h) { toku_ft_needed_unlocked(FT h) {
return !toku_list_empty(&h->live_ft_handles) || toku_omt_size(h->txns) != 0 || h->pinned_by_checkpoint; return !toku_list_empty(&h->live_ft_handles) || toku_omt_size(h->txns) != 0 || h->pinned_by_checkpoint;
} }
BOOL
toku_ft_has_one_reference_unlocked(FT ft) {
u_int32_t pinned_by_checkpoint = ft->pinned_by_checkpoint ? 1 : 0;
u_int32_t num_txns = toku_omt_size(ft->txns);
int num_handles = toku_list_num_elements_est(&ft->live_ft_handles);
return ((pinned_by_checkpoint + num_txns + num_handles) == 1);
}
// Close brt. If oplsn_valid, use given oplsn as lsn in brt header instead of logging // Close brt. If oplsn_valid, use given oplsn as lsn in brt header instead of logging
// the close and using the lsn provided by logging the close. (Subject to constraint // the close and using the lsn provided by logging the close. (Subject to constraint
// that if a newer lsn is already in the dictionary, don't overwrite the dictionary.) // that if a newer lsn is already in the dictionary, don't overwrite the dictionary.)
int toku_remove_ft (FT h, char **error_string, BOOL oplsn_valid, LSN oplsn) { int toku_remove_ft (FT h, char **error_string, BOOL oplsn_valid, LSN oplsn) {
assert(!h->pinned_by_checkpoint);
int r = 0; int r = 0;
// Must do this work before closing the cf // Must do this work before closing the cf
if (h->cf) { if (h->cf) {
@ -534,11 +580,11 @@ int toku_remove_ft (FT h, char **error_string, BOOL oplsn_valid, LSN oplsn) {
// for this header, returns NULL // for this header, returns NULL
FT_HANDLE toku_ft_get_some_existing_ft_handle(FT h) { FT_HANDLE toku_ft_get_some_existing_ft_handle(FT h) {
FT_HANDLE ft_handle_ret = NULL; FT_HANDLE ft_handle_ret = NULL;
toku_ft_lock(h); toku_ft_grab_reflock(h);
if (!toku_list_empty(&h->live_ft_handles)) { if (!toku_list_empty(&h->live_ft_handles)) {
ft_handle_ret = toku_list_struct(toku_list_head(&h->live_ft_handles), struct ft_handle, live_ft_handle_link); ft_handle_ret = toku_list_struct(toku_list_head(&h->live_ft_handles), struct ft_handle, live_ft_handle_link);
} }
toku_ft_unlock(h); toku_ft_release_reflock(h);
return ft_handle_ret; return ft_handle_ret;
} }
@ -548,16 +594,16 @@ FT_HANDLE toku_ft_get_some_existing_ft_handle(FT h) {
// convenient here for keeping the HOT variables threadsafe.) // convenient here for keeping the HOT variables threadsafe.)
void void
toku_ft_note_hot_begin(FT_HANDLE brt) { toku_ft_note_hot_begin(FT_HANDLE brt) {
FT h = brt->ft; FT ft = brt->ft;
time_t now = time(NULL); time_t now = time(NULL);
// hold lock around setting and clearing of dirty bit // hold lock around setting and clearing of dirty bit
// (see cooperative use of dirty bit in ft_begin_checkpoint()) // (see cooperative use of dirty bit in ft_begin_checkpoint())
toku_ft_lock(h); toku_ft_lock(ft);
h->time_of_last_optimize_begin = now; ft->h->time_of_last_optimize_begin = now;
h->count_of_optimize_in_progress++; ft->h->count_of_optimize_in_progress++;
h->dirty = 1; ft->h->dirty = 1;
toku_ft_unlock(h); toku_ft_unlock(ft);
} }
@ -565,47 +611,45 @@ toku_ft_note_hot_begin(FT_HANDLE brt) {
// Note: See note for toku_ft_note_hot_begin(). // Note: See note for toku_ft_note_hot_begin().
void void
toku_ft_note_hot_complete(FT_HANDLE brt, BOOL success, MSN msn_at_start_of_hot) { toku_ft_note_hot_complete(FT_HANDLE brt, BOOL success, MSN msn_at_start_of_hot) {
FT h = brt->ft; FT ft = brt->ft;
time_t now = time(NULL); time_t now = time(NULL);
toku_ft_lock(h); toku_ft_lock(ft);
h->count_of_optimize_in_progress--; ft->h->count_of_optimize_in_progress--;
if (success) { if (success) {
h->time_of_last_optimize_end = now; ft->h->time_of_last_optimize_end = now;
h->msn_at_start_of_last_completed_optimize = msn_at_start_of_hot; ft->h->msn_at_start_of_last_completed_optimize = msn_at_start_of_hot;
// If we just successfully completed an optimization and no other thread is performing // If we just successfully completed an optimization and no other thread is performing
// an optimization, then the number of optimizations in progress is zero. // an optimization, then the number of optimizations in progress is zero.
// If there was a crash during a HOT optimization, this is how count_of_optimize_in_progress // If there was a crash during a HOT optimization, this is how count_of_optimize_in_progress
// would be reset to zero on the disk after recovery from that crash. // would be reset to zero on the disk after recovery from that crash.
if (h->count_of_optimize_in_progress == h->count_of_optimize_in_progress_read_from_disk) if (ft->h->count_of_optimize_in_progress == ft->h->count_of_optimize_in_progress_read_from_disk)
h->count_of_optimize_in_progress = 0; ft->h->count_of_optimize_in_progress = 0;
} }
h->dirty = 1; ft->h->dirty = 1;
toku_ft_unlock(h); toku_ft_unlock(ft);
} }
void void
toku_ft_init(FT h, toku_ft_init(FT ft,
BLOCKNUM root_blocknum_on_disk, LSN checkpoint_lsn, TXNID root_xid_that_created, uint32_t target_nodesize, uint32_t target_basementnodesize, enum toku_compression_method compression_method) { BLOCKNUM root_blocknum_on_disk,
memset(h, 0, sizeof *h); LSN checkpoint_lsn,
h->layout_version = FT_LAYOUT_VERSION; TXNID root_xid_that_created,
h->layout_version_original = FT_LAYOUT_VERSION; uint32_t target_nodesize,
h->build_id = BUILD_ID; uint32_t target_basementnodesize,
h->build_id_original = BUILD_ID; enum toku_compression_method compression_method)
uint64_t now = (uint64_t) time(NULL); {
h->time_of_creation = now; memset(ft, 0, sizeof *ft);
h->time_of_last_modification = now; struct ft_options options = {
h->time_of_last_verification = 0; .nodesize = target_nodesize,
h->checkpoint_count = 1; .basementnodesize = target_basementnodesize,
h->checkpoint_lsn = checkpoint_lsn; .compression_method = compression_method,
h->nodesize = target_nodesize; .flags = 0
h->basementnodesize = target_basementnodesize; };
h->root_blocknum = root_blocknum_on_disk; ft->h = ft_header_new(&options, root_blocknum_on_disk, root_xid_that_created);
h->flags = 0; ft->h->checkpoint_count = 1;
h->root_xid_that_created = root_xid_that_created; ft->h->checkpoint_lsn = checkpoint_lsn;
h->compression_method = compression_method;
h->highest_unused_msn_for_upgrade.msn = MIN_MSN.msn - 1;
} }
// Open a brt for use by redirect. The new brt must have the same dict_id as the old_ft passed in. (FILENUM is assigned by the ft_handle_open() function.) // Open a brt for use by redirect. The new brt must have the same dict_id as the old_ft passed in. (FILENUM is assigned by the ft_handle_open() function.)
@ -620,11 +664,11 @@ ft_handle_open_for_redirect(FT_HANDLE *new_ftp, const char *fname_in_env, TOKUTX
assert_zero(r); assert_zero(r);
r = toku_ft_set_update(t, old_h->update_fun); r = toku_ft_set_update(t, old_h->update_fun);
assert_zero(r); assert_zero(r);
r = toku_ft_set_nodesize(t, old_h->nodesize); r = toku_ft_set_nodesize(t, old_h->h->nodesize);
assert_zero(r); assert_zero(r);
r = toku_ft_set_basementnodesize(t, old_h->basementnodesize); r = toku_ft_set_basementnodesize(t, old_h->h->basementnodesize);
assert_zero(r); assert_zero(r);
r = toku_ft_set_compression_method(t, old_h->compression_method); r = toku_ft_set_compression_method(t, old_h->h->compression_method);
assert_zero(r); assert_zero(r);
CACHETABLE ct = toku_cachefile_get_cachetable(old_h->cf); CACHETABLE ct = toku_cachefile_get_cachetable(old_h->cf);
r = toku_ft_handle_open_with_dict_id(t, fname_in_env, 0, 0, ct, txn, old_h->dict_id); r = toku_ft_handle_open_with_dict_id(t, fname_in_env, 0, 0, ct, txn, old_h->dict_id);
@ -662,14 +706,13 @@ dictionary_redirect_internal(const char *dst_fname_in_env, FT src_h, TOKUTXN txn
// for each live brt, brt->ft is currently src_h // for each live brt, brt->ft is currently src_h
// we want to change it to dummy_dst // we want to change it to dummy_dst
toku_ft_grab_reflock(src_h);
while (!toku_list_empty(&src_h->live_ft_handles)) { while (!toku_list_empty(&src_h->live_ft_handles)) {
list = src_h->live_ft_handles.next; list = src_h->live_ft_handles.next;
FT_HANDLE src_handle = NULL; FT_HANDLE src_handle = NULL;
src_handle = toku_list_struct(list, struct ft_handle, live_ft_handle_link); src_handle = toku_list_struct(list, struct ft_handle, live_ft_handle_link);
toku_ft_lock(src_h);
toku_list_remove(&src_handle->live_ft_handle_link); toku_list_remove(&src_handle->live_ft_handle_link);
toku_ft_unlock(src_h);
toku_ft_note_ft_handle_open(dst_h, src_handle); toku_ft_note_ft_handle_open(dst_h, src_handle);
if (src_handle->redirect_callback) { if (src_handle->redirect_callback) {
@ -677,6 +720,9 @@ dictionary_redirect_internal(const char *dst_fname_in_env, FT src_h, TOKUTXN txn
} }
} }
assert(dst_h); assert(dst_h);
// making sure that we are not leaking src_h
assert(toku_ft_needed_unlocked(src_h));
toku_ft_release_reflock(src_h);
r = toku_ft_handle_close(tmp_dst_ft, FALSE, ZERO_LSN); r = toku_ft_handle_close(tmp_dst_ft, FALSE, ZERO_LSN);
assert_zero(r); assert_zero(r);
@ -699,23 +745,16 @@ toku_dictionary_redirect_abort(FT old_h, FT new_h, TOKUTXN txn) {
assert(old_filenum.fileid!=new_filenum.fileid); //Cannot be same file. assert(old_filenum.fileid!=new_filenum.fileid); //Cannot be same file.
//No living brts in old header. //No living brts in old header.
toku_ft_grab_reflock(old_h);
assert(toku_list_empty(&old_h->live_ft_handles)); assert(toku_list_empty(&old_h->live_ft_handles));
toku_ft_release_reflock(old_h);
} }
// If application did not close all DBs using the new file, then there should FT dst_h;
// be no zombies and we need to redirect the DBs back to the original file. // redirect back from new_h to old_h
if (!toku_list_empty(&new_h->live_ft_handles)) { r = dictionary_redirect_internal(old_fname_in_env, new_h, txn, &dst_h);
FT dst_h; assert_zero(r);
// redirect back from new_h to old_h assert(dst_h == old_h);
r = dictionary_redirect_internal(old_fname_in_env, new_h, txn, &dst_h);
assert_zero(r);
assert(dst_h == old_h);
}
else {
//No live brts.
//No need to redirect back.
r = 0;
}
return r; return r;
} }
@ -784,7 +823,6 @@ toku_dictionary_redirect (const char *dst_fname_in_env, FT_HANDLE old_ft, TOKUTX
// make rollback log entry // make rollback log entry
if (txn) { if (txn) {
assert(!toku_list_empty(&new_h->live_ft_handles));
r = toku_txn_note_ft(txn, new_h); // mark new brt as touched by this txn r = toku_txn_note_ft(txn, new_h); // mark new brt as touched by this txn
FILENUM old_filenum = toku_cachefile_filenum(old_h->cf); FILENUM old_filenum = toku_cachefile_filenum(old_h->cf);
@ -817,7 +855,7 @@ toku_ft_maybe_add_txn_ref(FT h, TOKUTXN txn) {
BOOL ref_added = FALSE; BOOL ref_added = FALSE;
OMTVALUE txnv; OMTVALUE txnv;
u_int32_t index; u_int32_t index;
toku_ft_lock(h); toku_ft_grab_reflock(h);
// Does brt already know about transaction txn? // Does brt already know about transaction txn?
int r = toku_omt_find_zero(h->txns, find_xid, txn, &txnv, &index); int r = toku_omt_find_zero(h->txns, find_xid, txn, &txnv, &index);
if (r==0) { if (r==0) {
@ -832,85 +870,80 @@ toku_ft_maybe_add_txn_ref(FT h, TOKUTXN txn) {
assert(r==0); assert(r==0);
ref_added = TRUE; ref_added = TRUE;
exit: exit:
toku_ft_unlock(h); toku_ft_release_reflock(h);
return ref_added; return ref_added;
} }
void static void
toku_ft_remove_txn_ref(FT h, TOKUTXN txn) { remove_txn_ref_callback(FT ft, void *context) {
TOKUTXN txn = context;
OMTVALUE txnv_again=NULL; OMTVALUE txnv_again=NULL;
u_int32_t index; u_int32_t index;
toku_ft_lock(h); int r = toku_omt_find_zero(ft->txns, find_xid, txn, &txnv_again, &index);
int r = toku_omt_find_zero(h->txns, find_xid, txn, &txnv_again, &index);
assert(r==0); assert(r==0);
assert(txnv_again == txn); assert(txnv_again == txn);
r = toku_omt_delete_at(h->txns, index); r = toku_omt_delete_at(ft->txns, index);
assert(r==0); assert(r==0);
// TODO: (Zardosht) figure out how to properly do this }
// below this unlock, are depending on ydb lock
toku_ft_unlock(h); void
if (!toku_ft_needed(h)) { toku_ft_remove_txn_ref(FT ft, TOKUTXN txn) {
//Close immediately. toku_ft_remove_reference(ft, false, ZERO_LSN, remove_txn_ref_callback, txn);
// I have no idea how this error string business works
char *error_string = NULL;
r = toku_remove_ft(h, &error_string, false, ZERO_LSN);
lazy_assert_zero(r);
}
} }
void toku_calculate_root_offset_pointer ( void toku_calculate_root_offset_pointer (
FT h, FT ft,
CACHEKEY* root_key, CACHEKEY* root_key,
u_int32_t *roothash u_int32_t *roothash
) )
{ {
*roothash = toku_cachetable_hash(h->cf, h->root_blocknum); *roothash = toku_cachetable_hash(ft->cf, ft->h->root_blocknum);
*root_key = h->root_blocknum; *root_key = ft->h->root_blocknum;
} }
void toku_ft_set_new_root_blocknum( void toku_ft_set_new_root_blocknum(
FT h, FT ft,
CACHEKEY new_root_key CACHEKEY new_root_key
) )
{ {
h->root_blocknum = new_root_key; ft->h->root_blocknum = new_root_key;
} }
LSN toku_ft_checkpoint_lsn(FT h) { LSN toku_ft_checkpoint_lsn(FT ft) {
return h->checkpoint_lsn; return ft->h->checkpoint_lsn;
} }
int toku_ft_set_panic(FT h, int panic, char *panic_string) { int toku_ft_set_panic(FT ft, int panic, char *panic_string) {
if (h->panic == 0) { if (ft->panic == 0) {
h->panic = panic; ft->panic = panic;
if (h->panic_string) { if (ft->panic_string) {
toku_free(h->panic_string); toku_free(ft->panic_string);
} }
h->panic_string = toku_strdup(panic_string); ft->panic_string = toku_strdup(panic_string);
} }
return 0; return 0;
} }
void void
toku_ft_stat64 (FT h, struct ftstat64_s *s) { toku_ft_stat64 (FT ft, struct ftstat64_s *s) {
s->fsize = toku_cachefile_size(h->cf); s->fsize = toku_cachefile_size(ft->cf);
// just use the in memory stats from the header // just use the in memory stats from the header
// prevent appearance of negative numbers for numrows, numbytes // prevent appearance of negative numbers for numrows, numbytes
int64_t n = h->in_memory_stats.numrows; int64_t n = ft->in_memory_stats.numrows;
if (n < 0) { if (n < 0) {
n = 0; n = 0;
} }
s->nkeys = s->ndata = n; s->nkeys = s->ndata = n;
n = h->in_memory_stats.numbytes; n = ft->in_memory_stats.numbytes;
if (n < 0) { if (n < 0) {
n = 0; n = 0;
} }
s->dsize = n; s->dsize = n;
// 4018 // 4018
s->create_time_sec = h->time_of_creation; s->create_time_sec = ft->h->time_of_creation;
s->modify_time_sec = h->time_of_last_modification; s->modify_time_sec = ft->h->time_of_last_modification;
s->verify_time_sec = h->time_of_last_verification; s->verify_time_sec = ft->h->time_of_last_verification;
} }
// TODO: (Zardosht), once the fdlock has been removed from cachetable, remove // TODO: (Zardosht), once the fdlock has been removed from cachetable, remove
@ -963,3 +996,33 @@ toku_ft_decrease_stats(STAT64INFO headerstats, STAT64INFO_S delta) {
(void) __sync_fetch_and_sub(&(headerstats->numrows), delta.numrows); (void) __sync_fetch_and_sub(&(headerstats->numrows), delta.numrows);
(void) __sync_fetch_and_sub(&(headerstats->numbytes), delta.numbytes); (void) __sync_fetch_and_sub(&(headerstats->numbytes), delta.numbytes);
} }
// Remove one reference from ft by invoking the caller-supplied callback, and
// close the ft if that leaves it unneeded.
//
// Parameters:
//   ft           - the ft whose reference is being dropped.
//   oplsn_valid  - passed through unchanged to toku_remove_ft().
//   oplsn        - passed through unchanged to toku_remove_ft().
//   remove_ref   - callback that performs the actual reference removal; it is
//                  always called as remove_ref(ft, extra) with the reflock held.
//   extra        - opaque context handed to remove_ref.
//
// On the close path, a nonzero return from toku_remove_ft() or a non-NULL
// error string trips an assertion; nothing is reported to the caller.
void
toku_ft_remove_reference(FT ft, bool oplsn_valid, LSN oplsn, remove_ft_ref_callback remove_ref, void *extra) {
toku_ft_grab_reflock(ft);
if (toku_ft_has_one_reference_unlocked(ft)) {
// Possibly the last reference: the reflock is dropped and then re-taken
// only after the open/close lock is held.
// NOTE(review): presumably the open/close lock must be acquired before the
// reflock to respect lock ordering -- confirm against the other lock users.
toku_ft_release_reflock(ft);
toku_ft_open_close_lock();
toku_ft_grab_reflock(ft);
remove_ref(ft, extra);
// Neededness is evaluated here, after the reference has been removed under
// the re-acquired reflock, rather than assumed from the earlier check made
// before the reflock was dropped.
BOOL needed = toku_ft_needed_unlocked(ft);
toku_ft_release_reflock(ft);
if (!needed) {
// Nothing needs this ft any more: close the header. This runs without the
// reflock but still under the open/close lock.
char *error_string = NULL;
int r;
r = toku_remove_ft(ft, &error_string, oplsn_valid, oplsn);
assert_zero(r);
assert(error_string == NULL);
}
toku_ft_open_close_unlock();
}
else {
// More than one reference was seen, so this path only removes the
// reference under the reflock and never attempts to close the ft.
remove_ref(ft, extra);
toku_ft_release_reflock(ft);
}
}

11
ft/ft.h
View file

@ -22,13 +22,19 @@ void toku_ft_destroy_treelock(FT h);
void toku_ft_grab_treelock(FT h); void toku_ft_grab_treelock(FT h);
void toku_ft_release_treelock(FT h); void toku_ft_release_treelock(FT h);
void toku_ft_init_reflock(FT ft);
void toku_ft_destroy_reflock(FT ft);
void toku_ft_grab_reflock(FT ft);
void toku_ft_release_reflock(FT ft);
int toku_create_new_ft(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn); int toku_create_new_ft(FT *ftp, FT_OPTIONS options, CACHEFILE cf, TOKUTXN txn);
void toku_ft_free (FT h); void toku_ft_free (FT h);
int toku_read_ft_and_store_in_cachefile (FT_HANDLE brt, CACHEFILE cf, LSN max_acceptable_lsn, FT *header, BOOL* was_open); int toku_read_ft_and_store_in_cachefile (FT_HANDLE brt, CACHEFILE cf, LSN max_acceptable_lsn, FT *header, BOOL* was_open);
void toku_ft_note_ft_handle_open(FT ft, FT_HANDLE live); void toku_ft_note_ft_handle_open(FT ft, FT_HANDLE live);
int toku_ft_needed(FT h); int toku_ft_needed_unlocked(FT h);
BOOL toku_ft_has_one_reference_unlocked(FT ft);
int toku_remove_ft (FT h, char **error_string, BOOL oplsn_valid, LSN oplsn) __attribute__ ((warn_unused_result)); int toku_remove_ft (FT h, char **error_string, BOOL oplsn_valid, LSN oplsn) __attribute__ ((warn_unused_result));
FT_HANDLE toku_ft_get_some_existing_ft_handle(FT h); FT_HANDLE toku_ft_get_some_existing_ft_handle(FT h);
@ -71,5 +77,8 @@ void toku_ft_update_cmp_descriptor(FT h);
void toku_ft_update_stats(STAT64INFO headerstats, STAT64INFO_S delta); void toku_ft_update_stats(STAT64INFO headerstats, STAT64INFO_S delta);
void toku_ft_decrease_stats(STAT64INFO headerstats, STAT64INFO_S delta); void toku_ft_decrease_stats(STAT64INFO headerstats, STAT64INFO_S delta);
// Drop one reference to ft. remove_ref(ft, extra) is invoked with the ft's
// reflock held to perform the actual removal. If the ft is no longer needed
// afterwards, it is closed via toku_remove_ft(), which receives oplsn_valid
// and oplsn unchanged.
void toku_ft_remove_reference(FT ft,
bool oplsn_valid, LSN oplsn,
remove_ft_ref_callback remove_ref, void *extra);
#endif #endif

View file

@ -881,8 +881,8 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA
int r = toku_serialize_ftnode_to_memory( int r = toku_serialize_ftnode_to_memory(
node, node,
ndd, ndd,
h->basementnodesize, h->h->basementnodesize,
h->compression_method, h->h->compression_method,
do_rebalancing, do_rebalancing,
FALSE, // in_parallel FALSE, // in_parallel
&n_to_write, &n_to_write,
@ -1786,7 +1786,7 @@ deserialize_and_upgrade_internal_node(FTNODE node,
// of messages in the buffer. // of messages in the buffer.
MSN lowest; MSN lowest;
u_int64_t amount = n_in_this_buffer; u_int64_t amount = n_in_this_buffer;
lowest.msn = __sync_sub_and_fetch(&bfe->h->highest_unused_msn_for_upgrade.msn, amount); lowest.msn = __sync_sub_and_fetch(&bfe->h->h->highest_unused_msn_for_upgrade.msn, amount);
if (highest_msn.msn == 0) { if (highest_msn.msn == 0) {
highest_msn.msn = lowest.msn + n_in_this_buffer; highest_msn.msn = lowest.msn + n_in_this_buffer;
} }
@ -2035,7 +2035,7 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
// Whatever this is must be less than the MSNs of every message above // Whatever this is must be less than the MSNs of every message above
// it, so it's ok to take it here. // it, so it's ok to take it here.
bn->max_msn_applied = bfe->h->highest_unused_msn_for_upgrade; bn->max_msn_applied = bfe->h->h->highest_unused_msn_for_upgrade;
bn->stale_ancestor_messages_applied = false; bn->stale_ancestor_messages_applied = false;
node->max_msn_applied_to_node_on_disk = bn->max_msn_applied; node->max_msn_applied_to_node_on_disk = bn->max_msn_applied;
@ -2625,7 +2625,7 @@ toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log
size_t n_to_write; size_t n_to_write;
char *compressed_buf; char *compressed_buf;
{ {
int r = toku_serialize_rollback_log_to_memory(log, n_workitems, n_threads, h->compression_method, &n_to_write, &compressed_buf); int r = toku_serialize_rollback_log_to_memory(log, n_workitems, n_threads, h->h->compression_method, &n_to_write, &compressed_buf);
if (r!=0) return r; if (r!=0) return r;
} }
@ -2949,9 +2949,9 @@ toku_upgrade_subtree_estimates_to_stat64info(int fd, FT h)
FTNODE_DISK_DATA unused_ndd = NULL; FTNODE_DISK_DATA unused_ndd = NULL;
struct ftnode_fetch_extra bfe; struct ftnode_fetch_extra bfe;
fill_bfe_for_min_read(&bfe, h); fill_bfe_for_min_read(&bfe, h);
e = deserialize_ftnode_from_fd(fd, h->root_blocknum, 0, &unused_node, &unused_ndd, e = deserialize_ftnode_from_fd(fd, h->h->root_blocknum, 0, &unused_node, &unused_ndd,
&bfe, &h->on_disk_stats); &bfe, &h->h->on_disk_stats);
h->in_memory_stats = h->on_disk_stats; h->in_memory_stats = h->h->on_disk_stats;
if (unused_node) { if (unused_node) {
toku_ftnode_free(&unused_node); toku_ftnode_free(&unused_node);

View file

@ -85,34 +85,34 @@ dump_descriptor(DESCRIPTOR d) {
static void static void
dump_header (int f, FT *header, CACHEFILE cf) { dump_header (int f, FT *header, CACHEFILE cf) {
FT h; FT ft;
int r; int r;
char timestr[26]; char timestr[26];
r = toku_deserialize_ft_from (f, MAX_LSN, &h); r = toku_deserialize_ft_from (f, MAX_LSN, &ft);
assert(r==0); assert(r==0);
h->cf = cf; ft->cf = cf;
printf("ft:\n"); printf("ft:\n");
printf(" layout_version=%d\n", h->layout_version); printf(" layout_version=%d\n", ft->h->layout_version);
printf(" layout_version_original=%d\n", h->layout_version_original); printf(" layout_version_original=%d\n", ft->h->layout_version_original);
printf(" layout_version_read_from_disk=%d\n", h->layout_version_read_from_disk); printf(" layout_version_read_from_disk=%d\n", ft->layout_version_read_from_disk);
printf(" build_id=%d\n", h->build_id); printf(" build_id=%d\n", ft->h->build_id);
printf(" build_id_original=%d\n", h->build_id_original); printf(" build_id_original=%d\n", ft->h->build_id_original);
format_time(h->time_of_creation, timestr); format_time(ft->h->time_of_creation, timestr);
printf(" time_of_creation= %"PRIu64" %s\n", h->time_of_creation, timestr); printf(" time_of_creation= %"PRIu64" %s\n", ft->h->time_of_creation, timestr);
format_time(h->time_of_last_modification, timestr); format_time(ft->h->time_of_last_modification, timestr);
printf(" time_of_last_modification=%"PRIu64" %s\n", h->time_of_last_modification, timestr); printf(" time_of_last_modification=%"PRIu64" %s\n", ft->h->time_of_last_modification, timestr);
printf(" dirty=%d\n", h->dirty); printf(" dirty=%d\n", ft->h->dirty);
printf(" checkpoint_count=%" PRId64 "\n", h->checkpoint_count); printf(" checkpoint_count=%" PRId64 "\n", ft->h->checkpoint_count);
printf(" checkpoint_lsn=%" PRId64 "\n", h->checkpoint_lsn.lsn); printf(" checkpoint_lsn=%" PRId64 "\n", ft->h->checkpoint_lsn.lsn);
printf(" nodesize=%u\n", h->nodesize); printf(" nodesize=%u\n", ft->h->nodesize);
printf(" basementnodesize=%u\n", h->basementnodesize); printf(" basementnodesize=%u\n", ft->h->basementnodesize);
printf(" compression_method=%u\n", (unsigned) h->compression_method); printf(" compression_method=%u\n", (unsigned) ft->h->compression_method);
printf(" unnamed_root=%" PRId64 "\n", h->root_blocknum.b); printf(" unnamed_root=%" PRId64 "\n", ft->h->root_blocknum.b);
printf(" flags=%u\n", h->flags); printf(" flags=%u\n", ft->h->flags);
dump_descriptor(&h->descriptor); dump_descriptor(&ft->descriptor);
printf(" estimated numrows=%" PRId64 "\n", h->in_memory_stats.numrows); printf(" estimated numrows=%" PRId64 "\n", ft->in_memory_stats.numrows);
printf(" estimated numbytes=%" PRId64 "\n", h->in_memory_stats.numbytes); printf(" estimated numbytes=%" PRId64 "\n", ft->in_memory_stats.numbytes);
*header = h; *header = ft;
} }
static int static int
@ -506,14 +506,14 @@ main (int argc, const char *const argv[]) {
const char *n = argv[0]; const char *n = argv[0];
int f = open(n, O_RDWR + O_BINARY); assert(f>=0); int f = open(n, O_RDWR + O_BINARY); assert(f>=0);
FT h; FT ft;
// create a cachefile for the header // create a cachefile for the header
int r = toku_create_cachetable(&ct, 1<<25, (LSN){0}, 0); int r = toku_create_cachetable(&ct, 1<<25, (LSN){0}, 0);
assert(r == 0); assert(r == 0);
CACHEFILE cf; CACHEFILE cf;
r = toku_cachetable_openfd (&cf, ct, f, n); r = toku_cachetable_openfd (&cf, ct, f, n);
assert(r==0); assert(r==0);
dump_header(f, &h, cf); dump_header(f, &ft, cf);
if (interactive) { if (interactive) {
while (1) { while (1) {
printf("ftdump>"); fflush(stdout); printf("ftdump>"); fflush(stdout);
@ -530,25 +530,25 @@ main (int argc, const char *const argv[]) {
if (strcmp(fields[0], "help") == 0) { if (strcmp(fields[0], "help") == 0) {
interactive_help(); interactive_help();
} else if (strcmp(fields[0], "header") == 0) { } else if (strcmp(fields[0], "header") == 0) {
toku_ft_free(h); toku_ft_free(ft);
dump_header(f, &h, cf); dump_header(f, &ft, cf);
} else if (strcmp(fields[0], "block") == 0 && nfields == 2) { } else if (strcmp(fields[0], "block") == 0 && nfields == 2) {
BLOCKNUM blocknum = make_blocknum(getuint64(fields[1])); BLOCKNUM blocknum = make_blocknum(getuint64(fields[1]));
dump_block(f, blocknum, h); dump_block(f, blocknum, ft);
} else if (strcmp(fields[0], "node") == 0 && nfields == 2) { } else if (strcmp(fields[0], "node") == 0 && nfields == 2) {
BLOCKNUM off = make_blocknum(getuint64(fields[1])); BLOCKNUM off = make_blocknum(getuint64(fields[1]));
dump_node(f, off, h); dump_node(f, off, ft);
} else if (strcmp(fields[0], "dumpdata") == 0 && nfields == 2) { } else if (strcmp(fields[0], "dumpdata") == 0 && nfields == 2) {
dump_data = strtol(fields[1], NULL, 10); dump_data = strtol(fields[1], NULL, 10);
} else if (strcmp(fields[0], "block_translation") == 0 || strcmp(fields[0], "bx") == 0) { } else if (strcmp(fields[0], "block_translation") == 0 || strcmp(fields[0], "bx") == 0) {
u_int64_t offset = 0; u_int64_t offset = 0;
if (nfields == 2) if (nfields == 2)
offset = getuint64(fields[1]); offset = getuint64(fields[1]);
dump_block_translation(h, offset); dump_block_translation(ft, offset);
} else if (strcmp(fields[0], "fragmentation") == 0) { } else if (strcmp(fields[0], "fragmentation") == 0) {
dump_fragmentation(f, h); dump_fragmentation(f, ft);
} else if (strcmp(fields[0], "garbage") == 0) { } else if (strcmp(fields[0], "garbage") == 0) {
dump_garbage_stats(f, h); dump_garbage_stats(f, ft);
} else if (strcmp(fields[0], "file") == 0 && nfields >= 3) { } else if (strcmp(fields[0], "file") == 0 && nfields >= 3) {
u_int64_t offset = getuint64(fields[1]); u_int64_t offset = getuint64(fields[1]);
u_int64_t size = getuint64(fields[2]); u_int64_t size = getuint64(fields[2]);
@ -565,18 +565,18 @@ main (int argc, const char *const argv[]) {
} }
} }
} else if (rootnode) { } else if (rootnode) {
dump_node(f, h->root_blocknum, h); dump_node(f, ft->h->root_blocknum, ft);
} else { } else {
printf("Block translation:"); printf("Block translation:");
toku_dump_translation_table(stdout, h->blocktable); toku_dump_translation_table(stdout, ft->blocktable);
struct __dump_node_extra info; struct __dump_node_extra info;
info.f = f; info.f = f;
info.h = h; info.h = ft;
toku_blocktable_iterate(h->blocktable, TRANSLATION_CHECKPOINTED, toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED,
dump_node_wrapper, &info, TRUE, TRUE); dump_node_wrapper, &info, TRUE, TRUE);
} }
toku_ft_free(h); toku_ft_free(ft);
return 0; return 0;
} }

View file

@ -507,7 +507,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
#define SET_TO_MY_STRDUP(lval, s) do { char *v = toku_strdup(s); if (!v) { int r = errno; toku_ft_loader_internal_destroy(bl, TRUE); return r; } lval = v; } while (0) #define SET_TO_MY_STRDUP(lval, s) do { char *v = toku_strdup(s); if (!v) { int r = errno; toku_ft_loader_internal_destroy(bl, TRUE); return r; } lval = v; } while (0)
MY_CALLOC_N(N, bl->root_xids_that_created); MY_CALLOC_N(N, bl->root_xids_that_created);
for (int i=0; i<N; i++) if (brts[i]) bl->root_xids_that_created[i]=brts[i]->ft->root_xid_that_created; for (int i=0; i<N; i++) if (brts[i]) bl->root_xids_that_created[i]=brts[i]->ft->h->root_xid_that_created;
MY_CALLOC_N(N, bl->dbs); MY_CALLOC_N(N, bl->dbs);
for (int i=0; i<N; i++) if (brts[i]) bl->dbs[i]=dbs[i]; for (int i=0; i<N; i++) if (brts[i]) bl->dbs[i]=dbs[i];
MY_CALLOC_N(N, bl->descriptors); MY_CALLOC_N(N, bl->descriptors);
@ -2206,11 +2206,12 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
if (bl->root_xids_that_created) if (bl->root_xids_that_created)
root_xid_that_created = bl->root_xids_that_created[which_db]; root_xid_that_created = bl->root_xids_that_created[which_db];
struct ft h; // TODO: (Zardosht/Yoni/Leif), do this code properly
toku_ft_init(&h, (BLOCKNUM){0}, bl->load_lsn, root_xid_that_created, target_nodesize, target_basementnodesize, target_compression_method); struct ft ft;
toku_ft_init(&ft, (BLOCKNUM){0}, bl->load_lsn, root_xid_that_created, target_nodesize, target_basementnodesize, target_compression_method);
struct dbout out; struct dbout out;
dbout_init(&out, &h); dbout_init(&out, &ft);
out.fd = fd; out.fd = fd;
out.current_off = 8192; // leave 8K reserved at beginning out.current_off = 8192; // leave 8K reserved at beginning
out.n_translations = 3; // 3 translations reserved at the beginning out.n_translations = 3; // 3 translations reserved at the beginning
@ -2333,7 +2334,7 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
} }
if (deltas.numrows || deltas.numbytes) { if (deltas.numrows || deltas.numbytes) {
toku_ft_update_stats(&h.in_memory_stats, deltas); toku_ft_update_stats(&ft.in_memory_stats, deltas);
} }
cleanup_maxkey(&maxkey); cleanup_maxkey(&maxkey);
@ -2375,7 +2376,7 @@ static int toku_loader_write_ft_from_q (FTLOADER bl,
{ {
invariant(sts.n_subtrees==1); invariant(sts.n_subtrees==1);
out.h->root_blocknum = make_blocknum(sts.subtrees[0].block); out.h->h->root_blocknum = make_blocknum(sts.subtrees[0].block);
toku_free(sts.subtrees); sts.subtrees = NULL; toku_free(sts.subtrees); sts.subtrees = NULL;
// write the descriptor // write the descriptor
@ -2766,16 +2767,15 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla
static int static int
write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) { write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) {
int result = 0; int result = 0;
unsigned int size = toku_serialize_ft_size (out->h->h);
out->h->checkpoint_staging_stats = out->h->in_memory_stats; // #4184
unsigned int size = toku_serialize_ft_size (out->h);
struct wbuf wbuf; struct wbuf wbuf;
char *MALLOC_N(size, buf); char *MALLOC_N(size, buf);
if (buf == NULL) { if (buf == NULL) {
result = errno; result = errno;
} else { } else {
wbuf_init(&wbuf, buf, size); wbuf_init(&wbuf, buf, size);
toku_serialize_ft_to_wbuf(&wbuf, out->h, translation_location_on_disk, translation_size_on_disk); out->h->h->on_disk_stats = out->h->in_memory_stats;
toku_serialize_ft_to_wbuf(&wbuf, out->h->h, translation_location_on_disk, translation_size_on_disk);
if (wbuf.ndone != size) if (wbuf.ndone != size)
result = EINVAL; result = EINVAL;
else else

View file

@ -38,6 +38,7 @@ typedef struct ftnode_leaf_basement_node *BASEMENTNODE;
typedef struct ftnode_nonleaf_childinfo *NONLEAF_CHILDINFO; typedef struct ftnode_nonleaf_childinfo *NONLEAF_CHILDINFO;
typedef struct sub_block *SUB_BLOCK; typedef struct sub_block *SUB_BLOCK;
typedef struct ft *FT; typedef struct ft *FT;
typedef struct ft_header *FT_HEADER;
typedef struct ft_options *FT_OPTIONS; typedef struct ft_options *FT_OPTIONS;
struct wbuf; struct wbuf;
struct dbuf; struct dbuf;
@ -252,6 +253,7 @@ typedef int (*ft_compare_func)(DB *, const DBT *, const DBT *);
typedef void (*setval_func)(const DBT *, void *); typedef void (*setval_func)(const DBT *, void *);
typedef int (*ft_update_func)(DB *, const DBT *, const DBT *, const DBT *, setval_func, void *); typedef int (*ft_update_func)(DB *, const DBT *, const DBT *, const DBT *, setval_func, void *);
typedef void (*on_redirect_callback)(FT_HANDLE, void*); typedef void (*on_redirect_callback)(FT_HANDLE, void*);
typedef void (*remove_ft_ref_callback)(FT, void*);
#define UU(x) x __attribute__((__unused__)) #define UU(x) x __attribute__((__unused__))

View file

@ -197,7 +197,7 @@ toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, BOOL create)
//Verify it is empty //Verify it is empty
assert(!t->ft->panic); assert(!t->ft->panic);
//Must have no data blocks (rollback logs or otherwise). //Must have no data blocks (rollback logs or otherwise).
toku_block_verify_no_data_blocks_except_root_unlocked(t->ft->blocktable, t->ft->root_blocknum); toku_block_verify_no_data_blocks_except_root_unlocked(t->ft->blocktable, t->ft->h->root_blocknum);
BOOL is_empty; BOOL is_empty;
is_empty = toku_ft_is_empty_fast(t); is_empty = toku_ft_is_empty_fast(t);
assert(is_empty); assert(is_empty);
@ -216,26 +216,26 @@ toku_logger_close_rollback(TOKULOGGER logger, BOOL recovery_failed) {
if (!logger->is_panicked && cf) { if (!logger->is_panicked && cf) {
FT_HANDLE ft_to_close; FT_HANDLE ft_to_close;
{ //Find "brt" { //Find "brt"
FT h = toku_cachefile_get_userdata(cf); FT ft = toku_cachefile_get_userdata(cf);
if (!h->panic && recovery_failed) { if (!ft->panic && recovery_failed) {
r = toku_ft_set_panic(h, EINVAL, "Recovery failed"); r = toku_ft_set_panic(ft, EINVAL, "Recovery failed");
assert_zero(r); assert_zero(r);
} }
//Verify it is safe to close it. //Verify it is safe to close it.
if (!h->panic) { //If paniced, it is safe to close. if (!ft->panic) { //If paniced, it is safe to close.
assert(!h->dirty); //Must not be dirty. assert(!ft->h->dirty); //Must not be dirty.
//Must have no data blocks (rollback logs or otherwise). //Must have no data blocks (rollback logs or otherwise).
toku_block_verify_no_data_blocks_except_root_unlocked(h->blocktable, h->root_blocknum); toku_block_verify_no_data_blocks_except_root_unlocked(ft->blocktable, ft->h->root_blocknum);
} }
assert(!h->dirty); assert(!ft->h->dirty);
ft_to_close = toku_ft_get_some_existing_ft_handle(h); ft_to_close = toku_ft_get_some_existing_ft_handle(ft);
assert(ft_to_close); assert(ft_to_close);
{ {
BOOL is_empty; BOOL is_empty;
is_empty = toku_ft_is_empty_fast(ft_to_close); is_empty = toku_ft_is_empty_fast(ft_to_close);
assert(is_empty); assert(is_empty);
} }
assert(!h->dirty); // it should not have been dirtied by the toku_ft_is_empty test. assert(!ft->h->dirty); // it should not have been dirtied by the toku_ft_is_empty test.
} }
r = toku_ft_handle_close(ft_to_close, FALSE, ZERO_LSN); r = toku_ft_handle_close(ft_to_close, FALSE, ZERO_LSN);

View file

@ -330,11 +330,15 @@ test_prefetching(void) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20

View file

@ -273,11 +273,15 @@ test_serialize_nonleaf(void) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -359,11 +363,15 @@ test_serialize_leaf(void) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20

View file

@ -104,11 +104,15 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
brt_h->compare_fun = long_key_cmp; brt_h->compare_fun = long_key_cmp;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
@ -237,11 +241,15 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
brt_h->compare_fun = long_key_cmp; brt_h->compare_fun = long_key_cmp;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);

View file

@ -250,11 +250,15 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, BOOL do_clone) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -392,11 +396,15 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, BOOL do_clone
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -531,11 +539,15 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, BOOL do_clone) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -675,11 +687,15 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, BOOL do_clone)
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -835,11 +851,15 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, BOOL
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -959,11 +979,15 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -1088,11 +1112,15 @@ test_serialize_leaf(enum ftnode_verify_type bft, BOOL do_clone) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20
@ -1230,11 +1258,15 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, BOOL do_clone) {
FT_HANDLE XMALLOC(brt); FT_HANDLE XMALLOC(brt);
FT XCALLOC(brt_h); FT XCALLOC(brt_h);
toku_ft_init(brt_h,
make_blocknum(0),
ZERO_LSN,
TXNID_NONE,
4*1024*1024,
128*1024,
TOKU_DEFAULT_COMPRESSION_METHOD);
brt->ft = brt_h; brt->ft = brt_h;
brt_h->type = FT_CURRENT;
brt_h->panic = 0; brt_h->panic_string = 0; brt_h->panic = 0; brt_h->panic_string = 0;
brt_h->basementnodesize = 128*1024;
brt_h->compression_method = TOKU_DEFAULT_COMPRESSION_METHOD;
toku_ft_init_treelock(brt_h); toku_ft_init_treelock(brt_h);
toku_blocktable_create_new(&brt_h->blocktable); toku_blocktable_create_new(&brt_h->blocktable);
//Want to use block #20 //Want to use block #20

View file

@ -25,14 +25,15 @@ static void test_header (void) {
r = toku_open_ft_handle(fname, 1, &t, 1024, 256, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); r = toku_open_ft_handle(fname, 1, &t, 1024, 256, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun);
assert(r==0); assert(r==0);
// now insert some info into the header // now insert some info into the header
FT h = t->ft; FT ft = t->ft;
h->dirty = 1; ft->h->dirty = 1;
h->layout_version_original = 13; // cast away const because we actually want to fiddle with the header
h->layout_version_read_from_disk = 14; // in this test
h->build_id_original = 1234; *((int *) &ft->h->layout_version_original) = 13;
h->in_memory_stats = (STAT64INFO_S) {10, 11}; ft->layout_version_read_from_disk = 14;
h->on_disk_stats = (STAT64INFO_S) {20, 21}; *((uint32_t *) &ft->h->build_id_original) = 1234;
h->checkpoint_staging_stats = (STAT64INFO_S) {30, 31}; ft->in_memory_stats = (STAT64INFO_S) {10, 11};
ft->h->on_disk_stats = (STAT64INFO_S) {20, 21};
r = toku_close_ft_handle_nolsn(t, 0); assert(r==0); r = toku_close_ft_handle_nolsn(t, 0); assert(r==0);
r = toku_cachetable_close(&ct); r = toku_cachetable_close(&ct);
assert(r==0); assert(r==0);
@ -43,20 +44,17 @@ static void test_header (void) {
r = toku_open_ft_handle(fname, 0, &t, 1024, 256, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); r = toku_open_ft_handle(fname, 0, &t, 1024, 256, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun);
assert(r==0); assert(r==0);
h = t->ft; ft = t->ft;
STAT64INFO_S expected_stats = {20, 21}; // on checkpoint, on_disk_stats copied to checkpoint_staging_stats STAT64INFO_S expected_stats = {20, 21}; // on checkpoint, on_disk_stats copied to ft->checkpoint_header->on_disk_stats
assert(h->layout_version == FT_LAYOUT_VERSION); assert(ft->h->layout_version == FT_LAYOUT_VERSION);
assert(h->layout_version_original == 13); assert(ft->h->layout_version_original == 13);
assert(h->layout_version_read_from_disk == FT_LAYOUT_VERSION); assert(ft->layout_version_read_from_disk == FT_LAYOUT_VERSION);
assert(h->build_id_original == 1234); assert(ft->h->build_id_original == 1234);
assert(h->in_memory_stats.numrows == expected_stats.numrows); assert(ft->in_memory_stats.numrows == expected_stats.numrows);
assert(h->on_disk_stats.numbytes == expected_stats.numbytes); assert(ft->h->on_disk_stats.numbytes == expected_stats.numbytes);
r = toku_close_ft_handle_nolsn(t, 0); assert(r==0); r = toku_close_ft_handle_nolsn(t, 0); assert(r==0);
r = toku_cachetable_close(&ct); r = toku_cachetable_close(&ct);
assert(r==0); assert(r==0);
} }
int int

View file

@ -658,7 +658,6 @@ static int remove_txn (OMTVALUE hv, u_int32_t UU(idx), void *txnv)
if (txn->txnid64==h->txnid_that_created_or_locked_when_empty) { if (txn->txnid64==h->txnid_that_created_or_locked_when_empty) {
h->txnid_that_created_or_locked_when_empty = TXNID_NONE; h->txnid_that_created_or_locked_when_empty = TXNID_NONE;
h->root_that_created_or_locked_when_empty = TXNID_NONE;
} }
if (txn->txnid64==h->txnid_that_suppressed_recovery_logs) { if (txn->txnid64==h->txnid_that_suppressed_recovery_logs) {
h->txnid_that_suppressed_recovery_logs = TXNID_NONE; h->txnid_that_suppressed_recovery_logs = TXNID_NONE;

View file

@ -500,7 +500,8 @@ toku_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t
goto cleanup; goto cleanup;
} }
if (!is_db_hot_index) { if (!is_db_hot_index) {
r = toku_db_pre_acquire_fileops_lock(db, txn); //TODO(zardosht): why doesn't hot_index need to do locking?
r = toku_db_pre_acquire_table_lock(db, txn);
if (r != 0) { goto cleanup; } if (r != 0) { goto cleanup; }
} }
@ -677,9 +678,9 @@ locked_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYP
// Wrapper that calls toku_db_change_descriptor(db, txn, descriptor, flags)
// between a lock call and the matching unlock call, and returns that call's
// result unchanged.
// NOTE(review): each row below appears to show the old and the new text of the
// same source line side by side; on the rows that differ, the left text uses
// toku_ydb_lock/toku_ydb_unlock and the right text uses
// toku_multi_operation_client_lock/toku_multi_operation_client_unlock.
static int static int
locked_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) { locked_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) {
toku_ydb_lock(); toku_multi_operation_client_lock(); //Cannot begin checkpoint
int r = toku_db_change_descriptor(db, txn, descriptor, flags); int r = toku_db_change_descriptor(db, txn, descriptor, flags);
toku_ydb_unlock(); toku_multi_operation_client_unlock(); //Can now begin checkpoint
return r; return r;
} }

View file

@ -19,6 +19,13 @@ struct toku_list {
struct toku_list *next, *prev; struct toku_list *next, *prev;
}; };
// Coarse size estimate for the list anchored at head; never walks the list.
//   0 - head->next points back at head (the state toku_list_init leaves behind)
//   1 - head->next and head->prev are the same node
//   2 - anything else (presumably "two or more"; the exact count is not computed)
static inline int toku_list_num_elements_est(struct toku_list *head) {
    struct toku_list *first = head->next;
    int estimate;

    if (first == head) {
        estimate = 0;
    } else if (first == head->prev) {
        estimate = 1;
    } else {
        estimate = 2;
    }
    return estimate;
}
// Initializes a list head by pointing both its next and prev links at head itself.
static inline void toku_list_init(struct toku_list *head) { static inline void toku_list_init(struct toku_list *head) {
head->next = head->prev = head; head->next = head->prev = head;
} }