/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.

Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include <my_global.h>

#include "ft/ft.h"
#include "ft/ft-cachetable-wrappers.h"
#include "ft/ft-internal.h"
#include "ft/ft-flusher.h"
#include "ft/ft-flusher-internal.h"
#include "ft/node.h"
#include "ft/serialize/block_table.h"
#include "ft/serialize/ft_node-serialize.h"
#include "portability/toku_assert.h"
#include "portability/toku_atomic.h"
#include "util/status.h"
#include "util/context.h"

void toku_ft_flusher_get_status(FT_FLUSHER_STATUS status) {
    fl_status.init();
    *status = fl_status;
}

//
// For test purposes only.
// These callbacks are never used in production code, only as a way
// to test the system (for example, by causing crashes at predictable times).
//
static void (*flusher_thread_callback)(int, void*) = NULL;
static void *flusher_thread_callback_extra = NULL;

void toku_flusher_thread_set_callback(void (*callback_f)(int, void*),
                                      void* extra) {
    flusher_thread_callback = callback_f;
    flusher_thread_callback_extra = extra;
}

static void call_flusher_thread_callback(int flt_state) {
    if (flusher_thread_callback) {
        flusher_thread_callback(flt_state, flusher_thread_callback_extra);
    }
}

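// Effect: Return the childnum whose message buffer most deserves a flush.
// A child's weight is the number of bytes currently in its buffer plus the
// work already done on its behalf (BP_WORKDONE); ties keep the lower childnum.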
static int
find_heaviest_child(FTNODE node)
{
    int max_child = 0;
    uint64_t max_weight = toku_bnc_nbytesinbuf(BNC(node, 0)) + BP_WORKDONE(node, 0);

    invariant(node->n_children > 0);
    for (int i = 1; i < node->n_children; i++) {
        uint64_t bytes_in_buf = toku_bnc_nbytesinbuf(BNC(node, i));
        uint64_t workdone = BP_WORKDONE(node, i);
        if (workdone > 0) {
            invariant(bytes_in_buf > 0);
        }
        uint64_t this_weight = bytes_in_buf + workdone;
        if (max_weight < this_weight) {
            max_child = i;
            max_weight = this_weight;
        }
    }
    return max_child;
}

static void
update_flush_status(FTNODE child, int cascades) {
    FL_STATUS_VAL(FT_FLUSHER_FLUSH_TOTAL)++;
    if (cascades > 0) {
        FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES)++;
        switch (cascades) {
        case 1:
            FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES_1)++; break;
        case 2:
            FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES_2)++; break;
        case 3:
            FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES_3)++; break;
        case 4:
            FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES_4)++; break;
        case 5:
            FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES_5)++; break;
        default:
            FL_STATUS_VAL(FT_FLUSHER_FLUSH_CASCADES_GT_5)++; break;
        }
    }
    bool flush_needs_io = false;
    for (int i = 0; !flush_needs_io && i < child->n_children; ++i) {
        if (BP_STATE(child, i) == PT_ON_DISK) {
            flush_needs_io = true;
        }
    }
    if (flush_needs_io) {
        FL_STATUS_VAL(FT_FLUSHER_FLUSH_NEEDED_IO)++;
    } else {
        FL_STATUS_VAL(FT_FLUSHER_FLUSH_IN_MEMORY)++;
    }
}

static void
maybe_destroy_child_blbs(FTNODE node, FTNODE child, FT ft)
{
    // If the node is already fully in memory, as in upgrade, we don't
    // need to destroy the basement nodes because they are all equally
    // up to date.
    if (child->n_children > 1 &&
        child->height == 0 &&
        !child->dirty()) {
        for (int i = 0; i < child->n_children; ++i) {
            if (BP_STATE(child, i) == PT_AVAIL &&
                node->max_msn_applied_to_node_on_disk.msn < BLB_MAX_MSN_APPLIED(child, i).msn)
            {
                toku_evict_bn_from_memory(child, i, ft);
            }
        }
    }
}

static void
ft_merge_child(
    FT ft,
    FTNODE node,
    int childnum_to_merge,
    bool *did_react,
    struct flusher_advice *fa);

static int
pick_heaviest_child(FT UU(ft),
                    FTNODE parent,
                    void* UU(extra))
{
    int childnum = find_heaviest_child(parent);
    paranoid_invariant(toku_bnc_n_entries(BNC(parent, childnum))>0);
    return childnum;
}

bool
dont_destroy_basement_nodes(void* UU(extra))
{
    return false;
}

static bool
do_destroy_basement_nodes(void* UU(extra))
{
    return true;
}

bool
always_recursively_flush(FTNODE UU(child), void* UU(extra))
{
    return true;
}

bool
never_recursively_flush(FTNODE UU(child), void* UU(extra))
{
    return false;
}

/**
 * Flusher thread ("normal" flushing) implementation.
 */
struct flush_status_update_extra {
    int cascades;
    uint32_t nodesize;
};

static bool
recurse_if_child_is_gorged(FTNODE child, void* extra)
{
    struct flush_status_update_extra *fste = (flush_status_update_extra *)extra;
    return toku_ftnode_nonleaf_is_gorged(child, fste->nodesize);
}

int
default_pick_child_after_split(FT UU(ft),
                               FTNODE UU(parent),
                               int UU(childnuma),
                               int UU(childnumb),
                               void* UU(extra))
{
    return -1;
}

void
default_merge_child(struct flusher_advice *fa,
                    FT ft,
                    FTNODE parent,
                    int childnum,
                    FTNODE child,
                    void* UU(extra))
{
    //
    // There is probably a way to pass FTNODE child
    // into ft_merge_child, but for simplicity for now,
    // we are just going to unpin child and
    // let ft_merge_child pin it again
    //
    toku_unpin_ftnode(ft, child);
    //
    // it is the responsibility of ft_merge_child to unlock parent
    //
    bool did_react;
    ft_merge_child(ft, parent, childnum, &did_react, fa);
}

void
flusher_advice_init(
    struct flusher_advice *fa,
    FA_PICK_CHILD pick_child,
    FA_SHOULD_DESTROY_BN should_destroy_basement_nodes,
    FA_SHOULD_RECURSIVELY_FLUSH should_recursively_flush,
    FA_MAYBE_MERGE_CHILD maybe_merge_child,
    FA_UPDATE_STATUS update_status,
    FA_PICK_CHILD_AFTER_SPLIT pick_child_after_split,
    void* extra
    )
{
    fa->pick_child = pick_child;
    fa->should_destroy_basement_nodes = should_destroy_basement_nodes;
    fa->should_recursively_flush = should_recursively_flush;
    fa->maybe_merge_child = maybe_merge_child;
    fa->update_status = update_status;
    fa->pick_child_after_split = pick_child_after_split;
    fa->extra = extra;
}

static void
flt_update_status(FTNODE child,
                  int UU(dirtied),
                  void* extra)
{
    struct flush_status_update_extra *fste = (struct flush_status_update_extra *) extra;
    update_flush_status(child, fste->cascades);
    // If `toku_ft_flush_some_child` decides to recurse after this, we'll need
    // cascades to increase. If not, it doesn't matter.
    fste->cascades++;
}

static void
flt_flusher_advice_init(struct flusher_advice *fa, struct flush_status_update_extra *fste, uint32_t nodesize)
{
    fste->cascades = 0;
    fste->nodesize = nodesize;
    flusher_advice_init(fa,
                        pick_heaviest_child,
                        dont_destroy_basement_nodes,
                        recurse_if_child_is_gorged,
                        default_merge_child,
                        flt_update_status,
                        default_pick_child_after_split,
                        fste);
}

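// Extra state for the cleaner-thread leaf-merge ("ctm") flusher advice:
// target_key remembers the pivot bounding the leaf we want to merge, and
// is_last_child records whether that leaf was its parent's last child.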
struct ctm_extra {
    bool is_last_child;
    DBT target_key;
};

static int
ctm_pick_child(FT ft,
               FTNODE parent,
               void* extra)
{
    struct ctm_extra* ctme = (struct ctm_extra *) extra;
    int childnum;
    if (parent->height == 1 && ctme->is_last_child) {
        childnum = parent->n_children - 1;
    } else {
        childnum = toku_ftnode_which_child(parent, &ctme->target_key, ft->cmp);
    }
    return childnum;
}

static void
ctm_update_status(
    FTNODE UU(child),
    int dirtied,
    void* UU(extra)
    )
{
    FL_STATUS_VAL(FT_FLUSHER_CLEANER_NUM_DIRTIED_FOR_LEAF_MERGE) += dirtied;
}

static void
ctm_maybe_merge_child(struct flusher_advice *fa,
                      FT ft,
                      FTNODE parent,
                      int childnum,
                      FTNODE child,
                      void *extra)
{
    if (child->height == 0) {
        (void) toku_sync_fetch_and_add(&FL_STATUS_VAL(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_COMPLETED), 1);
    }
    default_merge_child(fa, ft, parent, childnum, child, extra);
}

static void
ct_maybe_merge_child(struct flusher_advice *fa,
                     FT ft,
                     FTNODE parent,
                     int childnum,
                     FTNODE child,
                     void* extra)
{
    if (child->height > 0) {
        default_merge_child(fa, ft, parent, childnum, child, extra);
    }
    else {
        struct ctm_extra ctme;
        paranoid_invariant(parent->n_children > 1);
        int pivot_to_save;
        //
        // we have two cases: one where childnum is the last child, in
        // which case we save the pivot to its left (that pivot alone
        // does not identify the child we wish to descend to), and
        // another where it is not the last child, in which case the
        // saved pivot is sufficient for identifying the leaf to be
        // merged
        //
        if (childnum == (parent->n_children - 1)) {
            ctme.is_last_child = true;
            pivot_to_save = childnum - 1;
        }
        else {
            ctme.is_last_child = false;
            pivot_to_save = childnum;
        }
        toku_clone_dbt(&ctme.target_key, parent->pivotkeys.get_pivot(pivot_to_save));

        // at this point, ctme is properly setup, now we can do the merge
        struct flusher_advice new_fa;
        flusher_advice_init(
            &new_fa,
            ctm_pick_child,
            dont_destroy_basement_nodes,
            always_recursively_flush,
            ctm_maybe_merge_child,
            ctm_update_status,
            default_pick_child_after_split,
            &ctme);

        toku_unpin_ftnode(ft, parent);
        toku_unpin_ftnode(ft, child);

        FTNODE root_node = NULL;
        {
            uint32_t fullhash;
            CACHEKEY root;
            toku_calculate_root_offset_pointer(ft, &root, &fullhash);
            ftnode_fetch_extra bfe;
            bfe.create_for_full_read(ft);
            toku_pin_ftnode(ft, root, fullhash, &bfe, PL_WRITE_EXPENSIVE, &root_node, true);
            toku_ftnode_assert_fully_in_memory(root_node);
        }

        (void) toku_sync_fetch_and_add(&FL_STATUS_VAL(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_STARTED), 1);
        (void) toku_sync_fetch_and_add(&FL_STATUS_VAL(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_RUNNING), 1);

        toku_ft_flush_some_child(ft, root_node, &new_fa);

        (void) toku_sync_fetch_and_sub(&FL_STATUS_VAL(FT_FLUSHER_CLEANER_NUM_LEAF_MERGES_RUNNING), 1);

        toku_destroy_dbt(&ctme.target_key);
    }
}

static void
ct_update_status(FTNODE child,
                 int dirtied,
                 void* extra)
{
    struct flush_status_update_extra* fste = (struct flush_status_update_extra *) extra;
    update_flush_status(child, fste->cascades);
    FL_STATUS_VAL(FT_FLUSHER_CLEANER_NODES_DIRTIED) += dirtied;
    // Incrementing this in case `toku_ft_flush_some_child` decides to recurse.
    fste->cascades++;
}

static void
ct_flusher_advice_init(struct flusher_advice *fa, struct flush_status_update_extra* fste, uint32_t nodesize)
{
    fste->cascades = 0;
    fste->nodesize = nodesize;
    flusher_advice_init(fa,
                        pick_heaviest_child,
                        do_destroy_basement_nodes,
                        recurse_if_child_is_gorged,
                        ct_maybe_merge_child,
                        ct_update_status,
                        default_pick_child_after_split,
                        fste);
}

//
// This returns true if the node MAY be reactive,
// false if we are absolutely sure that it is NOT reactive.
// The reason for inaccuracy is that the node may be
// a leaf node that is not entirely in memory. If so, then
// we cannot be sure if the node is reactive.
//
static bool ft_ftnode_may_be_reactive(FT ft, FTNODE node)
{
    if (node->height == 0) {
        return true;
    } else {
        return toku_ftnode_get_nonleaf_reactivity(node, ft->h->fanout) != RE_STABLE;
    }
}

/* NODE is a node with a child.
 * childnum was split into two nodes childa, and childb.  childa is the same as the original child.  childb is a new child.
 * We must slide things around, and move things from the old table to the new tables.
 * Requires: the CHILDNUMth buffer of node is empty.
 * We don't push anything down to children.  We split the node, and things land wherever they land.
 * We must delete the old buffer (but the old child is already deleted.)
 * On return, the new children and node STAY PINNED.
 */
static void
handle_split_of_child(
    FT ft,
    FTNODE node,
    int childnum,
    FTNODE childa,
    FTNODE childb,
    DBT *splitk /* the data in splitk is alloc'd and is consumed by this call. */
    )
{
    paranoid_invariant(node->height>0);
    paranoid_invariant(0 <= childnum);
    paranoid_invariant(childnum < node->n_children);
    toku_ftnode_assert_fully_in_memory(node);
    toku_ftnode_assert_fully_in_memory(childa);
    toku_ftnode_assert_fully_in_memory(childb);
    NONLEAF_CHILDINFO old_bnc = BNC(node, childnum);
    paranoid_invariant(toku_bnc_nbytesinbuf(old_bnc)==0);
    WHEN_NOT_GCOV(
        if (toku_ft_debug_mode) {
            printf("%s:%d Child %d splitting on %s\n", __FILE__, __LINE__, childnum, (char*)splitk->data);
            printf("%s:%d oldsplitkeys:", __FILE__, __LINE__);
            for(int i = 0; i < node->n_children - 1; i++) printf(" %s", (char *) node->pivotkeys.get_pivot(i).data);
            printf("\n");
        }
    )

    node->set_dirty();

    XREALLOC_N(node->n_children+1, node->bp);
    // Slide the children over.
    // suppose n_children is 10 and childnum is 5, meaning node->childnum[5] just got split
    // this moves node->bp[6] through node->bp[9] over to
    // node->bp[7] through node->bp[10]
    for (int cnum=node->n_children; cnum>childnum+1; cnum--) {
        node->bp[cnum] = node->bp[cnum-1];
    }
    memset(&node->bp[childnum+1],0,sizeof(node->bp[0]));
    node->n_children++;

    paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->blocknum.b); // use the same child

    // We never set the rightmost blocknum to be the root.
    // Instead, we wait for the root to split and let promotion initialize the rightmost
    // blocknum to be the first non-root leaf node on the right extreme to receive an insert.
    BLOCKNUM rightmost_blocknum = toku_unsafe_fetch(&ft->rightmost_blocknum);
    invariant(ft->h->root_blocknum.b != rightmost_blocknum.b);
    if (childa->blocknum.b == rightmost_blocknum.b) {
        // The rightmost leaf (a) split into (a) and (b). We want (b) to swap pair values
        // with (a), now that it is the new rightmost leaf. This keeps the rightmost blocknum
        // constant, the same the way we keep the root blocknum constant.
        toku_ftnode_swap_pair_values(childa, childb);
        BP_BLOCKNUM(node, childnum) = childa->blocknum;
    }

    BP_BLOCKNUM(node, childnum+1) = childb->blocknum;
    BP_WORKDONE(node, childnum+1) = 0;
    BP_STATE(node,childnum+1) = PT_AVAIL;

    NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
    for (unsigned int i = 0; i < (sizeof new_bnc->flow) / (sizeof new_bnc->flow[0]); ++i) {
        // just split the flows in half for now, can't guess much better
        // at the moment
        new_bnc->flow[i] = old_bnc->flow[i] / 2;
        old_bnc->flow[i] = (old_bnc->flow[i] + 1) / 2;
    }
    set_BNC(node, childnum+1, new_bnc);

    // Insert the new split key, sliding the other keys over
    node->pivotkeys.insert_at(splitk, childnum);

    WHEN_NOT_GCOV(
        if (toku_ft_debug_mode) {
            printf("%s:%d splitkeys:", __FILE__, __LINE__);
            for (int i = 0; i < node->n_children - 2; i++) printf(" %s", (char *) node->pivotkeys.get_pivot(i).data);
            printf("\n");
        }
    )

    /* Keep pushing to the children, but not if the children would require a pushdown */
    toku_ftnode_assert_fully_in_memory(node);
    toku_ftnode_assert_fully_in_memory(childa);
    toku_ftnode_assert_fully_in_memory(childb);

    VERIFY_NODE(t, node);
    VERIFY_NODE(t, childa);
    VERIFY_NODE(t, childb);
}

static void
verify_all_in_mempool(FTNODE UU() node)
{
#ifdef TOKU_DEBUG_PARANOID
    if (node->height==0) {
        for (int i = 0; i < node->n_children; i++) {
            invariant(BP_STATE(node,i) == PT_AVAIL);
            BLB_DATA(node, i)->verify_mempool();
        }
    }
#endif
}

static uint64_t
ftleaf_disk_size(FTNODE node)
// Effect: get the total disk size of all leafentries in the leaf node
{
    paranoid_invariant(node->height == 0);
    toku_ftnode_assert_fully_in_memory(node);
    uint64_t retval = 0;
    for (int i = 0; i < node->n_children; i++) {
        retval += BLB_DATA(node, i)->get_disk_size();
    }
    return retval;
}

static void
ftleaf_get_split_loc(
    FTNODE node,
    enum split_mode split_mode,
    int *num_left_bns,   // which basement within leaf
    int *num_left_les    // which key within basement
    )
// Effect: Find the location within a leaf node where we want to perform a split
// num_left_bns is how many basement nodes (which OMT) should be split to the left.
// num_left_les is how many leafentries in OMT of the last bn should be on the left side of the split.
{
    switch (split_mode) {
    case SPLIT_LEFT_HEAVY: {
        *num_left_bns = node->n_children;
        *num_left_les = BLB_DATA(node, *num_left_bns - 1)->num_klpairs();
        if (*num_left_les == 0) {
            *num_left_bns = node->n_children - 1;
            *num_left_les = BLB_DATA(node, *num_left_bns - 1)->num_klpairs();
        }
        goto exit;
    }
    case SPLIT_RIGHT_HEAVY: {
        *num_left_bns = 1;
        *num_left_les = BLB_DATA(node, 0)->num_klpairs() ? 1 : 0;
        goto exit;
    }
    case SPLIT_EVENLY: {
        paranoid_invariant(node->height == 0);
        // TODO: (Zardosht) see if we can/should make this faster, we iterate over the rows twice
        uint64_t sumlesizes = ftleaf_disk_size(node);
        uint32_t size_so_far = 0;
        for (int i = 0; i < node->n_children; i++) {
            bn_data* bd = BLB_DATA(node, i);
            uint32_t n_leafentries = bd->num_klpairs();
            for (uint32_t j=0; j < n_leafentries; j++) {
                size_t size_this_le;
                int rr = bd->fetch_klpair_disksize(j, &size_this_le);
                invariant_zero(rr);
                size_so_far += size_this_le;
                if (size_so_far >= sumlesizes/2) {
                    *num_left_bns = i + 1;
                    *num_left_les = j + 1;
                    if (*num_left_bns == node->n_children &&
                        (unsigned int) *num_left_les == n_leafentries) {
                        // need to correct for when we're splitting after the
                        // last element, that makes no sense
                        if (*num_left_les > 1) {
                            (*num_left_les)--;
                        } else if (*num_left_bns > 1) {
                            (*num_left_bns)--;
                            *num_left_les = BLB_DATA(node, *num_left_bns - 1)->num_klpairs();
                        } else {
                            // we are trying to split a leaf with only one
                            // leafentry in it
                            abort();
                        }
                    }
                    goto exit;
                }
            }
        }
    }
    }
    abort();
exit:
    return;
}

static void
move_leafentries(
    BASEMENTNODE dest_bn,
    BASEMENTNODE src_bn,
    uint32_t lbi,   // lower bound inclusive
    uint32_t ube    // upper bound exclusive
    )
// Effect: move leafentries in the range [lbi, ube) from src_bn's buffer to dest_bn's newly created buffer
{
    invariant(ube == src_bn->data_buffer.num_klpairs());
    src_bn->data_buffer.split_klpairs(&dest_bn->data_buffer, lbi);
}

static void ftnode_finalize_split(FTNODE node, FTNODE B, MSN max_msn_applied_to_node) {
    // Effect: Finalizes a split by updating some bits and dirtying both nodes
    toku_ftnode_assert_fully_in_memory(node);
    toku_ftnode_assert_fully_in_memory(B);
    verify_all_in_mempool(node);
    verify_all_in_mempool(B);

    node->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;
    B->max_msn_applied_to_node_on_disk = max_msn_applied_to_node;

    // The new node in the split inherits the oldest known reference xid
    B->oldest_referenced_xid_known = node->oldest_referenced_xid_known;

    node->set_dirty();
    B->set_dirty();
}

void
ftleaf_split(
    FT ft,
    FTNODE node,
    FTNODE *nodea,
    FTNODE *nodeb,
    DBT *splitk,
    bool create_new_node,
    enum split_mode split_mode,
    uint32_t num_dependent_nodes,
    FTNODE* dependent_nodes)
// Effect: Split a leaf node.
// Argument "node" is node to be split.
// Upon return:
//   nodea and nodeb point to new nodes that result from split of "node"
//   nodea is the left node that results from the split
//   splitk is the right-most key of nodea
{
    paranoid_invariant(node->height == 0);
    FL_STATUS_VAL(FT_FLUSHER_SPLIT_LEAF)++;
    if (node->n_children) {
        // First move all the accumulated stat64info deltas into the first basement.
        // After the split, either both nodes or neither node will be included in the next checkpoint.
        // The accumulated stats in the dictionary will be correct in either case.
        // By moving all the deltas into one (arbitrary) basement, we avoid the need to maintain
        // correct information for a basement that is divided between two leafnodes (i.e. when split is
        // not on a basement boundary).
        STAT64INFO_S delta_for_leafnode = toku_get_and_clear_basement_stats(node);
        BASEMENTNODE bn = BLB(node,0);
        bn->stat64_delta = delta_for_leafnode;
    }

    FTNODE B = nullptr;
    uint32_t fullhash;
    BLOCKNUM name;

    if (create_new_node) {
        // put value in cachetable and do checkpointing
        // of dependent nodes
        //
        // We do this here, before evaluating the last_bn_on_left
        // and last_le_on_left_within_bn because this operation
        // may write to disk the dependent nodes.
        // While doing so, we may rebalance the leaf node
        // we are splitting, thereby invalidating the
        // values of last_bn_on_left and last_le_on_left_within_bn.
        // So, we must call this before evaluating
        // those two values
        cachetable_put_empty_node_with_dep_nodes(
            ft,
            num_dependent_nodes,
            dependent_nodes,
            &name,
            &fullhash,
            &B
            );
        // GCC 4.8 seems to get confused and think B is maybe uninitialized at link time.
        // TODO(leif): figure out why it thinks this and actually fix it.
        invariant_notnull(B);
    }

    paranoid_invariant(node->height==0);
    toku_ftnode_assert_fully_in_memory(node);
    verify_all_in_mempool(node);
    MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;

    // variables that say where we will do the split.
    // After the split, there will be num_left_bns basement nodes in the left node,
    // and the last basement node in the left node will have num_left_les leafentries.
    int num_left_bns;
    int num_left_les;
    ftleaf_get_split_loc(node, split_mode, &num_left_bns, &num_left_les);
    {
        // did we split right on the boundary between basement nodes?
        const bool split_on_boundary = (num_left_les == 0) || (num_left_les == (int) BLB_DATA(node, num_left_bns - 1)->num_klpairs());
        // Now we know where we are going to break it
        // the two nodes will have a total of n_children+1 basement nodes
        // and n_children-1 pivots
        // the left node, node, will have last_bn_on_left+1 basement nodes
        // the right node, B, will have n_children-last_bn_on_left basement nodes
        // the pivots of node will be the first last_bn_on_left pivots that originally exist
        // the pivots of B will be the last (n_children - 1 - last_bn_on_left) pivots that originally exist

        // Note: The basements will not be rebalanced.  Only the mempool of the basement that is split
        //       (if split_on_boundary is false) will be affected.  All other mempools will remain intact. ???

        //set up the basement nodes in the new node
        int num_children_in_node = num_left_bns;
        // In the SPLIT_RIGHT_HEAVY case, we need to add 1 back because
        // while it's not on the boundary, we do need node->n_children
        // children in B.
        int num_children_in_b = node->n_children - num_left_bns + (!split_on_boundary ? 1 : 0);
        if (num_children_in_b == 0) {
            // for uneven split, make sure we have at least 1 bn
            paranoid_invariant(split_mode == SPLIT_LEFT_HEAVY);
            num_children_in_b = 1;
        }
        paranoid_invariant(num_children_in_node > 0);
        if (create_new_node) {
            toku_initialize_empty_ftnode(
                B,
                name,
                0,
                num_children_in_b,
                ft->h->layout_version,
                ft->h->flags);
            B->fullhash = fullhash;
        }
        else {
            B = *nodeb;
            REALLOC_N(num_children_in_b, B->bp);
            B->n_children = num_children_in_b;
            for (int i = 0; i < num_children_in_b; i++) {
                BP_BLOCKNUM(B,i).b = 0;
                BP_STATE(B,i) = PT_AVAIL;
                BP_WORKDONE(B,i) = 0;
                set_BLB(B, i, toku_create_empty_bn());
            }
        }

        // now move all the data

        int curr_src_bn_index = num_left_bns - 1;
        int curr_dest_bn_index = 0;

        // handle the move of a subset of data in last_bn_on_left from node to B
        if (!split_on_boundary) {
            BP_STATE(B,curr_dest_bn_index) = PT_AVAIL;
            destroy_basement_node(BLB(B, curr_dest_bn_index)); // Destroy B's empty OMT, so I can rebuild it from an array
            set_BNULL(B, curr_dest_bn_index);
            set_BLB(B, curr_dest_bn_index, toku_create_empty_bn_no_buffer());
            move_leafentries(BLB(B, curr_dest_bn_index),
                             BLB(node, curr_src_bn_index),
                             num_left_les,                                      // first row to be moved to B
                             BLB_DATA(node, curr_src_bn_index)->num_klpairs()   // number of rows in basement to be split
                             );
            BLB_MAX_MSN_APPLIED(B, curr_dest_bn_index) = BLB_MAX_MSN_APPLIED(node, curr_src_bn_index);
            curr_dest_bn_index++;
        }
        curr_src_bn_index++;

        paranoid_invariant(B->n_children >= curr_dest_bn_index);
        paranoid_invariant(node->n_children >= curr_src_bn_index);

        // move the rest of the basement nodes
        for ( ; curr_src_bn_index < node->n_children; curr_src_bn_index++, curr_dest_bn_index++) {
            destroy_basement_node(BLB(B, curr_dest_bn_index));
            set_BNULL(B, curr_dest_bn_index);
            B->bp[curr_dest_bn_index] = node->bp[curr_src_bn_index];
        }
        if (curr_dest_bn_index < B->n_children) {
            // B already has an empty basement node here.
            BP_STATE(B, curr_dest_bn_index) = PT_AVAIL;
        }

        //
        // now handle the pivots
        //

        // the child index in the original node that corresponds to the
        // first node in the right node of the split
        int split_idx = num_left_bns - (split_on_boundary ? 0 : 1);
        node->pivotkeys.split_at(split_idx, &B->pivotkeys);
        if (split_on_boundary && num_left_bns < node->n_children && splitk) {
            toku_copyref_dbt(splitk, node->pivotkeys.get_pivot(num_left_bns - 1));
        } else if (splitk) {
            bn_data* bd = BLB_DATA(node, num_left_bns - 1);
            uint32_t keylen;
            void *key;
            int rr = bd->fetch_key_and_len(bd->num_klpairs() - 1, &keylen, &key);
            invariant_zero(rr);
            toku_memdup_dbt(splitk, key, keylen);
        }

        node->n_children = num_children_in_node;
        REALLOC_N(num_children_in_node, node->bp);
    }

    ftnode_finalize_split(node, B, max_msn_applied_to_node);
    *nodea = node;
    *nodeb = B;
}   // end of ftleaf_split()

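// Effect: Split a nonleaf node down the middle: the first n_children/2
// children stay in node, the rest move to the newly created node B, and
// the pivot between the two halves is returned in splitk.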
void
ft_nonleaf_split(
    FT ft,
    FTNODE node,
    FTNODE *nodea,
    FTNODE *nodeb,
    DBT *splitk,
    uint32_t num_dependent_nodes,
    FTNODE* dependent_nodes)
{
    //VERIFY_NODE(t,node);
    FL_STATUS_VAL(FT_FLUSHER_SPLIT_NONLEAF)++;
    toku_ftnode_assert_fully_in_memory(node);
    int old_n_children = node->n_children;
    int n_children_in_a = old_n_children/2;
    int n_children_in_b = old_n_children-n_children_in_a;
    MSN max_msn_applied_to_node = node->max_msn_applied_to_node_on_disk;
    FTNODE B;
    paranoid_invariant(node->height>0);
    paranoid_invariant(node->n_children>=2); // Otherwise, how do we split?  We need at least two children to split.
    create_new_ftnode_with_dep_nodes(ft, &B, node->height, n_children_in_b, num_dependent_nodes, dependent_nodes);
    {
        /* The first n_children_in_a go into node a.
         * That means that the first n_children_in_a-1 keys go into node a.
         * The splitter key is key number n_children_in_a */
        for (int i = n_children_in_a; i<old_n_children; i++) {
            int targchild = i-n_children_in_a;
            // TODO: Figure out better way to handle this
            // the problem is that create_new_ftnode_with_dep_nodes for B creates
            // all the data structures, whereas we really don't want it to fill
            // in anything for the bp's.
            // Now we have to go free what it just created so we can
            // slide the bp over
            destroy_nonleaf_childinfo(BNC(B, targchild));
            // now move the bp over
            B->bp[targchild] = node->bp[i];
            memset(&node->bp[i], 0, sizeof(node->bp[0]));
        }

        // the split key for our parent is the rightmost pivot key in node
        node->pivotkeys.split_at(n_children_in_a, &B->pivotkeys);
        toku_clone_dbt(splitk, node->pivotkeys.get_pivot(n_children_in_a - 1));
        node->pivotkeys.delete_at(n_children_in_a - 1);

        node->n_children = n_children_in_a;
        REALLOC_N(node->n_children, node->bp);
    }

    ftnode_finalize_split(node, B, max_msn_applied_to_node);
    *nodea = node;
    *nodeb = B;
}

//
// responsibility of ft_split_child is to take locked FTNODEs node and child
// and do the following:
//  - split child,
//  - fix node,
//  - release lock on node
//  - possibly flush either new children created from split, otherwise unlock children
//
static void
ft_split_child(
    FT ft,
    FTNODE node,
    int childnum,
    FTNODE child,
    enum split_mode split_mode,
    struct flusher_advice *fa)
{
    paranoid_invariant(node->height>0);
    paranoid_invariant(toku_bnc_nbytesinbuf(BNC(node, childnum))==0); // require that the buffer for this child is empty
    FTNODE nodea, nodeb;
    DBT splitk;

    // for test
    call_flusher_thread_callback(flt_flush_before_split);

    FTNODE dep_nodes[2];
    dep_nodes[0] = node;
    dep_nodes[1] = child;
    if (child->height==0) {
        ftleaf_split(ft, child, &nodea, &nodeb, &splitk, true, split_mode, 2, dep_nodes);
    } else {
        ft_nonleaf_split(ft, child, &nodea, &nodeb, &splitk, 2, dep_nodes);
    }
    // printf("%s:%d child did split\n", __FILE__, __LINE__);
    handle_split_of_child (ft, node, childnum, nodea, nodeb, &splitk);

    // for test
    call_flusher_thread_callback(flt_flush_during_split);

    // at this point, the split is complete
    // now we need to unlock node,
    // and possibly continue
    // flushing one of the children
    int picked_child = fa->pick_child_after_split(ft, node, childnum, childnum + 1, fa->extra);
    toku_unpin_ftnode(ft, node);
    if (picked_child == childnum ||
        (picked_child < 0 && nodea->height > 0 && fa->should_recursively_flush(nodea, fa->extra))) {
        toku_unpin_ftnode(ft, nodeb);
        toku_ft_flush_some_child(ft, nodea, fa);
    }
    else if (picked_child == childnum + 1 ||
             (picked_child < 0 && nodeb->height > 0 && fa->should_recursively_flush(nodeb, fa->extra))) {
        toku_unpin_ftnode(ft, nodea);
        toku_ft_flush_some_child(ft, nodeb, fa);
    }
    else {
        toku_unpin_ftnode(ft, nodea);
        toku_unpin_ftnode(ft, nodeb);
    }

    toku_destroy_dbt(&splitk);
}

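// Effect: If any partition of node is compressed or still on disk, fetch
// and decompress it, so that the node ends up fully in memory.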
static void bring_node_fully_into_memory(FTNODE node, FT ft) {
    if (!toku_ftnode_fully_in_memory(node)) {
        ftnode_fetch_extra bfe;
        bfe.create_for_full_read(ft);
        toku_cachetable_pf_pinned_pair(
            node,
            toku_ftnode_pf_callback,
            &bfe,
            ft->cf,
            node->blocknum,
            toku_cachetable_hash(ft->cf, node->blocknum)
            );
    }
}

static void
flush_this_child(
    FT ft,
    FTNODE node,
    FTNODE child,
    int childnum,
    struct flusher_advice *fa)
// Effect: Push everything in the CHILDNUMth buffer of node down into the child.
{
    update_flush_status(child, 0);
    toku_ftnode_assert_fully_in_memory(node);
    if (fa->should_destroy_basement_nodes(fa)) {
        maybe_destroy_child_blbs(node, child, ft);
    }
    bring_node_fully_into_memory(child, ft);
    toku_ftnode_assert_fully_in_memory(child);
    paranoid_invariant(node->height>0);
    paranoid_invariant(child->blocknum.b!=0);
    // VERIFY_NODE does not work off client thread as of now
    //VERIFY_NODE(t, child);
    node->set_dirty();
    child->set_dirty();

    BP_WORKDONE(node, childnum) = 0;  // this buffer is drained, no work has been done by its contents
    NONLEAF_CHILDINFO bnc = BNC(node, childnum);
    set_BNC(node, childnum, toku_create_empty_nl());

    // now we have a bnc to flush to the child. pass down the parent's
    // oldest known referenced xid as we flush down to the child.
    toku_bnc_flush_to_child(ft, bnc, child, node->oldest_referenced_xid_known);
    destroy_nonleaf_childinfo(bnc);
}

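// Effect: Move every basement node and pivot from b into a, leaving b empty.
// If a's last basement node is empty, it is dropped first, since it has no
// elements and thus no pivot to separate it from b's first basement node.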
static void
merge_leaf_nodes(FTNODE a, FTNODE b)
{
    FL_STATUS_VAL(FT_FLUSHER_MERGE_LEAF)++;
    toku_ftnode_assert_fully_in_memory(a);
    toku_ftnode_assert_fully_in_memory(b);
    paranoid_invariant(a->height == 0);
    paranoid_invariant(b->height == 0);
    paranoid_invariant(a->n_children > 0);
    paranoid_invariant(b->n_children > 0);

    // Mark nodes as dirty before moving basements from b to a.
    // This way, whatever deltas are accumulated in the basements are
    // applied to the in_memory_stats in the header if they have not already
    // been (if nodes are clean).
    // TODO(leif): this is no longer the way in_memory_stats is
    // maintained. verify that it's ok to move this just before the unpin
    // and then do that.
    a->set_dirty();
    b->set_dirty();

    bn_data* a_last_bd = BLB_DATA(a, a->n_children-1);
    // this bool states if the last basement node in a has any items or not
    // If it does, then it stays in the merge. If it does not, the last basement node
    // of a gets eliminated because we do not have a pivot to store for it (because it has no elements)
    const bool a_has_tail = a_last_bd->num_klpairs() > 0;

    int num_children = a->n_children + b->n_children;
    if (!a_has_tail) {
        int lastchild = a->n_children - 1;
        BASEMENTNODE bn = BLB(a, lastchild);

        // verify that last basement in a is empty, then destroy mempool
        size_t used_space = a_last_bd->get_disk_size();
        invariant_zero(used_space);
        destroy_basement_node(bn);
        set_BNULL(a, lastchild);
        num_children--;
        if (lastchild < a->pivotkeys.num_pivots()) {
            a->pivotkeys.delete_at(lastchild);
        }
    } else {
        // fill in pivot for what used to be max of node 'a', if it is needed
        uint32_t keylen;
        void *key;
        int r = a_last_bd->fetch_key_and_len(a_last_bd->num_klpairs() - 1, &keylen, &key);
        invariant_zero(r);
        DBT pivotkey;
        toku_fill_dbt(&pivotkey, key, keylen);
        a->pivotkeys.replace_at(&pivotkey, a->n_children - 1);
    }

    // realloc basement nodes in `a'
    REALLOC_N(num_children, a->bp);

    // move each basement node from b to a
    uint32_t offset = a_has_tail ? a->n_children : a->n_children - 1;
    for (int i = 0; i < b->n_children; i++) {
        a->bp[i + offset] = b->bp[i];
        memset(&b->bp[i], 0, sizeof(b->bp[0]));
    }

    // append b's pivots to a's pivots
    a->pivotkeys.append(b->pivotkeys);

    // now that all the data has been moved from b to a, we can destroy the data in b
    a->n_children = num_children;
    b->pivotkeys.destroy();
    b->n_children = 0;
}

static void balance_leaf_nodes(
    FTNODE a,
    FTNODE b,
    DBT *splitk)
// Effect:
//  If b is bigger, move stuff from b to a until b is the smaller.
//  If a is bigger, move stuff from a to b until a is the smaller.
{
    FL_STATUS_VAL(FT_FLUSHER_BALANCE_LEAF)++;
    // first merge all the data into a
    merge_leaf_nodes(a,b);
    // now split them
    // because we are not creating a new node, we can pass in no dependent nodes
    ftleaf_split(NULL, a, &a, &b, splitk, false, SPLIT_EVENLY, 0, NULL);
}

static void
maybe_merge_pinned_leaf_nodes(
    FTNODE a,
    FTNODE b,
    const DBT *parent_splitk,
    bool *did_merge,
    bool *did_rebalance,
    DBT *splitk,
    uint32_t nodesize
    )
// Effect: Either merge a and b into one node (merge them into a) and set *did_merge = true.
//         (We do this if the resulting node is not fissible)
//         or distribute the leafentries evenly between a and b, and set *did_rebalance = true.
//         (If a and b are already evenly distributed, we may do nothing.)
{
    unsigned int sizea = toku_serialize_ftnode_size(a);
    unsigned int sizeb = toku_serialize_ftnode_size(b);
    uint32_t num_leafentries = toku_ftnode_leaf_num_entries(a) + toku_ftnode_leaf_num_entries(b);
    if (num_leafentries > 1 && (sizea + sizeb)*4 > (nodesize*3)) {
        // the combined size is more than 3/4 of a node, so don't merge them.
        *did_merge = false;
        if (sizea*4 > nodesize && sizeb*4 > nodesize) {
            // no need to do anything if both are more than 1/4 of a node.
            *did_rebalance = false;
            toku_clone_dbt(splitk, *parent_splitk);
            return;
        }
        // one is less than 1/4 of a node, and together they are more than 3/4 of a node.
        *did_rebalance = true;
        balance_leaf_nodes(a, b, splitk);
    } else {
        // we are merging them.
        *did_merge = true;
        *did_rebalance = false;
        toku_init_dbt(splitk);
        merge_leaf_nodes(a, b);
    }
}

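// Effect: Merge b into a: b's children and pivots are appended to a, with
// parent_splitk inserted as the pivot between the old a and the old b.
// Nonleaf merges never rebalance, so *did_merge is always set to true.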
static void
maybe_merge_pinned_nonleaf_nodes(
    const DBT *parent_splitk,
    FTNODE a,
    FTNODE b,
    bool *did_merge,
    bool *did_rebalance,
    DBT *splitk)
{
    toku_ftnode_assert_fully_in_memory(a);
    toku_ftnode_assert_fully_in_memory(b);
    invariant_notnull(parent_splitk->data);

    int old_n_children = a->n_children;
    int new_n_children = old_n_children + b->n_children;

    XREALLOC_N(new_n_children, a->bp);
    memcpy(a->bp + old_n_children, b->bp, b->n_children * sizeof(b->bp[0]));
    memset(b->bp, 0, b->n_children * sizeof(b->bp[0]));

    a->pivotkeys.insert_at(parent_splitk, old_n_children - 1);
    a->pivotkeys.append(b->pivotkeys);
    a->n_children = new_n_children;
    b->n_children = 0;

    a->set_dirty();
    b->set_dirty();

    *did_merge = true;
    *did_rebalance = false;
    toku_init_dbt(splitk);

    FL_STATUS_VAL(FT_FLUSHER_MERGE_NONLEAF)++;
}

static void
maybe_merge_pinned_nodes(
    FTNODE parent,
    const DBT *parent_splitk,
    FTNODE a,
    FTNODE b,
    bool *did_merge,
    bool *did_rebalance,
    DBT *splitk,
    uint32_t nodesize
    )
// Effect: either merge a and b into one node (merge them into a) and set *did_merge = true.
//         (We do this if the resulting node is not fissible)
//         or distribute a and b evenly and set *did_merge = false and *did_rebalance = true
//         (If a and b are already evenly distributed, we may do nothing.)
//  If we distribute:
//    For leaf nodes, we distribute the leafentries evenly.
//    For nonleaf nodes, we distribute the children evenly.  That may leave one or both of the nodes overfull, but that's OK.
//  If we distribute, we set *splitk to a malloced pivot key.
// Parameters:
//  t                   The FT.
//  parent              The parent of the two nodes to be split.
//  parent_splitk       The pivot key between a and b.  This is either free()'d or returned in *splitk.
//  a                   The first node to merge.
//  b                   The second node to merge.
//  logger              The logger.
//  did_merge           (OUT):  Did the two nodes actually get merged?
//  splitk              (OUT):  If the two nodes did not get merged, the new pivot key between the two nodes.
{
    MSN msn_max;
    paranoid_invariant(a->height == b->height);
    toku_ftnode_assert_fully_in_memory(parent);
    toku_ftnode_assert_fully_in_memory(a);
    toku_ftnode_assert_fully_in_memory(b);
    parent->set_dirty();   // just to make sure
    {
        MSN msna = a->max_msn_applied_to_node_on_disk;
        MSN msnb = b->max_msn_applied_to_node_on_disk;
        msn_max = (msna.msn > msnb.msn) ? msna : msnb;
    }
    if (a->height == 0) {
        maybe_merge_pinned_leaf_nodes(a, b, parent_splitk, did_merge, did_rebalance, splitk, nodesize);
    } else {
        maybe_merge_pinned_nonleaf_nodes(parent_splitk, a, b, did_merge, did_rebalance, splitk);
    }
    if (*did_merge || *did_rebalance) {
        // accurate for leaf nodes because all msgs above have been
        // applied, accurate for non-leaf nodes because buffer immediately
        // above each node has been flushed
        a->max_msn_applied_to_node_on_disk = msn_max;
        b->max_msn_applied_to_node_on_disk = msn_max;
    }
}

static void merge_remove_key_callback(BLOCKNUM *bp, bool for_checkpoint, void *extra) {
    FT ft = (FT) extra;
    ft->blocktable.free_blocknum(bp, ft, for_checkpoint);
}

//
// Takes as input a locked node and a childnum_to_merge
// As output, two of node's children are merged or rebalanced, and node is unlocked
//
static void
ft_merge_child(
    FT ft,
    FTNODE node,
    int childnum_to_merge,
    bool *did_react,
    struct flusher_advice *fa)
{
    // this function should not be called
    // if the child is not mergable
    paranoid_invariant(node->n_children > 1);
    toku_ftnode_assert_fully_in_memory(node);

    int childnuma,childnumb;
    if (childnum_to_merge > 0) {
        childnuma = childnum_to_merge-1;
        childnumb = childnum_to_merge;
    } else {
        childnuma = childnum_to_merge;
        childnumb = childnum_to_merge+1;
    }
    paranoid_invariant(0 <= childnuma);
    paranoid_invariant(childnuma+1 == childnumb);
    paranoid_invariant(childnumb < node->n_children);

    paranoid_invariant(node->height>0);

    // We suspect that at least one of the children is fusible, but they might not be.
    // for test
    call_flusher_thread_callback(flt_flush_before_merge);

    FTNODE childa, childb;
    {
        uint32_t childfullhash = compute_child_fullhash(ft->cf, node, childnuma);
        ftnode_fetch_extra bfe;
        bfe.create_for_full_read(ft);
        toku_pin_ftnode_with_dep_nodes(ft, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, PL_WRITE_EXPENSIVE, 1, &node, &childa, true);
    }
    // for test
    call_flusher_thread_callback(flt_flush_before_pin_second_node_for_merge);
    {
        FTNODE dep_nodes[2];
        dep_nodes[0] = node;
        dep_nodes[1] = childa;
        uint32_t childfullhash = compute_child_fullhash(ft->cf, node, childnumb);
        ftnode_fetch_extra bfe;
        bfe.create_for_full_read(ft);
        toku_pin_ftnode_with_dep_nodes(ft, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, PL_WRITE_EXPENSIVE, 2, dep_nodes, &childb, true);
    }

    if (toku_bnc_n_entries(BNC(node,childnuma))>0) {
        flush_this_child(ft, node, childa, childnuma, fa);
    }
    if (toku_bnc_n_entries(BNC(node,childnumb))>0) {
        flush_this_child(ft, node, childb, childnumb, fa);
    }

    // now we have both children pinned in main memory, and cachetable locked,
    // so no checkpoints will occur.

    bool did_merge, did_rebalance;
    {
        DBT splitk;
        toku_init_dbt(&splitk);
        const DBT old_split_key = node->pivotkeys.get_pivot(childnuma);
        maybe_merge_pinned_nodes(node, &old_split_key, childa, childb, &did_merge, &did_rebalance, &splitk, ft->h->nodesize);
        //toku_verify_estimates(t,childa);
        // the tree did react if a merge (did_merge) or rebalance (new split key) occurred
        *did_react = (bool)(did_merge || did_rebalance);

        if (did_merge) {
            invariant_null(splitk.data);
            NONLEAF_CHILDINFO remaining_bnc = BNC(node, childnuma);
            NONLEAF_CHILDINFO merged_bnc = BNC(node, childnumb);
            for (unsigned int i = 0; i < (sizeof remaining_bnc->flow) / (sizeof remaining_bnc->flow[0]); ++i) {
                remaining_bnc->flow[i] += merged_bnc->flow[i];
            }
            destroy_nonleaf_childinfo(merged_bnc);
            set_BNULL(node, childnumb);
            node->n_children--;
            memmove(&node->bp[childnumb],
                    &node->bp[childnumb+1],
                    (node->n_children-childnumb)*sizeof(node->bp[0]));
            REALLOC_N(node->n_children, node->bp);
            node->pivotkeys.delete_at(childnuma);

            // Handle a merge of the rightmost leaf node.
            BLOCKNUM rightmost_blocknum = toku_unsafe_fetch(&ft->rightmost_blocknum);
            if (did_merge && childb->blocknum.b == rightmost_blocknum.b) {
                invariant(childb->blocknum.b != ft->h->root_blocknum.b);
                toku_ftnode_swap_pair_values(childa, childb);
                BP_BLOCKNUM(node, childnuma) = childa->blocknum;
            }

            paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->blocknum.b);
            childa->set_dirty();  // just to make sure
            childb->set_dirty();  // just to make sure
        } else {
            // flow will be inaccurate for a while, oh well.  the children
            // are leaves in this case so it's not a huge deal (we're
            // pretty far down the tree)

            // If we didn't merge the nodes, then we need the correct pivot.
            invariant_notnull(splitk.data);
            node->pivotkeys.replace_at(&splitk, childnuma);
            node->set_dirty();
        }
        toku_destroy_dbt(&splitk);
    }
    //
    // now we possibly flush the children
    //
    if (did_merge) {
        // for test
        call_flusher_thread_callback(flt_flush_before_unpin_remove);

        // merge_remove_key_callback will free the blocknum
        int rrb = toku_cachetable_unpin_and_remove(
            ft->cf,
            childb->ct_pair,
            merge_remove_key_callback,
            ft
            );
        assert_zero(rrb);

        // for test
        call_flusher_thread_callback(ft_flush_aflter_merge);

        // unlock the parent
        paranoid_invariant(node->dirty());
        toku_unpin_ftnode(ft, node);
    }
    else {
        // for test
        call_flusher_thread_callback(ft_flush_aflter_rebalance);

        // unlock the parent
        paranoid_invariant(node->dirty());
        toku_unpin_ftnode(ft, node);
        toku_unpin_ftnode(ft, childb);
    }
    if (childa->height > 0 && fa->should_recursively_flush(childa, fa->extra)) {
        toku_ft_flush_some_child(ft, childa, fa);
    }
    else {
        toku_unpin_ftnode(ft, childa);
    }
}

void toku_ft_flush_some_child(FT ft, FTNODE parent, struct flusher_advice *fa)
// Effect: This function does the following:
//   - Pick a child of parent (the heaviest child),
//   - flush from parent to child,
//   - possibly split/merge child.
//   - if child is gorged, recursively proceed with child
//  Note that parent is already locked
//  Upon exit of this function, parent is unlocked and no new
//  nodes (such as a child) remain locked
{
    int dirtied = 0;
    NONLEAF_CHILDINFO bnc = NULL;
    paranoid_invariant(parent->height>0);
    toku_ftnode_assert_fully_in_memory(parent);
    TXNID parent_oldest_referenced_xid_known = parent->oldest_referenced_xid_known;

    // pick the child we want to flush to
    int childnum = fa->pick_child(ft, parent, fa->extra);

    // for test
    call_flusher_thread_callback(flt_flush_before_child_pin);

    // get the child into memory
    BLOCKNUM targetchild = BP_BLOCKNUM(parent, childnum);
    ft->blocktable.verify_blocknum_allocated(targetchild);
    uint32_t childfullhash = compute_child_fullhash(ft->cf, parent, childnum);
    FTNODE child;
    ftnode_fetch_extra bfe;
    // Note that we don't read the entire node into memory yet.
    // The idea is let's try to do the minimum work before releasing the parent lock
    bfe.create_for_min_read(ft);
    toku_pin_ftnode_with_dep_nodes(ft, targetchild, childfullhash, &bfe, PL_WRITE_EXPENSIVE, 1, &parent, &child, true);

    // for test
    call_flusher_thread_callback(ft_flush_aflter_child_pin);

    if (fa->should_destroy_basement_nodes(fa)) {
        maybe_destroy_child_blbs(parent, child, ft);
    }

    // Note that at this point, we don't have the entire child in.
    // Let's do a quick check to see if the child may be reactive
    // If the child cannot be reactive, then we can safely unlock
    // the parent before finishing reading in the entire child node.
    bool may_child_be_reactive = ft_ftnode_may_be_reactive(ft, child);

    paranoid_invariant(child->blocknum.b!=0);

    // only do the following work if there is a flush to perform
    if (toku_bnc_n_entries(BNC(parent, childnum)) > 0 || parent->height == 1) {
        if (!parent->dirty()) {
            dirtied++;
            parent->set_dirty();
        }
        // detach buffer
        BP_WORKDONE(parent, childnum) = 0;  // this buffer is drained, no work has been done by its contents
        bnc = BNC(parent, childnum);
        NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
        memcpy(new_bnc->flow, bnc->flow, sizeof bnc->flow);
        set_BNC(parent, childnum, new_bnc);
    }

    //
    // at this point, the buffer has been detached from the parent
    // and a new empty buffer has been placed in its stead
    // so, if we are absolutely sure that the child is not
    // reactive, we can unpin the parent
    //
    if (!may_child_be_reactive) {
        toku_unpin_ftnode(ft, parent);
        parent = NULL;
    }

    //
    // now, if necessary, read/decompress the rest of child into memory,
    // so that we can proceed and apply the flush
    //
    bring_node_fully_into_memory(child, ft);

    // It is possible after reading in the entire child,
    // that we now know that the child is not reactive
    // if so, we can unpin parent right now
    // we won't be splitting/merging child
    // and we have already replaced the bnc
    // for the root with a fresh one
    enum reactivity child_re = toku_ftnode_get_reactivity(ft, child);
    if (parent && child_re == RE_STABLE) {
        toku_unpin_ftnode(ft, parent);
        parent = NULL;
    }

    // from above, we know at this point that either the bnc
    // is detached from the parent (which may be unpinned),
    // and we have to apply the flush, or there was no data
    // in the buffer to flush, and as a result, flushing is not necessary
    // and bnc is NULL
    if (bnc != NULL) {
        if (!child->dirty()) {
            dirtied++;
            child->set_dirty();
        }
        // do the actual flush
        toku_bnc_flush_to_child(
            ft,
            bnc,
            child,
            parent_oldest_referenced_xid_known
            );
        destroy_nonleaf_childinfo(bnc);
    }

    fa->update_status(child, dirtied, fa->extra);
    // let's get the reactivity of the child again,
    // it is possible that the flush got rid of some values
    // and now the child is no longer reactive
    child_re = toku_ftnode_get_reactivity(ft, child);
    // if the parent has been unpinned above, then
    // this is our only option, even if the child is not stable
    // if the child is not stable, we'll handle it the next
    // time we need to flush to the child
    if (!parent ||
        child_re == RE_STABLE ||
        (child_re == RE_FUSIBLE && parent->n_children == 1)
        )
    {
        if (parent) {
            toku_unpin_ftnode(ft, parent);
            parent = NULL;
        }
        //
        // it is the responsibility of toku_ft_flush_some_child to unpin child
        //
        if (child->height > 0 && fa->should_recursively_flush(child, fa->extra)) {
            toku_ft_flush_some_child(ft, child, fa);
        }
        else {
            toku_unpin_ftnode(ft, child);
        }
    }
    else if (child_re == RE_FISSIBLE) {
        //
        // it is the responsibility of `ft_split_child` to unlock nodes of
        // parent and child as it sees fit
        //
        paranoid_invariant(parent); // just make sure we have not accidentally unpinned parent
        ft_split_child(ft, parent, childnum, child, SPLIT_EVENLY, fa);
    }
    else if (child_re == RE_FUSIBLE) {
        //
        // it is the responsibility of `maybe_merge_child` to unlock nodes of
        // parent and child as it sees fit
        //
        paranoid_invariant(parent); // just make sure we have not accidentally unpinned parent
        fa->maybe_merge_child(fa, ft, parent, childnum, child, fa->extra);
    }
    else {
        abort();
    }
}

void toku_bnc_flush_to_child(FT ft, NONLEAF_CHILDINFO bnc, FTNODE child, TXNID parent_oldest_referenced_xid_known) {
    paranoid_invariant(bnc);

    TOKULOGGER logger = toku_cachefile_logger(ft->cf);
    TXN_MANAGER txn_manager = logger != nullptr ? toku_logger_get_txn_manager(logger) : nullptr;
    TXNID oldest_referenced_xid_for_simple_gc = TXNID_NONE;

    txn_manager_state txn_state_for_gc(txn_manager);
    bool do_garbage_collection = child->height == 0 && txn_manager != nullptr;
    if (do_garbage_collection) {
        txn_state_for_gc.init();
        oldest_referenced_xid_for_simple_gc = toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager);
    }
    txn_gc_info gc_info(&txn_state_for_gc,
                        oldest_referenced_xid_for_simple_gc,
                        child->oldest_referenced_xid_known,
                        true);
    struct flush_msg_fn {
        FT ft;
        FTNODE child;
        NONLEAF_CHILDINFO bnc;
        txn_gc_info *gc_info;

        STAT64INFO_S stats_delta;
        int64_t logical_rows_delta = 0;
        size_t remaining_memsize = bnc->msg_buffer.buffer_size_in_use();

        flush_msg_fn(FT t, FTNODE n, NONLEAF_CHILDINFO nl, txn_gc_info *g) :
            ft(t), child(n), bnc(nl), gc_info(g), remaining_memsize(bnc->msg_buffer.buffer_size_in_use()) {
            stats_delta = { 0, 0 };
        }
        int operator()(const ft_msg &msg, bool is_fresh) {
            size_t flow_deltas[] = { 0, 0 };
            size_t memsize_in_buffer = message_buffer::msg_memsize_in_buffer(msg);
            if (remaining_memsize <= bnc->flow[0]) {
                // this message is in the current checkpoint's worth of
                // the end of the message buffer
                flow_deltas[0] = memsize_in_buffer;
            } else if (remaining_memsize <= bnc->flow[0] + bnc->flow[1]) {
                // this message is in the last checkpoint's worth of the
                // end of the message buffer
                flow_deltas[1] = memsize_in_buffer;
            }
            toku_ftnode_put_msg(
                ft->cmp,
                ft->update_fun,
                child,
                -1,
                msg,
                is_fresh,
                gc_info,
                flow_deltas,
                &stats_delta,
                &logical_rows_delta);
            remaining_memsize -= memsize_in_buffer;
            return 0;
        }
    } flush_fn(ft, child, bnc, &gc_info);
    bnc->msg_buffer.iterate(flush_fn);

    child->oldest_referenced_xid_known = parent_oldest_referenced_xid_known;

    invariant(flush_fn.remaining_memsize == 0);
    if (flush_fn.stats_delta.numbytes || flush_fn.stats_delta.numrows) {
        toku_ft_update_stats(&ft->in_memory_stats, flush_fn.stats_delta);
    }
    toku_ft_adjust_logical_row_count(ft, flush_fn.logical_rows_delta);
    if (do_garbage_collection) {
        size_t buffsize = bnc->msg_buffer.buffer_size_in_use();
        // may be misleading if there's a broadcast message in there
        toku_ft_status_note_msg_bytes_out(buffsize);
    }
}

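// Record cleaner-thread statistics for the chosen child's buffer:
// node-height counts plus min/max/total buffer size and workdone.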
static void
|
|
update_cleaner_status(
|
|
FTNODE node,
|
|
int childnum)
|
|
{
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_TOTAL_NODES)++;
|
|
if (node->height == 1) {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_H1_NODES)++;
|
|
} else {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_HGT1_NODES)++;
|
|
}
|
|
|
|
unsigned int nbytesinbuf = toku_bnc_nbytesinbuf(BNC(node, childnum));
|
|
if (nbytesinbuf == 0) {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_EMPTY_NODES)++;
|
|
} else {
|
|
if (nbytesinbuf > FL_STATUS_VAL(FT_FLUSHER_CLEANER_MAX_BUFFER_SIZE)) {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_MAX_BUFFER_SIZE) = nbytesinbuf;
|
|
}
|
|
if (nbytesinbuf < FL_STATUS_VAL(FT_FLUSHER_CLEANER_MIN_BUFFER_SIZE)) {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_MIN_BUFFER_SIZE) = nbytesinbuf;
|
|
}
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_TOTAL_BUFFER_SIZE) += nbytesinbuf;
|
|
|
|
uint64_t workdone = BP_WORKDONE(node, childnum);
|
|
if (workdone > FL_STATUS_VAL(FT_FLUSHER_CLEANER_MAX_BUFFER_WORKDONE)) {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_MAX_BUFFER_WORKDONE) = workdone;
|
|
}
|
|
if (workdone < FL_STATUS_VAL(FT_FLUSHER_CLEANER_MIN_BUFFER_WORKDONE)) {
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_MIN_BUFFER_WORKDONE) = workdone;
|
|
}
|
|
FL_STATUS_VAL(FT_FLUSHER_CLEANER_TOTAL_BUFFER_WORKDONE) += workdone;
|
|
}
|
|
}
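
// Inert flusher_advice callbacks for the one-shot split/merge entry points
// below. Those callers never recursively flush, so the pick-child callback
// should be unreachable; it aborts if it is ever called.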
static void
dummy_update_status(
    FTNODE UU(child),
    int UU(dirtied),
    void* UU(extra)
    )
{
}

static int
dummy_pick_heaviest_child(FT UU(h),
                          FTNODE UU(parent),
                          void* UU(extra))
{
    abort();
    return -1;
}

void toku_ft_split_child(
    FT ft,
    FTNODE node,
    int childnum,
    FTNODE child,
    enum split_mode split_mode
    )
{
    struct flusher_advice fa;
    flusher_advice_init(
        &fa,
        dummy_pick_heaviest_child,
        dont_destroy_basement_nodes,
        never_recursively_flush,
        default_merge_child,
        dummy_update_status,
        default_pick_child_after_split,
        NULL
        );
    ft_split_child(
        ft,
        node,
        childnum, // childnum to split
        child,
        split_mode,
        &fa
        );
}

void toku_ft_merge_child(
    FT ft,
    FTNODE node,
    int childnum
    )
{
    struct flusher_advice fa;
    flusher_advice_init(
        &fa,
        dummy_pick_heaviest_child,
        dont_destroy_basement_nodes,
        never_recursively_flush,
        default_merge_child,
        dummy_update_status,
        default_pick_child_after_split,
        NULL
        );
    bool did_react;
    ft_merge_child(
        ft,
        node,
        childnum, // childnum to merge
        &did_react,
        &fa
        );
}
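
// Cleaner-thread entry point: the cachetable hands us a pinned nonleaf node
// and we flush its heaviest child buffer, if that buffer is nonempty.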
int
toku_ftnode_cleaner_callback(
    void *ftnode_pv,
    BLOCKNUM blocknum,
    uint32_t fullhash,
    void *extraargs)
{
    FTNODE node = (FTNODE) ftnode_pv;
    invariant(node->blocknum.b == blocknum.b);
    invariant(node->fullhash == fullhash);
    invariant(node->height > 0);   // we should never pick a leaf node (for now at least)
    FT ft = (FT) extraargs;
    bring_node_fully_into_memory(node, ft);
    int childnum = find_heaviest_child(node);
    update_cleaner_status(node, childnum);

    // Either toku_ft_flush_some_child will unlock the node, or we do it here.
    if (toku_bnc_nbytesinbuf(BNC(node, childnum)) > 0) {
        struct flusher_advice fa;
        struct flush_status_update_extra fste;
        ct_flusher_advice_init(&fa, &fste, ft->h->nodesize);
        toku_ft_flush_some_child(ft, node, &fa);
    } else {
        toku_unpin_ftnode(ft, node);
    }
    return 0;
}
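
// Work item for the cachefile's background (kibbutz) thread pool. A non-NULL
// bnc is a buffer that was detached from the parent and must be flushed into
// node; a NULL bnc means node itself has a buffer that needs flushing.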
struct flusher_extra {
    FT ft;
    FTNODE node;
    NONLEAF_CHILDINFO bnc;
    TXNID parent_oldest_referenced_xid_known;
};

//
// This is the function that gets called by a
// background thread. Its purpose is to complete
// a flush, and possibly do a split/merge.
//
static void flush_node_fun(void *fe_v)
{
    toku::context flush_ctx(CTX_FLUSH);
    struct flusher_extra* fe = (struct flusher_extra *) fe_v;
    // The node that has been placed on the background thread may not be
    // fully in memory: some message buffers may still be compressed.
    // Before performing any operations, we must first make sure the node
    // is fully in memory.
    //
    // If we have a bnc, that means fe->node is a child, and we've already
    // destroyed its basement nodes if necessary, so we now need to either
    // read them back in, or just do the regular partial fetch. If we
    // don't, that means fe->node is a parent, so we need to do this anyway.
    bring_node_fully_into_memory(fe->node, fe->ft);
    fe->node->set_dirty();

    struct flusher_advice fa;
    struct flush_status_update_extra fste;
    flt_flusher_advice_init(&fa, &fste, fe->ft->h->nodesize);

    if (fe->bnc) {
        // In this case, we have a bnc to flush to a node.

        // for test purposes
        call_flusher_thread_callback(flt_flush_before_applying_inbox);

        toku_bnc_flush_to_child(
            fe->ft,
            fe->bnc,
            fe->node,
            fe->parent_oldest_referenced_xid_known
            );
        destroy_nonleaf_childinfo(fe->bnc);

        // After the flush has completed, check whether the node itself now
        // needs flushing. If so, call toku_ft_flush_some_child on the node
        // (this flush passes along a meaningful oldest referenced xid for
        // simple garbage collection), and it is the responsibility of that
        // flush to unlock the node. Otherwise, we unlock it here.
        if (fe->node->height > 0 && toku_ftnode_nonleaf_is_gorged(fe->node, fe->ft->h->nodesize)) {
            toku_ft_flush_some_child(fe->ft, fe->node, &fa);
        }
        else {
            toku_unpin_ftnode(fe->ft, fe->node);
        }
    }
    else {
        // In this case, we were just passed a node with no bnc, which
        // means we are tasked with flushing some buffer in the node.
        // It is the responsibility of toku_ft_flush_some_child to unlock
        // the node.
        toku_ft_flush_some_child(fe->ft, fe->node, &fa);
    }
    remove_background_job_from_cf(fe->ft->cf);
    toku_free(fe);
}
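
// Package the flush arguments and hand them to the cachefile's kibbutz.
// The remove_background_job_from_cf call at the end of flush_node_fun
// balances the background job associated with this enqueue.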
static void
place_node_and_bnc_on_background_thread(
    FT ft,
    FTNODE node,
    NONLEAF_CHILDINFO bnc,
    TXNID parent_oldest_referenced_xid_known)
{
    struct flusher_extra *XMALLOC(fe);
    fe->ft = ft;
    fe->node = node;
    fe->bnc = bnc;
    fe->parent_oldest_referenced_xid_known = parent_oldest_referenced_xid_known;
    cachefile_kibbutz_enq(ft->cf, flush_node_fun, fe);
}

//
// This takes as input a gorged, locked, non-leaf node named parent
// and sets up a flush to be done in the background.
// The flush is set up like this:
// - We call toku_maybe_pin_ftnode_clean on the child we want to flush to, in order to try to lock it.
// - If we successfully pin the child, and the child does not need to be split or merged,
//   then we detach the buffer, place the child and buffer onto a background thread,
//   have the flush complete in the background, and unlock the parent. The child will be
//   unlocked on the background thread.
// - If any of the above does not happen (child cannot be locked,
//   child needs to be split/merged), then we place the parent on the background thread.
//   The parent will be unlocked on the background thread.
//
void toku_ft_flush_node_on_background_thread(FT ft, FTNODE parent)
{
    toku::context flush_ctx(CTX_FLUSH);
    TXNID parent_oldest_referenced_xid_known = parent->oldest_referenced_xid_known;
    //
    // first let's see if we can detach the buffer on the client thread
    // and pick the child we want to flush to
    //
    int childnum = find_heaviest_child(parent);
    paranoid_invariant(toku_bnc_n_entries(BNC(parent, childnum)) > 0);
    //
    // see if we can pin the child
    //
    FTNODE child;
    uint32_t childfullhash = compute_child_fullhash(ft->cf, parent, childnum);
    int r = toku_maybe_pin_ftnode_clean(ft, BP_BLOCKNUM(parent, childnum), childfullhash, PL_WRITE_EXPENSIVE, &child);
    if (r != 0) {
        // In this case, we could not lock the child, so just place the parent on the background thread.
        // In the callback, we will use toku_ft_flush_some_child, which checks to
        // see if we should blow away the old basement nodes.
        place_node_and_bnc_on_background_thread(ft, parent, NULL, parent_oldest_referenced_xid_known);
    }
    else {
        //
        // successfully locked child
        //
        bool may_child_be_reactive = ft_ftnode_may_be_reactive(ft, child);
        if (!may_child_be_reactive) {
            // We're going to unpin the parent, so before we do, we must
            // check to see if we need to blow away the basement nodes to
            // keep the MSN invariants intact.
            maybe_destroy_child_blbs(parent, child, ft);

            //
            // can detach the buffer and unpin the parent here
            //
            parent->set_dirty();
            BP_WORKDONE(parent, childnum) = 0;  // this buffer is drained, no work has been done by its contents
            NONLEAF_CHILDINFO bnc = BNC(parent, childnum);
            NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
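            // Carry the flow counters over to the replacement buffer so the
            // per-checkpoint flow statistics survive the swap.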
            memcpy(new_bnc->flow, bnc->flow, sizeof bnc->flow);
            set_BNC(parent, childnum, new_bnc);

            //
            // at this point, the buffer has been detached from the parent
            // and a new empty buffer has been placed in its stead,
            // so, because we know for sure the child is not
            // reactive, we can unpin the parent
            //
            place_node_and_bnc_on_background_thread(ft, child, bnc, parent_oldest_referenced_xid_known);
            toku_unpin_ftnode(ft, parent);
        }
        else {
            // because the child may be reactive, we need to
            // put the parent on the background thread.
            // As a result, we unlock the child here.
            toku_unpin_ftnode(ft, child);
            // Again, we'll have the parent on the background thread, so
            // we don't need to destroy the basement nodes yet.
            place_node_and_bnc_on_background_thread(ft, parent, NULL, parent_oldest_referenced_xid_known);
        }
    }
}

#include <toku_race_tools.h>
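// The flusher status counters are updated without synchronization by
// design, so tell Helgrind/DRD not to report races on them.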
void __attribute__((__constructor__)) toku_ft_flusher_helgrind_ignore(void);
void
toku_ft_flusher_helgrind_ignore(void) {
    TOKU_VALGRIND_HG_DISABLE_CHECKING(&fl_status, sizeof fl_status);
}