2007-07-13 19:37:47 +00:00
/* -*- mode: C; c-basic-offset: 4 -*- */
2008-01-24 15:10:32 +00:00
# ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
2007-11-29 14:18:54 +00:00
2007-07-13 19:37:47 +00:00
/* Buffered repository tree.
* Observation: The in-memory representation of a node doesn't have to be the same as the on-disk representation.
*   Goal for the in-memory representation: fast
*   Goal for on-disk: small
*
* So to get this running fast, I'll make a version that doesn't do range queries:
*   use a hash table for in-memory
*   simply write the strings on disk.
* Later I'll do a PMA or a skiplist for the in-memory version.
* Also, later I'll convert the format from host order to network order.
* Later, for on disk, I'll compress it (perhaps with gzip, perhaps with the bzip2 algorithm).
*
* The collection of nodes forms a data structure like a B-tree.  The complexities of keeping it balanced apply.
*
* We always write nodes to a new location on disk.
* The nodes themselves contain the information about the tree structure.
* Q: During recovery, how do we find the root node without looking at every block on disk?
* A: The root node is the designated root recorded near the front of the freelist.
*    The freelist is updated infrequently.  Before updating the stable copy of the freelist, we make sure that
*    the root is up-to-date.  We can make the freelist-and-root update be an arbitrarily small fraction of disk bandwidth.
*
*/
2008-02-05 18:25:23 +00:00
# include <arpa/inet.h>
2007-11-26 18:47:44 +00:00
# include <errno.h>
# include <inttypes.h>
# include <stdio.h>
# include <stdlib.h>
2007-07-13 19:37:47 +00:00
# include <string.h>
2008-04-08 02:09:19 +00:00
# include <sys/time.h>
2007-07-13 19:37:47 +00:00
# include <unistd.h>
2007-11-26 18:47:44 +00:00
2013-04-16 23:57:18 -04:00
# include "block_allocator.h"
2008-02-08 03:17:38 +00:00
# include "toku_assert.h"
2007-11-26 18:47:44 +00:00
# include "brt-internal.h"
# include "key.h"
# include "log_header.h"
2008-04-02 23:40:36 +00:00
# include "kv-pair.h"
# include "mempool.h"
2008-04-04 18:03:03 +00:00
# include "leafentry.h"
2007-07-13 19:37:47 +00:00
2008-04-07 01:30:25 +00:00
// Uncomment the next line to turn every VERIFY_NODE() site into an expensive consistency check.
//#define SLOW
#ifdef SLOW
// Checks the node's counts and that every leafentry lives inside the leaf's mempool.
// The mempool checker in this file is named toku_verify_all_in_mempool(); the macro
// previously referred to it by a name that is not defined in this file, which only
// showed up when SLOW was defined.
#define VERIFY_NODE(n) (toku_verify_counts(n), toku_verify_all_in_mempool(n))
#else
// Normal builds: expands to a no-op expression (the argument is not evaluated).
#define VERIFY_NODE(n) ((void)0)
#endif
2007-07-13 19:37:47 +00:00
// Defined in another translation unit (presumably an allocation counter kept by the malloc wrappers -- TODO confirm).
extern long long n_items_malloced ;
2008-01-29 21:43:08 +00:00
// Forward declaration of a file-local checker; being static, it must be defined later in this translation unit.
static void verify_local_fingerprint_nonleaf ( BRTNODE node ) ;
2007-11-14 17:58:38 +00:00
2008-06-02 20:52:12 +00:00
// We invalidate all the OMTCURSORS any time we push into the root of the BRT for that OMT.
// We keep a counter on each brt header, but if the brt header is evicted from the cachetable
// then we lose that counter. So we also keep a global counter.
// An alternative would be to keep only the global counter. But that would invalidate all OMTCURSORS
// even from unrelated BRTs. This way we only invalidate an OMTCURSOR if
// [NOTE(review): the sentence above is cut off in the original source; presumably it ends
//  "...its own BRT was pushed into, or that BRT's header was reloaded" -- confirm.]
// In this file the global counter is read and post-incremented when a header is deserialized,
// to seed that header's root_put_counter.
2008-07-21 18:00:38 +00:00
static u_int64_t global_root_put_counter = 0 ;
2008-06-02 20:52:12 +00:00
2007-07-13 19:37:47 +00:00
/* Frees a node, including all the stuff in the hash table. */
2007-11-29 15:09:14 +00:00
void toku_brtnode_free ( BRTNODE * nodep ) {
2007-08-01 02:37:21 +00:00
BRTNODE node = * nodep ;
2007-07-13 19:37:47 +00:00
int i ;
//printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, node, node->mdicts[0]);
if ( node - > height > 0 ) {
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) {
2008-03-06 21:46:57 +00:00
toku_free ( node - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
}
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
if ( BNC_BUFFER ( node , i ) ) {
toku_fifo_free ( & BNC_BUFFER ( node , i ) ) ;
2007-07-13 19:37:47 +00:00
}
}
2008-03-06 21:46:57 +00:00
toku_free ( node - > u . n . childkeys ) ;
toku_free ( node - > u . n . childinfos ) ;
2007-07-13 19:37:47 +00:00
} else {
if ( node - > u . l . buffer ) // The buffer may have been freed already, in some cases.
2008-04-22 20:39:50 +00:00
toku_omt_destroy ( & node - > u . l . buffer ) ;
2008-04-02 23:40:36 +00:00
void * mpbase = toku_mempool_get_base ( & node - > u . l . buffer_mempool ) ;
toku_mempool_fini ( & node - > u . l . buffer_mempool ) ;
toku_free ( mpbase ) ;
2007-07-13 19:37:47 +00:00
}
2008-04-02 23:40:36 +00:00
2007-07-20 18:00:14 +00:00
toku_free ( node ) ;
2007-08-01 02:37:21 +00:00
* nodep = 0 ;
2007-07-13 19:37:47 +00:00
}
2008-05-27 21:08:31 +00:00
/* Estimate how many bytes of memory NODE occupies.  The result is what this file
 * reports to the cachetable as the size of the node.
 *  - Nonleaf: the node struct, the childinfos array (n_children+1 slots), the
 *    childkeys pointer array (n_children slots), the pivot key bytes, and the
 *    memory used by every child's FIFO.
 *  - Leaf: the node struct, the OMT, and the leaf's mempool.
 * (toku_serialize_brtnode_size(node) was the previous, disabled, estimate for nonleaf nodes.) */
static long brtnode_memory_size (BRTNODE node) {
    if (node->height > 0) {
        int n_children = node->u.n.n_children;
        long fifo_sum = 0;
        int i;
        for (i = 0; i < n_children; i++) {
            fifo_sum += toku_fifo_memory_size(node->u.n.childinfos[i].buffer);
        }
        return sizeof(*node)
            + (1+n_children) * sizeof(node->u.n.childinfos[0])
            /* One pointer per slot.  This term used to be "n_children + sizeof(...)",
             * which undercounted the array. */
            + n_children * sizeof(node->u.n.childkeys[0])
            + node->u.n.totalchildkeylens
            + fifo_sum;
    } else {
        return sizeof(*node)
            + toku_omt_memory_size(node->u.l.buffer)
            + toku_mempool_get_size(&node->u.l.buffer_mempool);
    }
}
2008-04-22 20:39:50 +00:00
2008-04-25 13:45:55 +00:00
/* OMT iteration callback: assert that the leafentry LEV (all leafentry_memsize() bytes of it)
 * lies inside the mempool passed as VMP.  The index argument is unused.  Always returns 0. */
static int verify_in_mempool (OMTVALUE lev, u_int32_t UU(idx), void *vmp) {
    struct mempool *pool = vmp;
    LEAFENTRY entry = lev;
    assert(toku_mempool_inrange(pool, entry, leafentry_memsize(entry)));
    return 0;
}
2008-04-23 04:17:28 +00:00
/* For a leaf node (height==0), assert that every leafentry in the OMT is stored inside
 * the node's own mempool.  Nodes of any other height are left unchecked. */
void toku_verify_all_in_mempool (BRTNODE node) {
    if (node->height != 0) return;
    toku_omt_iterate(node->u.l.buffer, verify_in_mempool, &node->u.l.buffer_mempool);
}
2008-04-22 20:39:50 +00:00
2008-05-02 14:38:35 +00:00
/* Recompute what NODE records about its child number CHILDNUM_OF_NODE from the current state of CHILD:
 *   subtree fingerprint = child's local fingerprint + the subtree fingerprints of the child's children (if any)
 *   leafentry estimate  = sum of the child's per-child estimates, or the child's OMT size when it is a leaf.
 * The values are always stored (no attempt is made to detect "unchanged"), and NODE is marked dirty.
 * The brt and logger arguments are unused. */
static void fixup_child_fingerprint (BRTNODE node, int childnum_of_node, BRTNODE child, BRT UU(brt), TOKULOGGER UU(logger)) {
    u_int32_t fingerprint = child->local_fingerprint;
    u_int64_t n_leafentries = 0;
    if (child->height <= 0) {
        n_leafentries = toku_omt_size(child->u.l.buffer);
    } else {
        int c = 0;
        while (c < child->u.n.n_children) {
            fingerprint   += BNC_SUBTREE_FINGERPRINT(child, c);
            n_leafentries += BNC_SUBTREE_LEAFENTRY_ESTIMATE(child, c);
            c++;
        }
    }
    // We only get called when there is reason to believe the child's fingerprint changed,
    // so just overwrite.
    BNC_SUBTREE_FINGERPRINT(node, childnum_of_node)        = fingerprint;
    BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, childnum_of_node) = n_leafentries;
    node->dirty = 1;
}
2008-02-06 19:27:25 +00:00
// If you pass in data==0 then it only compares the key, not the data (even if is a DUPSORT database)
2007-12-06 13:52:52 +00:00
static int brt_compare_pivot ( BRT brt , DBT * key , DBT * data , bytevec ck ) {
2007-11-19 00:46:09 +00:00
int cmp ;
DBT mydbt ;
2007-11-27 18:16:45 +00:00
struct kv_pair * kv = ( struct kv_pair * ) ck ;
2007-11-19 20:22:56 +00:00
if ( brt - > flags & TOKU_DB_DUPSORT ) {
2007-11-29 15:17:46 +00:00
cmp = brt - > compare_fun ( brt - > db , key , toku_fill_dbt ( & mydbt , kv_pair_key ( kv ) , kv_pair_keylen ( kv ) ) ) ;
2007-11-19 00:46:09 +00:00
if ( cmp = = 0 & & data ! = 0 )
2007-11-29 15:17:46 +00:00
cmp = brt - > dup_compare ( brt - > db , data , toku_fill_dbt ( & mydbt , kv_pair_val ( kv ) , kv_pair_vallen ( kv ) ) ) ;
2007-11-27 18:16:45 +00:00
} else {
2007-11-29 15:17:46 +00:00
cmp = brt - > compare_fun ( brt - > db , key , toku_fill_dbt ( & mydbt , kv_pair_key ( kv ) , kv_pair_keylen ( kv ) ) ) ;
2007-11-27 18:16:45 +00:00
}
2007-11-19 00:46:09 +00:00
return cmp ;
}
2013-04-16 23:57:18 -04:00
/* Flush callback registered with the cachetable for brt nodes (see toku_cachetable_put in this file).
 *   brtnode_v  the BRTNODE being flushed; its thisnodename must equal NODENAME.
 *   extraargs  the struct brt_header of the tree, handed to the serializer.
 *   write_me   when true, serialize the node to the cachefile's fd.
 *   keep_me    when false, free the node after the (optional) write.
 * The size, modified_lsn and rename_p arguments are unused. */
void toku_brtnode_flush_callback (CACHEFILE cachefile, BLOCKNUM nodename, void *brtnode_v, void *extraargs, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
    BRTNODE node = brtnode_v;
    struct brt_header *header = extraargs;
    if (0) {
        /* Debug trace; change the condition to enable it. */
        printf(" %s:%d toku_brtnode_flush_callback %p thisnodename=% " PRId64 " keep_me=%d height=%d ", __FILE__, __LINE__, node, node->thisnodename.b, keep_me, node->height);
        if (node->height == 0) {
            printf(" buf=%p mempool-base=%p ", node->u.l.buffer, node->u.l.buffer_mempool.base);
        }
        printf(" \n ");
    }
    assert(node->thisnodename.b == nodename.b);
    if (write_me) {
        toku_serialize_brtnode_to(toku_cachefile_fd(cachefile), node->thisnodename, node, header);
    }
    if (!keep_me) {
        toku_brtnode_free(&node);
    }
}
2013-04-16 23:57:18 -04:00
/* Fetch callback registered with the cachetable for brt nodes: deserialize block NODENAME
 * from the cachefile's fd into *brtnode_pv.
 *   extraargs    the tree's struct brt_header (must be non-null).
 *   sizep        on success, receives brtnode_memory_size() of the new node.
 *   written_lsn  on success, receives the node's disk_lsn.
 * Returns the deserializer's result; on failure *sizep and *written_lsn are not touched. */
int toku_brtnode_fetch_callback (CACHEFILE cachefile, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void *extraargs, LSN *written_lsn) {
    assert(extraargs);
    struct brt_header *header = extraargs;
    BRTNODE *nodep = (BRTNODE *) brtnode_pv;
    int rc = toku_deserialize_brtnode_from(toku_cachefile_fd(cachefile), nodename, fullhash, nodep, header);
    if (rc != 0) return rc;
    *sizep       = brtnode_memory_size(*nodep);
    *written_lsn = (*nodep)->disk_lsn;
    return rc;
}
2008-05-22 21:28:00 +00:00
void toku_brtheader_free ( struct brt_header * h ) {
if ( h - > n_named_roots > 0 ) {
int i ;
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
toku_free ( h - > names [ i ] ) ;
}
toku_free ( h - > names ) ;
}
toku_fifo_free ( & h - > fifo ) ;
toku_free ( h - > roots ) ;
2008-06-18 21:38:01 +00:00
toku_free ( h - > root_hashes ) ;
2008-05-22 21:28:00 +00:00
toku_free ( h - > flags_array ) ;
2013-04-16 23:57:18 -04:00
toku_free ( h - > block_translation ) ;
destroy_block_allocator ( & h - > block_allocator ) ;
2008-05-22 21:28:00 +00:00
toku_free ( h ) ;
}
2013-04-16 23:57:18 -04:00
/* Close hook installed as the cachefile's userdata destructor (see toku_cachefile_set_userdata
 * in this file).  If the header is dirty, write the header and then the header's fifo to the
 * cachefile's fd; in all cases free the header.  Always returns 0. */
int toku_brtheader_close (CACHEFILE cachefile, void *header_v) {
    struct brt_header *header = header_v;
    if (header->dirty) {
        toku_serialize_brt_header_to(toku_cachefile_fd(cachefile), header);
        // The fifo goes at the allocator's allocated limit.  This must be computed after
        // the header has been written.
        u_int64_t fifo_offset = block_allocator_allocated_limit(header->block_allocator);
        toku_serialize_fifo_at(toku_cachefile_fd(cachefile), fifo_offset, header->fifo);
    }
    toku_brtheader_free(header);
    return 0;
}
2013-04-16 23:57:18 -04:00
// Disabled code: everything down to the matching #endif is excluded by "#if 0" and is not compiled.
// It is a cachetable-style fetch callback for the header block (block number 0).
// NOTE(review): it passes fullhash to toku_deserialize_brtheader_from(), whereas the live call in
// toku_read_brt_header_and_store_in_cachefile() passes only (fd, blocknum, &h) -- this code looks
// stale with respect to the current signature; confirm before re-enabling.
#if 0
static int toku_brtheader_fetch_callback ( CACHEFILE cachefile , BLOCKNUM nodename , u_int32_t fullhash , void * * headerp_v , long * sizep __attribute__ ( ( unused ) ) , void * extraargs __attribute__ ( ( __unused__ ) ) , LSN * written_lsn ) {
2008-04-09 02:45:27 +00:00
int r ;
2007-07-13 19:37:47 +00:00
struct brt_header * * h = ( struct brt_header * * ) headerp_v ;
2013-04-16 23:57:18 -04:00
// The header is only ever fetched from block 0.
assert ( nodename . b = = 0 ) ;
2008-06-17 17:05:19 +00:00
// Propagate any deserialization error unchanged.
if ( ( r = toku_deserialize_brtheader_from ( toku_cachefile_fd ( cachefile ) , nodename , fullhash , h ) ) ) return r ;
2008-04-09 02:45:27 +00:00
//printf("%s:%d fifo=%p\nn", __FILE__, __LINE__, (*h)->fifo);
2007-11-14 17:58:38 +00:00
written_lsn - > lsn = 0 ; // !!! WRONG. This should be stored or kept redundantly or something.
2013-04-16 23:57:18 -04:00
// Expects an empty free list (free_blocks.b == -1), the same condition allocate_diskblocknumber() asserts.
assert ( ( * h ) - > free_blocks . b = = - 1 ) ;
2008-04-09 02:45:27 +00:00
return 0 ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:18 -04:00
# endif
2007-07-13 19:37:47 +00:00
2013-04-16 23:57:18 -04:00
int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header **header)
// If the cachefile already carries a header as its userdata, hand that one back.
// Otherwise deserialize the header from block 0 of the file, stamp it with the next value of
// the global root-put counter, and install it as the cachefile's userdata (with
// toku_brtheader_close as its close hook).
// Returns 0 on success.  If deserialization fails, its error is returned and neither
// *header nor the cachefile is modified.
{
    struct brt_header *cached = toku_cachefile_get_userdata(cf);
    if (cached != 0) {
        *header = cached;
        return 0;
    }

    struct brt_header *loaded;
    int rc = toku_deserialize_brtheader_from(toku_cachefile_fd(cf), make_blocknum(0), &loaded);
    if (rc != 0) return rc;

    loaded->root_put_counter = global_root_put_counter++;
    toku_cachefile_set_userdata(cf, (void*)loaded, toku_brtheader_close);
    *header = loaded;
    return 0;
}
2008-03-05 18:34:32 +00:00
/* Unpin NODE in BRT's cachetable, reporting the node's dirty flag and its current
 * in-memory size.  Runs VERIFY_NODE first (a no-op unless SLOW is defined).
 * Returns whatever toku_cachetable_unpin returns. */
int toku_unpin_brtnode (BRT brt, BRTNODE node) {
    VERIFY_NODE(node);
    long node_bytes = brtnode_memory_size(node);
    return toku_cachetable_unpin(brt->cf, node->thisnodename, node->fullhash, node->dirty, node_bytes);
}
2007-07-13 19:37:47 +00:00
/* A key/value pair described by two (pointer, length) pairs.  KVPAIR is a pointer to this struct.
 * NOTE(review): not to be confused with struct kv_pair (from kv-pair.h), which is what the
 * pivot keys in this file use. */
typedef struct kvpair {
bytevec key ; /* the key bytes */
unsigned int keylen ; /* presumably the length of key in bytes -- confirm */
bytevec val ; /* the value bytes */
unsigned int vallen ; /* presumably the length of val in bytes -- confirm */
} * KVPAIR ;
2013-04-16 23:57:18 -04:00
/* Hand out a fresh block number for BRT: the header's current unused_blocks value is
 * returned in *res, the header's counter is advanced by one, and the header is marked dirty.
 * The free list must be empty (free_blocks.b == -1).  The logger argument is unused.
 * Always returns 0. */
int allocate_diskblocknumber (BLOCKNUM *res, BRT brt, TOKULOGGER logger __attribute__((__unused__))) {
    assert(brt->h->free_blocks.b == -1); // no blocks in the free list
    BLOCKNUM fresh = brt->h->unused_blocks;
    brt->h->unused_blocks.b += 1;
    brt->h->dirty = 1;
    *res = fresh;
    return 0;
}
2008-04-03 13:02:15 +00:00
/* Size, in bytes, of the mempool to allocate for a leaf whose node size is NODESIZE:
 * the node size plus one quarter of it (integer division) as headroom. */
u_int32_t mp_pool_size_for_nodesize (u_int32_t nodesize) {
    u_int32_t headroom = nodesize / 4;
    return nodesize + headroom;
}
2008-04-08 02:09:19 +00:00
// Simple LCG random number generator.  Not high quality, but good enough.
// The recurrence is purely multiplicative:
//     rstate <- (279470275 * rstate) mod 4294967291
// There is no additive term, so a state that is 0 modulo 4294967291 stays 0 forever.
static int r_seeded = 0;      // nonzero once mysrandom() has been called
static u_int32_t rstate = 1;  // current generator state

// Seed the generator with S.
// A seed that is 0 modulo 4294967291 (that is, 0 or -5 as an int) would make every later
// output 0, so such a seed is replaced by 1.  Every other seed yields exactly the
// sequence it always did.
static inline void mysrandom (int s) {
    u_int32_t seed = (u_int32_t)s;
    if (seed % 4294967291ull == 0) seed = 1;
    rstate = seed;
    r_seeded = 1;
}

// Return the next pseudo-random value.  If the generator has never been seeded it is first
// seeded from the current time (seconds since the epoch).
static inline u_int32_t myrandom (void) {
    if (!r_seeded) {
        struct timeval tv;
        gettimeofday(&tv, 0);
        mysrandom(tv.tv_sec);
    }
    // The product is formed in 64 bits; the result is below the modulus and so fits in 32 bits.
    rstate = (279470275ull * (u_int64_t)rstate) % 4294967291ull;
    return rstate;
}
2013-04-16 23:57:18 -04:00
/* Set up the nonleaf half of the union: no children yet, nothing buffered, no arrays allocated. */
static void initialize_brtnode_nonleaf_part (BRTNODE n) {
    n->u.n.n_children = 0;
    n->u.n.totalchildkeylens = 0;
    n->u.n.n_bytes_in_buffers = 0;
    n->u.n.childinfos = 0;
    n->u.n.childkeys = 0;
}

/* Set up the leaf half of the union: an empty OMT plus a freshly malloc'd mempool whose
 * size is mp_pool_size_for_nodesize(n->nodesize).  Allocation failures are fatal (assert). */
static void initialize_brtnode_leaf_part (BRTNODE n) {
    int r = toku_omt_create(&n->u.l.buffer);
    assert(r == 0);
    {
        u_int32_t pool_bytes = mp_pool_size_for_nodesize(n->nodesize);
        void *pool_mem = toku_malloc(pool_bytes);
        assert(pool_mem);
        toku_mempool_init(&n->u.l.buffer_mempool, pool_mem, pool_bytes);
    }
    static int rcount = 0;   /* counts leaf initializations; only of interest when debugging */
    rcount++;
    n->u.l.n_bytes_in_buffer = 0;
    n->u.l.seqinsert = 0;
}

/* Fill in a freshly allocated node N of tree T with block number NODENAME and the given
 * HEIGHT (0 means leaf; must be >= 0).  The node starts out dirty, with both LSNs zero,
 * a zero local fingerprint, and a random() multiplier for its fingerprints. */
static void initialize_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height) {
    /* identity and layout */
    n->tag = TYP_BRTNODE;
    n->nodesize = t->h->nodesize;
    n->flags = t->flags;
    n->thisnodename = nodename;
    /* a new node can always start at lsn 0 */
    n->disk_lsn.lsn = 0;
    n->log_lsn = n->disk_lsn;
    n->layout_version = BRT_LAYOUT_VERSION;
    n->height = height;
    /* fingerprint state */
    n->rand4fingerprint = random();
    n->local_fingerprint = 0;
    n->dirty = 1;

    assert(height >= 0);
    if (height <= 0) {
        initialize_brtnode_leaf_part(n);
    } else {
        initialize_brtnode_nonleaf_part(n);
    }
}
2008-04-22 20:39:50 +00:00
// logs the memory allocation, but not the creation of the new node
2008-03-06 21:46:57 +00:00
int toku_create_new_brtnode ( BRT t , BRTNODE * result , int height , TOKULOGGER logger ) {
2007-07-13 19:37:47 +00:00
TAGMALLOC ( BRTNODE , n ) ;
int r ;
2013-04-16 23:57:18 -04:00
BLOCKNUM name ;
r = allocate_diskblocknumber ( & name , t , logger ) ;
2008-01-18 21:28:27 +00:00
assert ( r = = 0 ) ;
2007-07-13 19:37:47 +00:00
assert ( n ) ;
assert ( t - > h - > nodesize > 0 ) ;
2008-06-15 17:09:14 +00:00
n - > ever_been_written = 0 ;
2007-07-13 19:37:47 +00:00
initialize_brtnode ( t , n , name , height ) ;
* result = n ;
assert ( n - > nodesize > 0 ) ;
2007-11-16 20:34:13 +00:00
// n->brt = t;
2008-04-07 01:30:25 +00:00
//printf("%s:%d putting %p (%lld)\n", __FILE__, __LINE__, n, n->thisnodename);
2008-06-17 17:05:19 +00:00
u_int32_t fullhash = toku_cachetable_hash ( t - > cf , n - > thisnodename ) ;
n - > fullhash = fullhash ;
r = toku_cachetable_put ( t - > cf , n - > thisnodename , fullhash ,
n , brtnode_memory_size ( n ) ,
2013-04-16 23:57:18 -04:00
toku_brtnode_flush_callback , toku_brtnode_fetch_callback , t - > h ) ;
2008-01-22 16:27:54 +00:00
assert ( r = = 0 ) ;
2008-03-06 21:46:57 +00:00
return 0 ;
2007-07-13 19:37:47 +00:00
}
2008-02-05 18:25:23 +00:00
/* Append the command (type, xid, k, v) to the FIFO buffer of child CHILDNUM of the nonleaf NODE.
 * On success the node's local fingerprint, the per-child byte count and the node-wide byte
 * count are updated, the node is marked dirty, and 0 is returned.
 * If the enqueue fails its error is returned and the node is left untouched. */
static int insert_to_buffer_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v, int type, TXNID xid) {
    unsigned int msg_bytes = BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + k->size + v->size;
    int rc = toku_fifo_enq(BNC_BUFFER(node, childnum), k->data, k->size, v->data, v->size, type, xid);
    if (rc != 0) return rc;

    node->local_fingerprint += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, k->data, k->size, v->data, v->size);
    BNC_NBYTESINBUF(node, childnum) += msg_bytes;
    node->u.n.n_bytes_in_buffers    += msg_bytes;
    node->dirty = 1;
    return 0;
}
2008-04-25 13:45:55 +00:00
/* OMT iteration callback: store the leafentry LEV at position IDX of the LEAFENTRY array
 * passed as VARRAY.  Always returns 0. */
static int fill_buf (OMTVALUE lev, u_int32_t idx, void *varray) {
    LEAFENTRY *slots = varray;
    LEAFENTRY entry = lev;
    slots[idx] = entry;
    return 0;
}
2007-07-13 19:37:47 +00:00
2008-02-08 19:54:00 +00:00
/* Split the leaf NODE in two.  NODE keeps the leafentries before the split point and a newly
 * created leaf B receives the rest (copied into B's mempool and freed from NODE's mempool).
 * Local fingerprints and n_bytes_in_buffer are adjusted on both nodes.
 *
 * The split point (break_at) is chosen in one of two ways:
 *  - If at least half of the entries were inserted sequentially (node->u.l.seqinsert), entries are
 *    peeled off the right-hand end until NODE's serialized size fits in nodesize and at least two
 *    entries have moved.
 *  - Otherwise the split is placed where the running sum of leafentry disk sizes reaches half of
 *    the total.
 *
 * The split is logged with toku_log_leafsplit; when a logger is present both nodes' log_lsn are
 * set to the resulting lsn.
 *
 *   nodea, nodeb  receive NODE and B respectively.
 *   splitk        if non-null, is filled in with a malloc'd kv_pair built from the last entry left
 *                 in NODE (key and value for DUPSORT trees, key only otherwise).
 *
 * Returns 0 on success, or the error from node creation, OMT construction or logging.
 * Many internal failures are treated as fatal (assert). */
static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) {
    BRTNODE B;
    int r;
    assert(node->height==0);
    assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
    if ((r = toku_create_new_brtnode(t, &B, 0, logger))) return r;
    assert(B->nodesize>0);
    assert(node->nodesize>0);
    assert(node->height>0 || node->u.l.buffer!=0);

    toku_verify_all_in_mempool(node);

    u_int32_t n_leafentries = toku_omt_size(node->u.l.buffer);
    u_int32_t break_at = 0;
    unsigned int seqinsert = node->u.l.seqinsert;
    node->u.l.seqinsert = 0;
    if (seqinsert >= n_leafentries/2) {
        // Sequential-insert case: move as few entries as possible off the right-hand end.
        u_int32_t node_size = toku_serialize_brtnode_size(node);
        break_at = n_leafentries - 1;
        OMTVALUE v;
        while (1) {
            r = toku_omt_fetch(node->u.l.buffer, break_at, &v, NULL);
            assert(r==0);
            LEAFENTRY le = v;
            node_size -= OMT_ITEM_OVERHEAD + leafentry_disksize(le);
            if (node_size <= node->nodesize && (n_leafentries-break_at) >= 2)
                break;
            break_at -= 1;
        }

        u_int32_t i;
        for (i=0; break_at < toku_omt_size(node->u.l.buffer); i++) {
            // Fetch the entry at break_at and delete it; the following entries shift down.
            // On the first iteration v still holds the entry fetched by the loop above.
            if (i > 0) {
                r = toku_omt_fetch(node->u.l.buffer, break_at, &v, NULL);
                assert(r==0);
            }
            LEAFENTRY oldle = v;
            u_int32_t diff_fp = toku_le_crc(oldle);
            u_int32_t diff_size = OMT_ITEM_OVERHEAD + leafentry_disksize(oldle);
            r = toku_omt_delete_at(node->u.l.buffer, break_at);
            assert(r==0);
            LEAFENTRY newle = toku_mempool_malloc(&B->u.l.buffer_mempool, leafentry_memsize(oldle), 1);
            assert(newle!=0); // it's a fresh mpool, so this should always work.
            memcpy(newle, oldle, leafentry_memsize(oldle));
            toku_mempool_mfree(&node->u.l.buffer_mempool, oldle, leafentry_memsize(oldle));
            node->local_fingerprint -= node->rand4fingerprint * diff_fp;
            B   ->local_fingerprint += B   ->rand4fingerprint * diff_fp;
            node->u.l.n_bytes_in_buffer -= diff_size;
            B   ->u.l.n_bytes_in_buffer += diff_size;
            // insert into B
            r = toku_omt_insert_at(B->u.l.buffer, newle, i);
            assert(r==0);
            toku_verify_all_in_mempool(node);
            toku_verify_all_in_mempool(B);
        }
    } else {
        // Balanced case.  Here seqinsert < n_leafentries/2, so n_leafentries >= 2.
        OMTVALUE *MALLOC_N(n_leafentries, leafentries);
        assert(leafentries);
        toku_omt_iterate(node->u.l.buffer, fill_buf, leafentries);
        break_at = 0;
        {
            u_int32_t i;
            u_int32_t sumlesizes=0;
            for (i=0; i<n_leafentries; i++) sumlesizes += leafentry_disksize(leafentries[i]);
            u_int32_t sumsofar=0;
            for (i=0; i<n_leafentries; i++) {
                assert(toku_mempool_inrange(&node->u.l.buffer_mempool, leafentries[i], leafentry_memsize(leafentries[i])));
                sumsofar += leafentry_disksize(leafentries[i]);
                if (sumsofar*2 >= sumlesizes) {
                    break_at = i;
                    break;
                }
            }
        }
        // If the very first entry already accounts for half of the bytes, break_at is 0 and
        // NODE would be left empty (and the split key below could not be fetched).
        // Keep that first entry in NODE instead; B still receives n_leafentries-1 >= 1 entries.
        if (break_at == 0) break_at = 1;

        // Now we know where we are going to break it
        OMT old_omt = node->u.l.buffer;
        toku_omt_destroy(&B->u.l.buffer); // Destroy B's empty OMT, so I can rebuild it from an array
        {
            u_int32_t i;
            u_int32_t diff_fp = 0;
            u_int32_t diff_size = 0;
            for (i=break_at; i<n_leafentries; i++) {
                LEAFENTRY oldle = leafentries[i];
                LEAFENTRY newle = toku_mempool_malloc(&B->u.l.buffer_mempool, leafentry_memsize(oldle), 1);
                assert(newle!=0); // it's a fresh mpool, so this should always work.
                diff_fp   += toku_le_crc(oldle);
                diff_size += OMT_ITEM_OVERHEAD + leafentry_disksize(oldle);
                memcpy(newle, oldle, leafentry_memsize(oldle));
                toku_mempool_mfree(&node->u.l.buffer_mempool, oldle, leafentry_memsize(oldle));
                leafentries[i] = newle;
            }
            node->local_fingerprint -= node->rand4fingerprint * diff_fp;
            B   ->local_fingerprint += B   ->rand4fingerprint * diff_fp;
            node->u.l.n_bytes_in_buffer -= diff_size;
            B   ->u.l.n_bytes_in_buffer += diff_size;
        }
        r = toku_omt_create_from_sorted_array(&B->u.l.buffer, leafentries+break_at, n_leafentries-break_at);
        if (r == 0) {
            r = toku_omt_create_from_sorted_array(&node->u.l.buffer, leafentries, break_at);
        }
        // The temporary array is ours to free whether or not the OMTs could be built
        // (it used to leak on the error paths).
        toku_free(leafentries);
        if (r != 0) return r;

        toku_verify_all_in_mempool(node);
        toku_verify_all_in_mempool(B);

        toku_omt_destroy(&old_omt);
    }

    LSN lsn = {0};
    r = toku_log_leafsplit(logger, &lsn, 0, filenum, node->thisnodename, B->thisnodename, n_leafentries, break_at, node->nodesize, B->rand4fingerprint, (t->flags&TOKU_DB_DUPSORT)!=0);
    // Report a logging failure now; r is reused below and the error would otherwise be lost.
    if (r != 0) return r;
    if (logger) {
        node->log_lsn = lsn;
        B->log_lsn    = lsn;
    }

    if (splitk) {
        // The split key is built from the last entry that stays in NODE.
        memset(splitk, 0, sizeof *splitk);
        OMTVALUE lev;
        r = toku_omt_fetch(node->u.l.buffer, toku_omt_size(node->u.l.buffer)-1, &lev, NULL);
        assert(r==0); // that fetch should have worked.
        LEAFENTRY le = lev;
        if (node->flags & TOKU_DB_DUPSORT) {
            splitk->size = le_any_keylen(le) + le_any_vallen(le);
            splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), le_any_val(le), le_any_vallen(le));
        } else {
            splitk->size = le_any_keylen(le);
            splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), 0, 0);
        }
        splitk->flags = 0;
    }
    assert(r == 0);
    assert(node->height>0 || node->u.l.buffer!=0);

    *nodea = node;
    *nodeb = B;
    assert(toku_serialize_brtnode_size(node) <= node->nodesize);
    assert(toku_serialize_brtnode_size(B)    <= B->nodesize);
    return 0;
}
2008-04-07 01:30:25 +00:00
//#define MAX_PATHLEN_TO_ROOT 40
2008-03-19 22:42:46 +00:00
2008-04-07 01:30:25 +00:00
/* Account for enqueueing the command (type, xid, key, data) into child CHILDNUM of NODE:
 *  - add node->rand4fingerprint * toku_calc_fingerprint_cmd(...) to *fingerprint, and
 *  - write a brtenq log entry, unless XID is the transaction that created the tree.
 * The fingerprint is updated before logging, so it has changed even if logging fails.
 * Returns 0, or the error from toku_log_brtenq. */
static int log_and_save_brtenq (TOKULOGGER logger, BRT t, BRTNODE node, int childnum, TXNID xid, int type, const char *key, int keylen, const char *data, int datalen, u_int32_t *fingerprint) {
    u_int32_t delta = node->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
    *fingerprint += delta;

    if (t->txn_that_created == xid) return 0;   // nothing is logged for the creating transaction

    BYTESTRING keybs  = { .len = keylen,  .data = (char*)key  };
    BYTESTRING databs = { .len = datalen, .data = (char*)data };
    return toku_log_brtenq(logger, &node->log_lsn, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, xid, type, keybs, databs);
}
2007-07-24 01:32:03 +00:00
/* Side effect: sets splitk->data pointer to a malloc'd value */
2008-04-07 01:30:25 +00:00
static int brt_nonleaf_split ( BRT t , BRTNODE node , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk , TOKULOGGER logger ) {
2008-01-29 21:43:08 +00:00
int old_n_children = node - > u . n . n_children ;
int n_children_in_a = old_n_children / 2 ;
int n_children_in_b = old_n_children - n_children_in_a ;
BRTNODE B ;
FILENUM fnum = toku_cachefile_filenum ( t - > cf ) ;
2007-07-13 19:37:47 +00:00
assert ( node - > height > 0 ) ;
assert ( node - > u . n . n_children > = 2 ) ; // Otherwise, how do we split? We need at least two children to split. */
assert ( t - > h - > nodesize > = node - > nodesize ) ; /* otherwise we might be in trouble because the nodesize shrank. */
2008-03-05 18:34:32 +00:00
toku_create_new_brtnode ( t , & B , node - > height , logger ) ;
2008-03-06 21:46:57 +00:00
MALLOC_N ( n_children_in_b + 1 , B - > u . n . childinfos ) ;
MALLOC_N ( n_children_in_b , B - > u . n . childkeys ) ;
2008-01-29 21:43:08 +00:00
B - > u . n . n_children = n_children_in_b ;
2007-10-03 19:34:31 +00:00
//printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B);
2007-07-13 19:37:47 +00:00
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
{
/* The first n_children_in_a go into node a.
* That means that the first n_children_in_a - 1 keys go into node a .
* The splitter key is key number n_children_in_a */
int i ;
2008-01-29 21:43:08 +00:00
2008-01-31 14:58:14 +00:00
for ( i = 0 ; i < n_children_in_b ; i + + ) {
2008-01-31 22:05:43 +00:00
int r = toku_fifo_create ( & BNC_BUFFER ( B , i ) ) ;
2008-01-29 21:43:08 +00:00
if ( r ! = 0 ) return r ;
2008-03-06 21:46:57 +00:00
BNC_NBYTESINBUF ( B , i ) = 0 ;
BNC_SUBTREE_FINGERPRINT ( B , i ) = 0 ;
2008-04-30 13:23:04 +00:00
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( B , i ) = 0 ;
2007-07-13 19:37:47 +00:00
}
2008-01-29 21:43:08 +00:00
for ( i = n_children_in_a ; i < old_n_children ; i + + ) {
2007-07-13 19:37:47 +00:00
int targchild = i - n_children_in_a ;
2008-01-31 22:05:43 +00:00
FIFO from_htab = BNC_BUFFER ( node , i ) ;
FIFO to_htab = BNC_BUFFER ( B , targchild ) ;
2013-04-16 23:57:18 -04:00
BLOCKNUM thischildblocknum = BNC_BLOCKNUM ( node , i ) ;
2008-01-29 21:43:08 +00:00
2013-04-16 23:57:18 -04:00
BNC_BLOCKNUM ( B , targchild ) = thischildblocknum ;
2008-06-18 00:30:36 +00:00
BNC_HAVE_FULLHASH ( B , targchild ) = BNC_HAVE_FULLHASH ( node , i ) ;
BNC_FULLHASH ( B , targchild ) = BNC_FULLHASH ( node , i ) ;
2007-07-13 19:37:47 +00:00
2013-04-16 23:57:18 -04:00
int r = toku_log_addchild ( logger , ( LSN * ) 0 , 0 , fnum , B - > thisnodename , targchild , thischildblocknum , BNC_SUBTREE_FINGERPRINT ( node , i ) ) ;
2008-01-29 21:43:08 +00:00
if ( r ! = 0 ) return r ;
while ( 1 ) {
bytevec key , data ;
unsigned int keylen , datalen ;
2008-03-17 18:56:12 +00:00
u_int32_t type ;
2008-02-05 18:25:23 +00:00
TXNID xid ;
int fr = toku_fifo_peek ( from_htab , & key , & keylen , & data , & datalen , & type , & xid ) ;
2008-01-29 21:43:08 +00:00
if ( fr ! = 0 ) break ;
int n_bytes_moved = keylen + datalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ;
u_int32_t old_from_fingerprint = node - > local_fingerprint ;
2008-07-27 22:16:49 +00:00
u_int32_t delta = toku_calc_fingerprint_cmd ( type , xid , key , keylen , data , datalen ) ;
2008-01-29 21:43:08 +00:00
u_int32_t new_from_fingerprint = old_from_fingerprint - node - > rand4fingerprint * delta ;
if ( r ! = 0 ) return r ;
2008-07-10 18:46:41 +00:00
if ( t - > txn_that_created ! = xid ) {
r = toku_log_brtdeq ( logger , & node - > log_lsn , 0 , fnum , node - > thisnodename , n_children_in_a ) ;
if ( r ! = 0 ) return r ;
}
2008-04-07 01:30:25 +00:00
r = log_and_save_brtenq ( logger , t , B , targchild , xid , type , key , keylen , data , datalen , & B - > local_fingerprint ) ;
2008-02-05 18:25:23 +00:00
r = toku_fifo_enq ( to_htab , key , keylen , data , datalen , type , xid ) ;
2008-01-29 21:43:08 +00:00
if ( r ! = 0 ) return r ;
toku_fifo_deq ( from_htab ) ;
// key and data will no longer be valid
node - > local_fingerprint = new_from_fingerprint ;
2008-01-31 22:05:43 +00:00
B - > u . n . n_bytes_in_buffers + = n_bytes_moved ;
BNC_NBYTESINBUF ( B , targchild ) + = n_bytes_moved ;
node - > u . n . n_bytes_in_buffers - = n_bytes_moved ;
BNC_NBYTESINBUF ( node , i ) - = n_bytes_moved ;
2008-02-08 18:44:50 +00:00
// verify_local_fingerprint_nonleaf(B);
// verify_local_fingerprint_nonleaf(node);
2008-01-29 21:43:08 +00:00
}
2007-11-14 17:58:38 +00:00
2008-01-30 21:23:01 +00:00
// Delete a child, removing its fingerprint, and also the preceding pivot key. The child number must be > 0
2008-01-29 21:43:08 +00:00
{
BYTESTRING bs = { . len = kv_pair_keylen ( node - > u . n . childkeys [ i - 1 ] ) ,
. data = kv_pair_key ( node - > u . n . childkeys [ i - 1 ] ) } ;
2008-01-30 21:23:01 +00:00
assert ( i > 0 ) ;
2013-04-16 23:57:18 -04:00
r = toku_log_delchild ( logger , ( LSN * ) 0 , 0 , fnum , node - > thisnodename , n_children_in_a , thischildblocknum , BNC_SUBTREE_FINGERPRINT ( node , i ) , bs ) ;
2008-01-29 21:43:08 +00:00
if ( r ! = 0 ) return r ;
if ( i > n_children_in_a ) {
2008-03-21 21:02:30 +00:00
r = toku_log_setpivot ( logger , ( LSN * ) 0 , 0 , fnum , B - > thisnodename , targchild - 1 , bs ) ;
2008-01-29 21:43:08 +00:00
if ( r ! = 0 ) return r ;
B - > u . n . childkeys [ targchild - 1 ] = node - > u . n . childkeys [ i - 1 ] ;
B - > u . n . totalchildkeylens + = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ i - 1 ] ) ;
node - > u . n . totalchildkeylens - = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ i - 1 ] ) ;
node - > u . n . childkeys [ i - 1 ] = 0 ;
}
}
2013-04-16 23:57:18 -04:00
BNC_BLOCKNUM ( node , i ) = make_blocknum ( 0 ) ;
2008-06-18 00:30:36 +00:00
BNC_HAVE_FULLHASH ( node , i ) = FALSE ;
2008-01-29 21:43:08 +00:00
2008-01-31 22:05:43 +00:00
BNC_SUBTREE_FINGERPRINT ( B , targchild ) = BNC_SUBTREE_FINGERPRINT ( node , i ) ;
BNC_SUBTREE_FINGERPRINT ( node , i ) = 0 ;
2008-01-29 21:43:08 +00:00
2008-04-30 13:23:04 +00:00
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( B , targchild ) = BNC_SUBTREE_LEAFENTRY_ESTIMATE ( node , i ) ;
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( node , i ) = 0 ;
2008-01-31 22:05:43 +00:00
assert ( BNC_NBYTESINBUF ( node , i ) = = 0 ) ;
2007-07-13 19:37:47 +00:00
}
2008-01-29 21:43:08 +00:00
// Drop the n_children now (not earlier) so that we can do the fingerprint verification at any time.
node - > u . n . n_children = n_children_in_a ;
for ( i = n_children_in_a ; i < old_n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
toku_fifo_free ( & BNC_BUFFER ( node , i ) ) ;
2007-07-13 19:37:47 +00:00
}
2008-01-29 21:43:08 +00:00
2007-07-24 01:32:03 +00:00
splitk - > data = ( void * ) ( node - > u . n . childkeys [ n_children_in_a - 1 ] ) ;
2007-12-06 14:20:47 +00:00
splitk - > size = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ n_children_in_a - 1 ] ) ;
node - > u . n . totalchildkeylens - = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ n_children_in_a - 1 ] ) ;
2008-03-06 21:46:57 +00:00
REALLOC_N ( n_children_in_a + 1 , node - > u . n . childinfos ) ;
REALLOC_N ( n_children_in_a , node - > u . n . childkeys ) ;
2007-10-03 19:34:31 +00:00
2008-01-29 21:43:08 +00:00
verify_local_fingerprint_nonleaf ( node ) ;
verify_local_fingerprint_nonleaf ( B ) ;
2007-07-13 19:37:47 +00:00
}
2008-01-29 21:43:08 +00:00
* nodea = node ;
2007-07-13 19:37:47 +00:00
* nodeb = B ;
2008-06-18 20:49:50 +00:00
assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( B ) < = B - > nodesize ) ;
2008-01-29 21:43:08 +00:00
return 0 ;
2007-07-13 19:37:47 +00:00
}
2007-11-28 19:00:21 +00:00
/* Find the child of NODE whose buffer currently holds the most bytes.
 *
 *  node      a node with at least one child (asserted).
 *  childnum  out parameter: receives the index of the child with the largest
 *            BNC_NBYTESINBUF value.  On a tie the lowest index wins, because a
 *            later child only replaces the current best when it is strictly heavier.
 */
static void find_heaviest_child (BRTNODE node, int *childnum) {
    /* Check the precondition before reading child 0's buffer size. */
    assert(node->u.n.n_children>0);
    int max_child = 0;
    int max_weight = BNC_NBYTESINBUF(node, 0);
    int i;

    if (0) printf(" %s:%d weights: %d ", __FILE__, __LINE__, max_weight);
    for (i=1; i<node->u.n.n_children; i++) {
        int this_weight = BNC_NBYTESINBUF(node, i);
        if (0) printf(" %d ", this_weight);
        if (max_weight < this_weight) {
            max_child = i;
            max_weight = this_weight;
        }
    }
    *childnum = max_child;
    if (0) printf(" \n ");
}
2008-05-17 00:49:13 +00:00
/* Return a printable name for the command type TYP, for use in diagnostics.
 * The switch has no default label; a value that matches none of the listed
 * enumerators yields the " ? " string.
 * Declared with __unused__ so that a build in which nothing calls it does not warn. */
static const char *unparse_cmd_type (enum brt_cmd_type typ) __attribute__((__unused__));
static const char *unparse_cmd_type (enum brt_cmd_type typ) {
    const char *name = " ? ";
    switch (typ) {
    case BRT_NONE:        name = " NONE ";        break;
    case BRT_INSERT:      name = " INSERT ";      break;
    case BRT_DELETE_ANY:  name = " DELETE_ANY ";  break;
    case BRT_DELETE_BOTH: name = " DELETE_BOTH "; break;
    case BRT_ABORT_ANY:   name = " ABORT_ANY ";   break;
    case BRT_ABORT_BOTH:  name = " ABORT_BOTH ";  break;
    case BRT_COMMIT_ANY:  name = " COMMIT_ANY ";  break;
    case BRT_COMMIT_BOTH: name = " COMMIT_BOTH "; break;
    }
    return name;
}
2008-02-04 15:04:22 +00:00
static int brtnode_put_cmd ( BRT t , BRTNODE node , BRT_CMD cmd ,
2007-09-28 17:11:22 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb ,
DBT * split ,
2008-04-07 01:30:25 +00:00
TOKULOGGER ) ;
2007-07-13 19:37:47 +00:00
2008-05-23 17:04:21 +00:00
/* Upper bound, in bytes, on the size of a pivot key.  The PRD limits a row to
 * 16KB, so a pivot key can be at most 16KB as well. */
#define MAX_PIVOT_KEY_SIZE (1 << 14)
2008-01-11 14:03:33 +00:00
/* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */
2007-09-06 21:36:45 +00:00
static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here ( BRT t , BRTNODE node , BRTNODE child ,
2008-02-04 15:04:22 +00:00
BRT_CMD cmd ,
2007-09-28 17:11:22 +00:00
int childnum_of_node ,
2008-04-07 01:30:25 +00:00
TOKULOGGER logger ) {
2007-07-13 19:37:47 +00:00
assert ( node - > height > 0 ) ; /* Not a leaf. */
2007-09-06 21:36:45 +00:00
DBT * k = cmd - > u . id . key ;
DBT * v = cmd - > u . id . val ;
2008-05-17 00:49:13 +00:00
unsigned int oldsize = toku_serialize_brtnode_size ( child ) ;
2008-05-23 17:04:21 +00:00
unsigned int newsize_bounded = oldsize + k - > size + v - > size + KEY_VALUE_OVERHEAD + LE_OVERHEAD_BOUND + MAX_PIVOT_KEY_SIZE ;
2008-04-22 20:39:50 +00:00
newsize_bounded + = ( child - > height > 0 ) ? BRT_CMD_OVERHEAD : OMT_ITEM_OVERHEAD ;
int to_child = newsize_bounded < = child - > nodesize ;
if ( 0 ) {
2007-07-24 01:32:03 +00:00
printf ( " %s:%d pushing %s to %s %d " , __FILE__ , __LINE__ , ( char * ) k - > data , to_child ? " child " : " hash " , childnum_of_node ) ;
2007-07-13 19:37:47 +00:00
if ( childnum_of_node + 1 < node - > u . n . n_children ) {
2007-07-24 01:32:03 +00:00
DBT k2 ;
2007-07-13 19:37:47 +00:00
printf ( " nextsplitkey=%s \n " , ( char * ) node - > u . n . childkeys [ childnum_of_node ] ) ;
2007-12-06 13:52:52 +00:00
assert ( t - > compare_fun ( t - > db , k , toku_fill_dbt ( & k2 , node - > u . n . childkeys [ childnum_of_node ] , toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ childnum_of_node ] ) ) ) < = 0 ) ;
2007-07-13 19:37:47 +00:00
} else {
printf ( " \n " ) ;
}
}
2007-11-14 17:58:38 +00:00
int r ;
2007-07-13 19:37:47 +00:00
if ( to_child ) {
2007-07-24 01:32:03 +00:00
int again_split = - 1 ; BRTNODE againa , againb ;
DBT againk ;
2007-11-29 15:17:46 +00:00
toku_init_dbt ( & againk ) ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d hello!\n", __FILE__, __LINE__);
2007-11-14 17:58:38 +00:00
r = brtnode_put_cmd ( t , child , cmd ,
2008-02-08 19:54:00 +00:00
& again_split , & againa , & againb , & againk ,
2008-04-07 01:30:25 +00:00
logger ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
assert ( again_split = = 0 ) ; /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */
} else {
2008-02-05 18:25:23 +00:00
r = insert_to_buffer_in_nonleaf ( node , childnum_of_node , k , v , cmd - > type , cmd - > xid ) ;
2007-07-13 19:37:47 +00:00
}
2008-05-17 00:49:13 +00:00
if ( newsize_bounded < toku_serialize_brtnode_size ( child ) ) {
fprintf ( stderr , " %s:%d size estimate is messed up. newsize_bounded=%d actual_size=%d child_height=%d to_child=%d \n " ,
__FILE__ , __LINE__ , newsize_bounded , toku_serialize_brtnode_size ( child ) , child - > height , to_child ) ;
fprintf ( stderr , " cmd->type=%s cmd->xid=%lld \n " , unparse_cmd_type ( cmd - > type ) , ( unsigned long long ) cmd - > xid ) ;
fprintf ( stderr , " oldsize=%d k->size=%d v->size=%d \n " , oldsize , k - > size , v - > size ) ;
assert ( toku_serialize_brtnode_size ( child ) < = child - > nodesize ) ;
//assert(newsize_bounded >= toku_serialize_brtnode_size(child)); // Don't abort on this
}
2008-02-08 19:54:00 +00:00
fixup_child_fingerprint ( node , childnum_of_node , child , t , logger ) ;
2007-11-14 17:58:38 +00:00
return r ;
2007-07-13 19:37:47 +00:00
}
2007-09-06 21:36:45 +00:00
static int push_a_brt_cmd_down ( BRT t , BRTNODE node , BRTNODE child , int childnum ,
2008-02-04 15:04:22 +00:00
BRT_CMD cmd ,
2007-09-28 17:11:22 +00:00
int * child_did_split , BRTNODE * childa , BRTNODE * childb ,
DBT * childsplitk ,
2008-04-07 01:30:25 +00:00
TOKULOGGER logger ) {
2007-07-13 19:37:47 +00:00
//if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, "");
//printf("%s:%d hello!\n", __FILE__, __LINE__);
assert ( node - > height > 0 ) ;
{
2007-09-06 21:36:45 +00:00
int r = brtnode_put_cmd ( t , child , cmd ,
2007-09-28 17:11:22 +00:00
child_did_split , childa , childb , childsplitk ,
2008-04-07 01:30:25 +00:00
logger ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
}
2007-07-24 02:36:00 +00:00
2007-09-06 21:36:45 +00:00
DBT * k = cmd - > u . id . key ;
DBT * v = cmd - > u . id . val ;
2007-07-13 19:37:47 +00:00
//if (debug) printf("%s:%d %*sinserted down child_did_split=%d\n", __FILE__, __LINE__, debug, "", child_did_split);
2008-03-17 18:56:12 +00:00
u_int32_t old_fingerprint = node - > local_fingerprint ;
2008-07-27 22:16:49 +00:00
u_int32_t new_fingerprint = old_fingerprint - node - > rand4fingerprint * toku_calc_fingerprint_cmdstruct ( cmd ) ;
2008-03-17 18:56:12 +00:00
node - > local_fingerprint = new_fingerprint ;
2008-07-10 18:46:41 +00:00
if ( t - > txn_that_created ! = cmd - > xid ) {
2008-05-02 14:38:35 +00:00
int r = toku_log_brtdeq ( logger , & node - > log_lsn , 0 , toku_cachefile_filenum ( t - > cf ) , node - > thisnodename , childnum ) ;
2008-03-17 18:56:12 +00:00
assert ( r = = 0 ) ;
}
2007-07-13 19:37:47 +00:00
{
2008-01-31 22:05:43 +00:00
int r = toku_fifo_deq ( BNC_BUFFER ( node , childnum ) ) ;
2007-07-24 02:36:00 +00:00
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
}
{
2007-09-06 21:36:45 +00:00
int n_bytes_removed = ( k - > size + v - > size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ) ;
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffers - = n_bytes_removed ;
2008-01-31 22:05:43 +00:00
BNC_NBYTESINBUF ( node , childnum ) - = n_bytes_removed ;
2007-11-15 14:44:05 +00:00
node - > dirty = 1 ;
2007-07-13 19:37:47 +00:00
}
2007-11-14 17:58:38 +00:00
if ( * child_did_split ) {
2008-03-18 10:19:41 +00:00
// Don't try to fix these up.
//fixup_child_fingerprint(node, childnum, *childa, t, logger);
//fixup_child_fingerprint(node, childnum+1, *childb, t, logger);
2007-11-14 17:58:38 +00:00
} else {
2008-02-08 19:54:00 +00:00
fixup_child_fingerprint ( node , childnum , child , t , logger ) ;
2007-11-14 17:58:38 +00:00
}
2007-07-13 19:37:47 +00:00
return 0 ;
}
2008-05-20 23:47:39 +00:00
/* Forward declaration: handle_split_of_child calls this to lighten a node whose
 * serialized size exceeds its nodesize, before the definition appears. */
static int brtnode_maybe_push_down (BRT t, BRTNODE node,
                                    int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
                                    DBT *splitk,
                                    TOKULOGGER logger);
2007-11-30 17:40:04 +00:00
2007-11-28 19:00:21 +00:00
/* Incremented by handle_split_of_child each time it splits a nonleaf node. */
static int split_count = 0;
2007-07-13 19:37:47 +00:00
/* NODE is a node with a child.
2008-02-05 18:25:23 +00:00
* childnum was split into two nodes childa , and childb . childa is the same as the original child . childb is a new child .
2007-07-13 19:37:47 +00:00
* We must slide things around , & move things from the old table to the new tables .
2008-01-17 15:41:42 +00:00
* We also move things to the new children as much as we can without doing any pushdowns or splitting of the child .
2008-01-11 14:03:33 +00:00
* We must delete the old buffer ( but the old child is already deleted . )
2007-07-13 19:37:47 +00:00
* We also unpin the new children .
*/
static int handle_split_of_child ( BRT t , BRTNODE node , int childnum ,
2007-07-24 01:32:03 +00:00
BRTNODE childa , BRTNODE childb ,
DBT * childsplitk , /* the data in the childsplitk is alloc'd and is consumed by this call. */
int * did_split , BRTNODE * nodea , BRTNODE * nodeb ,
DBT * splitk ,
2008-04-07 01:30:25 +00:00
TOKULOGGER logger ) {
2007-07-13 19:37:47 +00:00
assert ( node - > height > 0 ) ;
2007-08-23 18:07:18 +00:00
assert ( 0 < = childnum & & childnum < node - > u . n . n_children ) ;
2008-01-31 22:05:43 +00:00
FIFO old_h = BNC_BUFFER ( node , childnum ) ;
int old_count = BNC_NBYTESINBUF ( node , childnum ) ;
2007-07-13 19:37:47 +00:00
int cnum ;
int r ;
assert ( node - > u . n . n_children < = TREE_FANOUT ) ;
2007-11-29 14:44:03 +00:00
if ( toku_brt_debug_mode ) {
2007-07-13 19:37:47 +00:00
int i ;
2007-07-24 01:32:03 +00:00
printf ( " %s:%d Child %d did split on %s \n " , __FILE__ , __LINE__ , childnum , ( char * ) childsplitk - > data ) ;
2007-07-13 19:37:47 +00:00
printf ( " %s:%d oldsplitkeys: " , __FILE__ , __LINE__ ) ;
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) printf ( " %s " , ( char * ) node - > u . n . childkeys [ i ] ) ;
printf ( " \n " ) ;
}
2007-11-15 14:44:05 +00:00
node - > dirty = 1 ;
2007-09-18 16:09:55 +00:00
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(node);
2008-03-06 21:46:57 +00:00
REALLOC_N ( node - > u . n . n_children + 2 , node - > u . n . childinfos ) ;
REALLOC_N ( node - > u . n . n_children + 1 , node - > u . n . childkeys ) ;
2007-07-13 19:37:47 +00:00
// Slide the children over.
2008-04-30 13:23:04 +00:00
BNC_SUBTREE_FINGERPRINT ( node , node - > u . n . n_children + 1 ) = 0 ;
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( node , node - > u . n . n_children + 1 ) = 0 ;
2007-07-13 19:37:47 +00:00
for ( cnum = node - > u . n . n_children ; cnum > childnum + 1 ; cnum - - ) {
2008-01-31 22:05:43 +00:00
node - > u . n . childinfos [ cnum ] = node - > u . n . childinfos [ cnum - 1 ] ;
2007-07-13 19:37:47 +00:00
}
2008-03-21 21:02:30 +00:00
r = toku_log_addchild ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( t - > cf ) , node - > thisnodename , childnum + 1 , childb - > thisnodename , 0 ) ;
2008-03-18 10:19:41 +00:00
node - > u . n . n_children + + ;
2013-04-16 23:57:18 -04:00
assert ( BNC_BLOCKNUM ( node , childnum ) . b = = childa - > thisnodename . b ) ; // use the same child
BNC_BLOCKNUM ( node , childnum + 1 ) = childb - > thisnodename ;
2008-06-18 00:30:36 +00:00
BNC_HAVE_FULLHASH ( node , childnum + 1 ) = TRUE ;
BNC_FULLHASH ( node , childnum + 1 ) = childb - > fullhash ;
2008-03-18 10:19:41 +00:00
// BNC_SUBTREE_FINGERPRINT(node, childnum)=0; // leave the subtreefingerprint alone for the child, so we can log the change
2008-04-30 13:23:04 +00:00
BNC_SUBTREE_FINGERPRINT ( node , childnum + 1 ) = 0 ;
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( node , childnum + 1 ) = 0 ;
2008-02-08 19:54:00 +00:00
fixup_child_fingerprint ( node , childnum , childa , t , logger ) ;
fixup_child_fingerprint ( node , childnum + 1 , childb , t , logger ) ;
2008-01-31 22:05:43 +00:00
r = toku_fifo_create ( & BNC_BUFFER ( node , childnum + 1 ) ) ; assert ( r = = 0 ) ;
2008-03-18 10:19:41 +00:00
//verify_local_fingerprint_nonleaf(node); // The fingerprint hasn't changed and everhything is still there.
r = toku_fifo_create ( & BNC_BUFFER ( node , childnum ) ) ; assert ( r = = 0 ) ; // ??? SHould handle this error case
2008-01-31 22:05:43 +00:00
BNC_NBYTESINBUF ( node , childnum ) = 0 ;
BNC_NBYTESINBUF ( node , childnum + 1 ) = 0 ;
2007-11-14 17:58:38 +00:00
// Remove all the cmds from the local fingerprint. Some may get added in again when we try to push to the child.
2008-02-05 18:25:23 +00:00
FIFO_ITERATE ( old_h , skey , skeylen , sval , svallen , type , xid ,
2008-03-18 10:19:41 +00:00
( {
u_int32_t old_fingerprint = node - > local_fingerprint ;
2008-07-27 22:16:49 +00:00
u_int32_t new_fingerprint = old_fingerprint - node - > rand4fingerprint * toku_calc_fingerprint_cmd ( type , xid , skey , skeylen , sval , svallen ) ;
2008-07-10 18:46:41 +00:00
if ( t - > txn_that_created ! = xid ) {
r = toku_log_brtdeq ( logger , & node - > log_lsn , 0 , toku_cachefile_filenum ( t - > cf ) , node - > thisnodename , childnum ) ;
assert ( r = = 0 ) ;
}
2008-03-18 10:19:41 +00:00
node - > local_fingerprint = new_fingerprint ;
} ) ) ;
//verify_local_fingerprint_nonleaf(node);
2007-11-14 17:58:38 +00:00
2007-07-13 19:37:47 +00:00
// Slide the keys over
2008-01-30 21:23:01 +00:00
{
2008-01-31 14:52:52 +00:00
struct kv_pair * pivot = childsplitk - > data ;
2008-01-30 21:23:01 +00:00
BYTESTRING bs = { . len = childsplitk - > size ,
2008-01-31 14:52:52 +00:00
. data = kv_pair_key ( pivot ) } ;
2008-03-21 21:02:30 +00:00
r = toku_log_setpivot ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( t - > cf ) , node - > thisnodename , childnum , bs ) ;
2008-01-30 21:23:01 +00:00
if ( r ! = 0 ) return r ;
2008-01-31 14:52:52 +00:00
2008-03-18 10:19:41 +00:00
for ( cnum = node - > u . n . n_children - 2 ; cnum > childnum ; cnum - - ) {
2008-01-31 14:52:52 +00:00
node - > u . n . childkeys [ cnum ] = node - > u . n . childkeys [ cnum - 1 ] ;
}
2008-03-07 02:06:15 +00:00
//if (logger) assert((t->flags&TOKU_DB_DUPSORT)==0); // the setpivot is wrong for TOKU_DB_DUPSORT, so recovery will be broken.
2008-01-31 14:52:52 +00:00
node - > u . n . childkeys [ childnum ] = pivot ;
2008-03-07 02:06:15 +00:00
node - > u . n . totalchildkeylens + = toku_brt_pivot_key_len ( t , pivot ) ;
2008-01-30 21:23:01 +00:00
}
2008-01-31 14:52:52 +00:00
2007-11-29 14:44:03 +00:00
if ( toku_brt_debug_mode ) {
2007-07-13 19:37:47 +00:00
int i ;
printf ( " %s:%d splitkeys: " , __FILE__ , __LINE__ ) ;
2008-03-18 10:19:41 +00:00
for ( i = 0 ; i < node - > u . n . n_children - 2 ; i + + ) printf ( " %s " , ( char * ) node - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
printf ( " \n " ) ;
}
2008-03-18 10:19:41 +00:00
//verify_local_fingerprint_nonleaf(node);
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffers - = old_count ; /* By default, they are all removed. We might add them back in. */
2007-07-13 19:37:47 +00:00
/* Keep pushing to the children, but not if the children would require a pushdown */
2008-02-05 18:25:23 +00:00
FIFO_ITERATE ( old_h , skey , skeylen , sval , svallen , type , xid , ( {
2007-07-24 01:32:03 +00:00
DBT skd , svd ;
2008-02-05 18:25:23 +00:00
BRT_CMD_S brtcmd = { type , xid , . u . id = { toku_fill_dbt ( & skd , skey , skeylen ) ,
toku_fill_dbt ( & svd , sval , svallen ) } } ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
2008-02-06 19:27:25 +00:00
int pusha = 0 , pushb = 0 ;
2008-02-05 18:25:23 +00:00
switch ( type ) {
case BRT_INSERT :
case BRT_DELETE_BOTH :
2008-04-07 01:30:25 +00:00
case BRT_DELETE_ANY :
case BRT_ABORT_BOTH :
case BRT_ABORT_ANY :
case BRT_COMMIT_BOTH :
case BRT_COMMIT_ANY :
if ( ( type ! = BRT_DELETE_ANY & & type ! = BRT_ABORT_ANY & & type ! = BRT_COMMIT_ANY ) | | 0 = = ( t - > flags & TOKU_DB_DUPSORT ) ) {
2008-02-06 19:27:25 +00:00
// If it's an INSERT or DELETE_BOTH or there are no duplicates then we just put the command into one subtree
2008-02-05 18:25:23 +00:00
int cmp = brt_compare_pivot ( t , & skd , & svd , childsplitk - > data ) ;
2008-02-06 19:27:25 +00:00
if ( cmp < = 0 ) pusha = 1 ;
else pushb = 1 ;
} else {
2008-04-07 01:30:25 +00:00
assert ( ( type = = BRT_DELETE_ANY | | type = = BRT_ABORT_ANY | | type = = BRT_COMMIT_ANY ) & & t - > flags & TOKU_DB_DUPSORT ) ;
// It is a DELETE or ABORT_ANY and it's a DUPSORT database,
// in which case if the comparison function comes up 0 we must write the command to both children. (See #201)
2008-02-06 19:27:25 +00:00
int cmp = brt_compare_pivot ( t , & skd , 0 , childsplitk - > data ) ;
if ( cmp < = 0 ) pusha = 1 ;
if ( cmp > = 0 ) pushb = 1 ; // Could be that both pusha and pushb are set
}
if ( pusha ) {
// If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order.
if ( toku_fifo_n_entries ( BNC_BUFFER ( node , childnum ) ) = = 0 ) {
2008-04-07 01:30:25 +00:00
r = push_brt_cmd_down_only_if_it_wont_push_more_else_put_here ( t , node , childa , & brtcmd , childnum , logger ) ;
2008-02-05 18:25:23 +00:00
} else {
2008-02-06 19:27:25 +00:00
r = insert_to_buffer_in_nonleaf ( node , childnum , & skd , & svd , type , xid ) ;
2008-02-05 18:25:23 +00:00
}
2008-01-29 21:43:08 +00:00
}
2008-02-06 19:27:25 +00:00
if ( pushb ) {
// If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order.
if ( toku_fifo_n_entries ( BNC_BUFFER ( node , childnum + 1 ) ) = = 0 ) {
2008-04-07 01:30:25 +00:00
r = push_brt_cmd_down_only_if_it_wont_push_more_else_put_here ( t , node , childb , & brtcmd , childnum + 1 , logger ) ;
2008-02-06 19:27:25 +00:00
} else {
r = insert_to_buffer_in_nonleaf ( node , childnum + 1 , & skd , & svd , type , xid ) ;
}
}
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
if ( r ! = 0 ) printf ( " r=%d \n " , r ) ;
assert ( r = = 0 ) ;
2008-02-05 18:25:23 +00:00
goto ok ;
2008-02-06 19:27:25 +00:00
2008-02-05 18:25:23 +00:00
case BRT_NONE :
// Don't have to do anything in this case, can just drop the command
2008-02-06 19:27:25 +00:00
goto ok ;
2008-02-05 18:25:23 +00:00
}
printf ( " Bad type %d \n " , type ) ; // Don't use default: because I want a compiler warning if I forget a enum case, and I want a runtime error if the type isn't one of the expected ones.
assert ( 0 ) ;
2008-02-06 19:27:25 +00:00
ok : /*nothing*/ ;
2008-02-05 18:25:23 +00:00
} ) ) ;
2007-09-06 21:36:45 +00:00
2008-01-11 14:03:33 +00:00
toku_fifo_free ( & old_h ) ;
2007-07-13 19:37:47 +00:00
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(childa);
//verify_local_fingerprint_nonleaf(childb);
//verify_local_fingerprint_nonleaf(node);
2007-07-13 19:37:47 +00:00
2008-04-07 01:30:25 +00:00
VERIFY_NODE ( node ) ;
VERIFY_NODE ( childa ) ;
VERIFY_NODE ( childb ) ;
2007-07-13 19:37:47 +00:00
2008-03-05 18:34:32 +00:00
r = toku_unpin_brtnode ( t , childa ) ;
2007-11-14 17:58:38 +00:00
assert ( r = = 0 ) ;
2008-03-05 18:34:32 +00:00
r = toku_unpin_brtnode ( t , childb ) ;
2007-11-14 17:58:38 +00:00
assert ( r = = 0 ) ;
2007-07-13 19:37:47 +00:00
if ( node - > u . n . n_children > TREE_FANOUT ) {
//printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs);
2008-04-07 01:30:25 +00:00
r = brt_nonleaf_split ( t , node , nodea , nodeb , splitk , logger ) ;
2008-01-29 21:43:08 +00:00
if ( r ! = 0 ) return r ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d did split\n", __FILE__, __LINE__);
split_count + + ;
* did_split = 1 ;
assert ( ( * nodea ) - > height > 0 ) ;
assert ( ( * nodeb ) - > height > 0 ) ;
assert ( ( * nodea ) - > u . n . n_children > 0 ) ;
assert ( ( * nodeb ) - > u . n . n_children > 0 ) ;
2013-04-16 23:57:18 -04:00
assert ( BNC_BLOCKNUM ( * nodea , ( * nodea ) - > u . n . n_children - 1 ) . b ! = 0 ) ;
assert ( BNC_BLOCKNUM ( * nodeb , ( * nodeb ) - > u . n . n_children - 1 ) . b ! = 0 ) ;
2007-11-19 23:54:17 +00:00
assert ( toku_serialize_brtnode_size ( * nodea ) < = ( * nodea ) - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( * nodeb ) < = ( * nodeb ) - > nodesize ) ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(*nodea);
//verify_local_fingerprint_nonleaf(*nodeb);
2007-07-13 19:37:47 +00:00
} else {
* did_split = 0 ;
2007-11-30 17:40:04 +00:00
if ( toku_serialize_brtnode_size ( node ) > node - > nodesize ) {
2007-12-01 13:12:56 +00:00
/* lighten the node by pushing down its buffers. this may cause
the current node to split and go away */
2008-05-20 23:47:39 +00:00
r = brtnode_maybe_push_down ( t , node , did_split , nodea , nodeb , splitk , logger ) ;
2007-11-30 17:40:04 +00:00
assert ( r = = 0 ) ;
}
2007-12-01 13:12:56 +00:00
if ( * did_split = = 0 ) assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
2007-07-13 19:37:47 +00:00
}
return 0 ;
}
2008-06-18 00:30:36 +00:00
/* Return the cachetable hash for child CHILDNUM of NODE, caching it in the node.
 *
 * If the node already records a hash for that child, it is checked (by assertion)
 * against toku_cachetable_hash and returned.  Otherwise the hash is computed from
 * the child's block number, stored in the node, and returned.
 * A HAVE_FULLHASH value that is neither TRUE nor FALSE is an assertion failure. */
static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnum) {
    if (BNC_HAVE_FULLHASH(node, childnum) == TRUE) {
        assert(BNC_FULLHASH(node, childnum)==toku_cachetable_hash(cf, BNC_BLOCKNUM(node, childnum)));
        return BNC_FULLHASH(node, childnum);
    }
    if (BNC_HAVE_FULLHASH(node, childnum) == FALSE) {
        u_int32_t fresh_hash = toku_cachetable_hash(cf, BNC_BLOCKNUM(node, childnum));
        BNC_FULLHASH(node, childnum) = fresh_hash;
        BNC_HAVE_FULLHASH(node, childnum) = TRUE;
        return fresh_hash;
    }
    assert(0);
    return 0;
}
2007-09-06 21:36:45 +00:00
static int push_some_brt_cmds_down ( BRT t , BRTNODE node , int childnum ,
2007-09-28 17:11:22 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb ,
DBT * splitk ,
2008-04-07 01:30:25 +00:00
TOKULOGGER logger ) {
2007-07-13 19:37:47 +00:00
void * childnode_v ;
BRTNODE child ;
int r ;
assert ( node - > height > 0 ) ;
2013-04-16 23:57:18 -04:00
BLOCKNUM targetchild = BNC_BLOCKNUM ( node , childnum ) ;
assert ( targetchild . b > = 0 & & targetchild . b < t - > h - > unused_blocks . b ) ; // This assertion could fail in a concurrent setting since another process might have bumped unused memory.
2008-06-18 00:30:36 +00:00
u_int32_t childfullhash = compute_child_fullhash ( t - > cf , node , childnum ) ;
2008-06-17 17:05:19 +00:00
r = toku_cachetable_get_and_pin ( t - > cf , targetchild , childfullhash , & childnode_v , NULL ,
2013-04-16 23:57:18 -04:00
toku_brtnode_flush_callback , toku_brtnode_fetch_callback , t - > h ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
2007-10-03 19:34:31 +00:00
//printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v);
2007-07-13 19:37:47 +00:00
child = childnode_v ;
2013-04-16 23:57:18 -04:00
assert ( child - > thisnodename . b ! = 0 ) ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(child);
2008-04-07 01:30:25 +00:00
VERIFY_NODE ( child ) ;
2008-01-11 14:03:33 +00:00
//printf("%s:%d height=%d n_bytes_in_buffer = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_buffer[0], child->n_bytes_in_buffer[1], child->n_bytes_in_buffer[2]);
2013-04-16 23:57:18 -04:00
if ( child - > height > 0 & & child - > u . n . n_children > 0 ) assert ( BNC_BLOCKNUM ( child , child - > u . n . n_children - 1 ) . b ! = 0 ) ;
2007-07-13 19:37:47 +00:00
2007-07-30 21:44:27 +00:00
if ( 0 ) {
static int count = 0 ;
count + + ;
printf ( " %s:%d pushing %d count=%d \n " , __FILE__ , __LINE__ , childnum , count ) ;
}
2007-07-13 19:37:47 +00:00
{
bytevec key , val ;
ITEMLEN keylen , vallen ;
2008-01-31 22:05:43 +00:00
//printf("%s:%d Try random_pick, weight=%d \n", __FILE__, __LINE__, BNC_NBYTESINBUF(node, childnum));
assert ( toku_fifo_n_entries ( BNC_BUFFER ( node , childnum ) ) > 0 ) ;
2008-03-17 18:56:12 +00:00
u_int32_t type ;
2008-02-05 18:25:23 +00:00
TXNID xid ;
while ( 0 = = toku_fifo_peek ( BNC_BUFFER ( node , childnum ) , & key , & keylen , & val , & vallen , & type , & xid ) ) {
2007-07-24 01:32:03 +00:00
int child_did_split = 0 ; BRTNODE childa , childb ;
DBT hk , hv ;
DBT childsplitk ;
2007-09-06 21:36:45 +00:00
2008-02-05 18:25:23 +00:00
BRT_CMD_S brtcmd = { type , xid , . u . id = { toku_fill_dbt ( & hk , key , keylen ) ,
toku_fill_dbt ( & hv , val , vallen ) } } ;
2007-09-06 21:36:45 +00:00
2007-07-24 02:36:00 +00:00
//printf("%s:%d random_picked\n", __FILE__, __LINE__);
2007-11-29 15:17:46 +00:00
toku_init_dbt ( & childsplitk ) ;
2007-09-06 21:36:45 +00:00
r = push_a_brt_cmd_down ( t , node , child , childnum ,
2007-09-28 17:11:22 +00:00
& brtcmd ,
& child_did_split , & childa , & childb ,
& childsplitk ,
2008-04-07 01:30:25 +00:00
logger ) ;
2007-07-24 02:36:00 +00:00
if ( 0 ) {
unsigned int sum = 0 ;
2008-02-05 18:25:23 +00:00
FIFO_ITERATE ( BNC_BUFFER ( node , childnum ) , subhk __attribute__ ( ( __unused__ ) ) , hkl , hd __attribute__ ( ( __unused__ ) ) , hdl , subtype __attribute__ ( ( __unused__ ) ) , subxid __attribute__ ( ( __unused__ ) ) ,
2008-01-11 14:03:33 +00:00
sum + = hkl + hdl + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ) ;
2007-07-24 02:36:00 +00:00
printf ( " %s:%d sum=%d \n " , __FILE__ , __LINE__ , sum ) ;
2008-01-31 22:05:43 +00:00
assert ( sum = = BNC_NBYTESINBUF ( node , childnum ) ) ;
2007-07-24 02:36:00 +00:00
}
2008-01-31 22:05:43 +00:00
if ( BNC_NBYTESINBUF ( node , childnum ) > 0 ) assert ( toku_fifo_n_entries ( BNC_BUFFER ( node , childnum ) ) > 0 ) ;
//printf("%s:%d %d=push_a_brt_cmd_down=(); child_did_split=%d (weight=%d)\n", __FILE__, __LINE__, r, child_did_split, BNC_NBYTESINBUF(node, childnum));
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
if ( child_did_split ) {
// If the child splits, we don't push down any further.
r = handle_split_of_child ( t , node , childnum ,
2007-07-24 01:32:03 +00:00
childa , childb , & childsplitk ,
2007-07-24 11:13:42 +00:00
did_split , nodea , nodeb , splitk ,
2008-04-07 01:30:25 +00:00
logger ) ;
2007-11-14 17:58:38 +00:00
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//}
2007-07-13 19:37:47 +00:00
return r ; /* Don't do any more pushing if the child splits. */
}
}
2007-07-24 02:36:00 +00:00
if ( 0 ) printf ( " %s:%d done random picking \n " , __FILE__ , __LINE__ ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-19 23:54:17 +00:00
assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(node);
2008-03-05 18:34:32 +00:00
r = toku_unpin_brtnode ( t , child ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
* did_split = 0 ;
return 0 ;
}
2008-05-20 23:47:39 +00:00
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, TOKULOGGER logger)
/* If the serialized size of NODE (a nonleaf) exceeds its nodesize, push buffered
 * commands down into the heaviest child.  Pushing may split the child, which in
 * turn may split NODE.
 *
 * On success returns 0 and sets *did_split to 0 or 1.  When *did_split is 1,
 * *nodea and *nodeb have been filled in by push_some_brt_cmds_down and each of
 * them fits in its nodesize.  A nonzero return is the error reported by
 * push_some_brt_cmds_down. */
{
    assert(node->height > 0);

    if (toku_serialize_brtnode_size(node) <= node->nodesize) {
        /* The node already fits: nothing needs to be pushed. */
        *did_split = 0;
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        return 0;
    }

    /* Find the heaviest child and push stuff to it.  The callee keeps pushing
     * until the buffer runs out or this node has become small enough. */
    int heaviest;
    find_heaviest_child(node, &heaviest);
    assert(BNC_BLOCKNUM(node, heaviest).b != 0);

    int rc = push_some_brt_cmds_down(t, node, heaviest, did_split, nodea, nodeb, splitk, logger);
    if (rc != 0) return rc;

    assert(*did_split == 0 || *did_split == 1);
    if (!*did_split) {
        /* No split: pushing must have brought the node back under its limit. */
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        return 0;
    }

    /* The node split: both halves must fit, have children, and end in a real block. */
    assert(toku_serialize_brtnode_size(*nodea) <= (*nodea)->nodesize);
    assert(toku_serialize_brtnode_size(*nodeb) <= (*nodeb)->nodesize);
    assert((*nodea)->u.n.n_children > 0);
    assert((*nodeb)->u.n.n_children > 0);
    assert(BNC_BLOCKNUM(*nodea, (*nodea)->u.n.n_children - 1).b != 0);
    assert(BNC_BLOCKNUM(*nodeb, (*nodeb)->u.n.n_children - 1).b != 0);
    return 0;
}
2008-04-07 01:30:25 +00:00
int leafval_bessel_le_committed ( u_int32_t klen , void * kval ,
u_int32_t dlen , void * dval ,
struct cmd_leafval_bessel_extra * be ) {
BRT t = be - > t ;
DBT dbt ;
int cmp = t - > compare_fun ( t - > db ,
toku_fill_dbt ( & dbt , kval , klen ) ,
be - > cmd - > u . id . key ) ;
if ( cmp = = 0 & & be - > compare_both_keys & & be - > cmd - > u . id . val - > data ) {
2008-04-02 23:40:36 +00:00
return t - > dup_compare ( t - > db ,
2008-04-07 01:30:25 +00:00
toku_fill_dbt ( & dbt , dval , dlen ) ,
be - > cmd - > u . id . val ) ;
2008-04-02 23:40:36 +00:00
} else {
return cmp ;
}
}
2008-04-07 01:30:25 +00:00
/* Comparison for a leaf entry that carries two values (c* and p*): the
 * p-value is the one compared, via leafval_bessel_le_committed.  The xid and
 * the c-value are ignored. */
int leafval_bessel_le_both(TXNID xid __attribute__((__unused__)),
                           u_int32_t klen, void *kval,
                           u_int32_t clen __attribute__((__unused__)),
                           void *cval __attribute__((__unused__)),
                           u_int32_t plen, void *pval,
                           struct cmd_leafval_bessel_extra *be) {
    int result = leafval_bessel_le_committed(klen, kval, plen, pval, be);
    return result;
}
/* Comparison for a "provdel" leaf entry: the xid is ignored and the entry's
 * (key, c-value) pair is compared via leafval_bessel_le_committed. */
int leafval_bessel_le_provdel(TXNID xid __attribute__((__unused__)),
                              u_int32_t klen, void *kval,
                              u_int32_t clen, void *cval,
                              struct cmd_leafval_bessel_extra *be) {
    int result = leafval_bessel_le_committed(klen, kval, clen, cval, be);
    return result;
}
/* Comparison for a "provpair" leaf entry: the xid is ignored and the entry's
 * (key, p-value) pair is compared via leafval_bessel_le_committed. */
int leafval_bessel_le_provpair(TXNID xid __attribute__((__unused__)),
                               u_int32_t klen, void *kval,
                               u_int32_t plen, void *pval,
                               struct cmd_leafval_bessel_extra *be) {
    int result = leafval_bessel_le_committed(klen, kval, plen, pval, be);
    return result;
}
2008-04-25 13:45:55 +00:00
/* OMT search callback: LEV is a leaf entry and EXTRA is a
 * struct cmd_leafval_bessel_extra.  LESWITCHCALL hands the entry to one of
 * the leafval_bessel_le_* functions together with EXTRA.
 * NOTE(review): presumably LESWITCHCALL picks the variant from the entry's
 * state and returns its result -- confirm against the macro definition. */
int toku_cmd_leafval_bessel(OMTVALUE lev, void *extra) {
    LEAFENTRY entry = lev;
    struct cmd_leafval_bessel_extra *bessel = extra;
    LESWITCHCALL(entry, leafval_bessel, bessel);
}
// Whenever anything provisional is happening, its XID must match the cmd's.

// Apply CMD to a leaf entry described by a single (key, value) pair, passed as
// klen/kval and dlen/dval.  The replacement entry is produced by le_both,
// le_provdel or le_committed, which fill in *newlen, *disksize and *new_data.
// Returns whatever the chosen constructor returns.  BRT_NONE is not a valid
// command here and trips the assert.
static int apply_cmd_to_le_committed(u_int32_t klen, void *kval,
                                     u_int32_t dlen, void *dval,
                                     BRT_CMD cmd,
                                     u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    switch (cmd->type) {
    case BRT_NONE:
        break;
    case BRT_COMMIT_ANY:
    case BRT_COMMIT_BOTH:
    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
        // Commit or abort leaves the existing record as it is: rebuild it unchanged.
        return le_committed(klen, kval, dlen, dval,
                            newlen, disksize, new_data);
    case BRT_DELETE_BOTH:
    case BRT_DELETE_ANY:
        // Keep the existing value and record the delete under the command's xid.
        return le_provdel(cmd->xid,
                          klen, kval,
                          dlen, dval,
                          newlen, disksize, new_data);
    case BRT_INSERT:
        // Keep the existing value and add the command's value under the command's xid.
        return le_both(cmd->xid,
                       klen, kval,
                       dlen, dval,
                       cmd->u.id.val->size, cmd->u.id.val->data,
                       newlen, disksize, new_data);
    }
    assert(0);
    return 0;
}
2008-05-03 12:43:25 +00:00
// Apply CMD to a leaf entry that carries two values: clen/cval and plen/pval,
// the latter written under transaction XID.
// For insert, delete and abort the value kept as the "previous" one depends on
// whether the command comes from that same transaction:
//   - same xid:      the p-value is thrown away and the c-value is kept;
//   - different xid: the p-value is treated as having been committed.
// A commit always keeps the p-value.
// Outputs go to *newlen, *disksize and *new_data via the le_* constructors.
static int apply_cmd_to_le_both(TXNID xid,
                                u_int32_t klen, void *kval,
                                u_int32_t clen, void *cval,
                                u_int32_t plen, void *pval,
                                BRT_CMD cmd,
                                u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    int same_txn = (xid == cmd->xid);
    u_int32_t prev_len = same_txn ? clen : plen;
    void     *prev_val = same_txn ? cval : pval;

    switch (cmd->type) {
    case BRT_NONE:
        break;
    case BRT_COMMIT_ANY:
    case BRT_COMMIT_BOTH:
        // In the future we won't even have these commit messages.
        return le_committed(klen, kval,
                            plen, pval,
                            newlen, disksize, new_data);
    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
        // An abort whose xid does not match is not expected, but is handled anyway.
        return le_committed(klen, kval,
                            prev_len, prev_val,
                            newlen, disksize, new_data);
    case BRT_DELETE_BOTH:
    case BRT_DELETE_ANY:
        return le_provdel(cmd->xid,
                          klen, kval,
                          prev_len, prev_val,
                          newlen, disksize, new_data);
    case BRT_INSERT:
        return le_both(cmd->xid,
                       klen, kval,
                       prev_len, prev_val,
                       cmd->u.id.val->size, cmd->u.id.val->data,
                       newlen, disksize, new_data);
    }
    assert(0);
    return 0;
}
2008-05-03 12:43:25 +00:00
// Apply CMD to a "provdel" leaf entry: a value (clen/cval) with a delete
// recorded under transaction XID.  The value is kept around for rollback.
// Setting *new_data to 0 and returning 0 means "no entry remains".
static int apply_cmd_to_le_provdel(TXNID xid,
                                   u_int32_t klen, void *kval,
                                   u_int32_t clen, void *cval,
                                   BRT_CMD cmd,
                                   u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    switch (cmd->type) {
    case BRT_INSERT:
        if (cmd->xid != xid) {
            // Different transaction: assume the delete took effect, so the
            // entry carries only the newly inserted value.
            return le_provpair(cmd->xid,
                               klen, kval,
                               cmd->u.id.val->size, cmd->u.id.val->data,
                               newlen, disksize, new_data);
        }
        // Same transaction: keep the old value and add the new one.
        return le_both(cmd->xid,
                       klen, kval,
                       clen, cval,
                       cmd->u.id.val->size, cmd->u.id.val->data,
                       newlen, disksize, new_data);
    case BRT_DELETE_BOTH:
    case BRT_DELETE_ANY:
        if (cmd->xid != xid) {
            // The earlier delete is assumed to have taken effect and we are
            // deleting again, so nothing remains.
            *new_data = 0;
            return 0;
        }
        // A delete of a delete could reuse the identical entry, but rebuilding
        // it keeps the caller's free() logic simple.
        return le_provdel(cmd->xid,
                          klen, kval,
                          clen, cval,
                          newlen, disksize, new_data);
    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
        // The xids are expected to match here; the delete is undone.
        return le_committed(klen, kval,
                            clen, cval,
                            newlen, disksize, new_data);
    case BRT_COMMIT_ANY:
    case BRT_COMMIT_BOTH:
        // Committing the delete removes the entry.
        *new_data = 0;
        return 0;
    case BRT_NONE:
        break;
    }
    assert(0);
    return 0;
}
2008-05-03 12:43:25 +00:00
// Apply CMD to a "provpair" leaf entry: a single value (plen/pval) written
// under transaction XID.
// Setting *new_data to 0 and returning 0 means "no entry remains".
static int apply_cmd_to_le_provpair(TXNID xid,
                                    u_int32_t klen, void *kval,
                                    u_int32_t plen, void *pval,
                                    BRT_CMD cmd,
                                    u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    int same_txn = (cmd->xid == xid);

    switch (cmd->type) {
    case BRT_INSERT:
        if (!same_txn) {
            // The old value is treated as committed; keep it next to the new value.
            return le_both(cmd->xid,
                           klen, kval,
                           plen, pval,
                           cmd->u.id.val->size, cmd->u.id.val->data,
                           newlen, disksize, new_data);
        }
        // Same transaction: the entry stays a provpair and the old value is lost.
        return le_provpair(cmd->xid,
                           klen, kval,
                           cmd->u.id.val->size, cmd->u.id.val->data,
                           newlen, disksize, new_data);
    case BRT_DELETE_ANY:
    case BRT_DELETE_BOTH:
        if (!same_txn) {
            // The pair is treated as a committed value that is now being deleted.
            return le_provdel(cmd->xid,
                              klen, kval,
                              plen, pval,
                              newlen, disksize, new_data);
        }
        // Deleting a pair created by the same transaction leaves nothing.
        *new_data = 0;
        return 0;
    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
        // Aborting the pair leaves nothing.
        *new_data = 0;
        return 0;
    case BRT_COMMIT_BOTH:
    case BRT_COMMIT_ANY:
        return le_committed(klen, kval,
                            plen, pval,
                            newlen, disksize, new_data);
    case BRT_NONE:
        break;
    }
    assert(0);
    return 0;
}
// Compute the leaf entry that results from applying CMD to STORED_DATA.
//   stored_data: the existing leaf entry, or NULL if there was no stored data.
//   newlen, disksize, new_data: outputs filled in by the le_* constructors.
//     *new_data is set to 0 when no entry should exist afterwards.
// With existing data the work is delegated through LESWITCHCALL to one of the
// apply_cmd_to_le_* functions.  Without existing data only an insert creates
// an entry; deletes, aborts and commits have nothing to act on.
static int apply_cmd_to_leaf(BRT_CMD cmd,
                             void *stored_data, // NULL if there was no stored data.
                             u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    if (stored_data != 0) {
        LESWITCHCALL(stored_data, apply_cmd_to, cmd,
                     newlen, disksize, new_data);
    } else {
        switch (cmd->type) {
        case BRT_NONE:
            break;
        case BRT_COMMIT_ANY:
        case BRT_COMMIT_BOTH:
        case BRT_ABORT_ANY:
        case BRT_ABORT_BOTH:
        case BRT_DELETE_ANY:
        case BRT_DELETE_BOTH:
            *new_data = 0;
            return 0; // Don't have to insert anything.
        case BRT_INSERT:
            {
                // *new_data is only written when the entry was built successfully.
                LEAFENTRY fresh;
                int rc = le_provpair(cmd->xid,
                                     cmd->u.id.key->size, cmd->u.id.key->data,
                                     cmd->u.id.val->size, cmd->u.id.val->data,
                                     newlen, disksize, &fresh);
                if (rc == 0) *new_data = fresh;
                return rc;
            }
        }
        assert(0);
        return 0;
    }
}
// Decide whether locating the target of CMD in a leaf must compare the value
// as well as the key.
//   *_BOTH commands: always (returns 1).
//   *_ANY commands:  never (returns 0).
//   BRT_INSERT:      only when NODE has TOKU_DB_DUPSORT set (returns the masked flag bits).
// BRT_NONE is not a valid command here and trips the assert.
int should_compare_both_keys(BRTNODE node, BRT_CMD cmd) {
    switch (cmd->type) {
    case BRT_NONE:
        break;
    case BRT_COMMIT_ANY:
    case BRT_ABORT_ANY:
    case BRT_DELETE_ANY:
        return 0;
    case BRT_COMMIT_BOTH:
    case BRT_ABORT_BOTH:
    case BRT_DELETE_BOTH:
        return 1;
    case BRT_INSERT:
        return node->flags & TOKU_DB_DUPSORT;
    }
    assert(0);
    return 0;
}
// Apply CMD to one slot of a leaf node.
//   idx: position in node->u.l.buffer (the leaf's OMT) that the command affects.
//   le:  the entry currently stored at idx, or NULL if there is none (in which
//        case a new entry, if any, is inserted at idx).
// The replacement entry is computed by apply_cmd_to_leaf into a heap buffer
// (newdata), copied into the node's mempool, and the OMT, byte count and
// fingerprint are updated to match.  Changes are logged unless the command's
// xid is the transaction that created the tree.
// Returns 0 on success, or the first error from apply_cmd_to_leaf, the logger
// or the OMT.  newdata is released on every path, including the error paths
// that previously returned without freeing it.
static int brt_leaf_apply_cmd_once(BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
                                   u_int32_t idx, LEAFENTRY le) {
    FILENUM filenum = toku_cachefile_filenum(t->cf);
    u_int32_t newlen = 0, newdisksize = 0;
    LEAFENTRY newdata = 0;
    int r = apply_cmd_to_leaf(cmd, le, &newlen, &newdisksize, &newdata);
    if (r != 0) return r;
    if (newdata) assert(newdisksize == leafentry_disksize(newdata));

    if (le && newdata) {
        // Replace an existing entry in place.
        if (t->txn_that_created != cmd->xid) {
            r = toku_log_deleteleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx);
            if (r != 0) goto free_newdata;
            r = toku_log_insertleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx, newdata);
            if (r != 0) goto free_newdata;
        }

        node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le);
        node->local_fingerprint     -= node->rand4fingerprint * toku_le_crc(le);

        // Compute the old size before the mempool malloc: the malloc may
        // compress the mempool, after which the le pointer is no good.
        u_int32_t size = leafentry_memsize(le);

        LEAFENTRY new_le = mempool_malloc_from_omt(node->u.l.buffer, &node->u.l.buffer_mempool, newlen);
        assert(new_le);
        memcpy(new_le, newdata, newlen);

        // This mfree must occur after the mempool_malloc so that when the
        // mempool is compressed everything is accounted for.
        toku_mempool_mfree(&node->u.l.buffer_mempool, 0, size); // Must pass 0, since le may be no good any more.

        node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + newdisksize;
        node->local_fingerprint     += node->rand4fingerprint * toku_le_crc(newdata);
        toku_free(newdata);
        if ((r = toku_omt_set_at(node->u.l.buffer, new_le, idx))) return r;
    } else {
        if (le) {
            // It's there, note that it's gone and remove it from the mempool.
            // (newdata is NULL on this path, so there is nothing to free.)
            if (t->txn_that_created != cmd->xid) {
                if ((r = toku_log_deleteleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx))) return r;
            }
            if ((r = toku_omt_delete_at(node->u.l.buffer, idx))) return r;
            node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le);
            node->local_fingerprint     -= node->rand4fingerprint * toku_le_crc(le);
            toku_mempool_mfree(&node->u.l.buffer_mempool, 0, leafentry_memsize(le)); // Must pass 0, since le may be no good any more.
        }
        if (newdata) {
            // Nothing was stored before: insert the new entry at idx.
            LEAFENTRY new_le = mempool_malloc_from_omt(node->u.l.buffer, &node->u.l.buffer_mempool, newlen);
            assert(new_le);
            memcpy(new_le, newdata, newlen);
            r = toku_omt_insert_at(node->u.l.buffer, new_le, idx);
            if (r != 0) goto free_newdata;
            if (t->txn_that_created != cmd->xid) {
                r = toku_log_insertleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx, newdata);
                if (r != 0) goto free_newdata;
            }
            node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + newdisksize;
            node->local_fingerprint     += node->rand4fingerprint * toku_le_crc(newdata);
            toku_free(newdata);
        }
    }
    return 0;

 free_newdata:
    // Error exit taken only while newdata is still owned by this function.
    toku_free(newdata);
    return r;
}
2008-02-04 15:04:22 +00:00
static int brt_leaf_put_cmd ( BRT t , BRTNODE node , BRT_CMD cmd ,
2007-09-28 17:11:22 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk ,
2008-02-08 19:54:00 +00:00
TOKULOGGER logger ) {
2007-11-20 00:32:25 +00:00
// toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
2008-04-07 01:30:25 +00:00
VERIFY_NODE ( node ) ;
2008-01-24 13:51:34 +00:00
assert ( node - > height = = 0 ) ;
2007-12-04 10:02:59 +00:00
FILENUM filenum = toku_cachefile_filenum ( t - > cf ) ;
2008-04-07 01:30:25 +00:00
2008-04-22 20:39:50 +00:00
LEAFENTRY storeddata ;
2008-06-02 20:52:12 +00:00
OMTVALUE storeddatav = NULL ;
2008-04-25 13:45:55 +00:00
2008-04-07 01:30:25 +00:00
u_int32_t idx ;
int r ;
int compare_both = should_compare_both_keys ( node , cmd ) ;
struct cmd_leafval_bessel_extra be = { t , cmd , compare_both } ;
2008-05-15 21:27:35 +00:00
//static int counter=0;
//counter++;
//printf("counter=%d\n", counter);
2008-04-07 01:30:25 +00:00
switch ( cmd - > type ) {
case BRT_INSERT :
2008-06-18 10:02:06 +00:00
if ( node - > u . l . seqinsert ) {
idx = toku_omt_size ( node - > u . l . buffer ) ;
r = toku_omt_fetch ( node - > u . l . buffer , idx - 1 , & storeddatav , NULL ) ;
if ( r ! = 0 ) goto fz ;
storeddata = storeddatav ;
int cmp = toku_cmd_leafval_bessel ( storeddata , & be ) ;
if ( cmp > = 0 ) goto fz ;
r = DB_NOTFOUND ;
} else {
fz :
r = toku_omt_find_zero ( node - > u . l . buffer , toku_cmd_leafval_bessel , & be ,
& storeddatav , & idx , NULL ) ;
}
2008-04-07 01:30:25 +00:00
if ( r = = DB_NOTFOUND ) {
storeddata = 0 ;
2008-04-25 13:45:55 +00:00
} else if ( r ! = 0 ) {
2008-04-07 01:30:25 +00:00
return r ;
2008-04-25 13:45:55 +00:00
} else {
storeddata = storeddatav ;
}
2008-04-07 01:30:25 +00:00
2008-04-22 20:39:50 +00:00
r = brt_leaf_apply_cmd_once ( t , node , cmd , logger , idx , storeddata ) ;
2008-04-07 01:30:25 +00:00
if ( r ! = 0 ) return r ;
2008-07-13 00:04:22 +00:00
// if the insertion point is within a window of the right edge of
// the leaf then it is sequential
// window = min(32, number of leaf entries/16)
u_int32_t s = toku_omt_size ( node - > u . l . buffer ) ;
u_int32_t w = s / 16 ;
if ( w = = 0 ) w = 1 ;
if ( w > 32 ) w = 32 ;
// within the window?
if ( s - idx < = w ) {
2008-06-18 10:02:06 +00:00
node - > u . l . seqinsert + = 1 ;
2008-07-13 00:04:22 +00:00
} else {
node - > u . l . seqinsert = 0 ;
}
2008-04-07 01:30:25 +00:00
break ;
case BRT_DELETE_BOTH :
case BRT_ABORT_BOTH :
case BRT_COMMIT_BOTH :
// Delete the one item
2008-04-22 20:39:50 +00:00
r = toku_omt_find_zero ( node - > u . l . buffer , toku_cmd_leafval_bessel , & be ,
2008-05-30 20:41:12 +00:00
& storeddatav , & idx , NULL ) ;
2008-04-07 01:30:25 +00:00
if ( r = = DB_NOTFOUND ) break ;
if ( r ! = 0 ) return r ;
2008-04-25 13:45:55 +00:00
storeddata = storeddatav ;
2008-04-07 01:30:25 +00:00
VERIFY_NODE ( node ) ;
static int count = 0 ;
count + + ;
2008-04-22 20:39:50 +00:00
r = brt_leaf_apply_cmd_once ( t , node , cmd , logger , idx , storeddata ) ;
2008-04-07 01:30:25 +00:00
if ( r ! = 0 ) return r ;
VERIFY_NODE ( node ) ;
break ;
case BRT_DELETE_ANY :
case BRT_ABORT_ANY :
case BRT_COMMIT_ANY :
// Delete all the matches
2008-04-22 20:39:50 +00:00
r = toku_omt_find_zero ( node - > u . l . buffer , toku_cmd_leafval_bessel , & be ,
2008-05-30 20:41:12 +00:00
& storeddatav , & idx , NULL ) ;
2008-04-07 01:30:25 +00:00
if ( r = = DB_NOTFOUND ) break ;
if ( r ! = 0 ) return r ;
2008-04-25 13:45:55 +00:00
storeddata = storeddatav ;
2008-04-07 01:30:25 +00:00
while ( 1 ) {
int vallen = le_any_vallen ( storeddata ) ;
2008-04-08 02:09:19 +00:00
void * save_val = toku_memdup ( le_any_val ( storeddata ) , vallen ) ;
2008-04-07 01:30:25 +00:00
2008-04-22 20:39:50 +00:00
r = brt_leaf_apply_cmd_once ( t , node , cmd , logger , idx , storeddata ) ;
2008-04-02 23:40:36 +00:00
if ( r ! = 0 ) return r ;
2008-04-07 01:30:25 +00:00
// Now we must find the next one.
DBT valdbt ;
BRT_CMD_S ncmd = { cmd - > type , cmd - > xid , . u . id = { cmd - > u . id . key , toku_fill_dbt ( & valdbt , save_val , vallen ) } } ;
struct cmd_leafval_bessel_extra nbe = { t , & ncmd , 1 } ;
2008-04-22 20:39:50 +00:00
r = toku_omt_find ( node - > u . l . buffer , toku_cmd_leafval_bessel , & nbe , + 1 ,
2008-05-30 20:41:12 +00:00
& storeddatav , & idx , NULL ) ;
2008-04-07 01:30:25 +00:00
toku_free ( save_val ) ;
if ( r ! = 0 ) break ;
2008-04-25 13:45:55 +00:00
storeddata = storeddatav ;
2008-04-07 01:30:25 +00:00
{ // Continue only if the next record that we found has the same key.
DBT adbt ;
if ( t - > compare_fun ( t - > db ,
toku_fill_dbt ( & adbt , le_any_key ( storeddata ) , le_any_keylen ( storeddata ) ) ,
cmd - > u . id . key ) ! = 0 )
break ;
}
2008-04-02 23:40:36 +00:00
}
2008-04-07 01:30:25 +00:00
break ;
case BRT_NONE : return EINVAL ;
}
/// All done doing the work
node - > dirty = 1 ;
2007-11-14 17:58:38 +00:00
2007-11-20 00:32:25 +00:00
// toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
2007-11-14 17:58:38 +00:00
2008-04-07 01:30:25 +00:00
VERIFY_NODE ( node ) ;
// If it doesn't fit, then split the leaf.
if ( toku_serialize_brtnode_size ( node ) > node - > nodesize ) {
r = brtleaf_split ( logger , filenum , t , node , nodea , nodeb , splitk ) ;
if ( r ! = 0 ) return r ;
//printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey);
split_count + + ;
* did_split = 1 ;
assert ( toku_serialize_brtnode_size ( * nodea ) < = ( * nodea ) - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( * nodeb ) < = ( * nodeb ) - > nodesize ) ;
VERIFY_NODE ( * nodea ) ;
VERIFY_NODE ( * nodeb ) ;
2008-01-02 20:33:51 +00:00
} else {
2008-04-07 01:30:25 +00:00
* did_split = 0 ;
2008-01-02 20:33:51 +00:00
}
2008-04-07 01:30:25 +00:00
return 0 ;
2007-07-13 19:37:47 +00:00
}
2007-11-19 00:46:09 +00:00
/* find the leftmost child that may contain the key */
2008-03-05 18:34:32 +00:00
unsigned int toku_brtnode_which_child ( BRTNODE node , DBT * k , DBT * d , BRT t ) {
2007-07-24 01:32:03 +00:00
int i ;
assert ( node - > height > 0 ) ;
2008-06-16 01:26:51 +00:00
# define DO_PIVOT_SEARCH_LR 0
# if DO_PIVOT_SEARCH_LR
2007-07-24 01:32:03 +00:00
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) {
2007-12-06 13:52:52 +00:00
int cmp = brt_compare_pivot ( t , k , d , node - > u . n . childkeys [ i ] ) ;
2007-11-19 00:46:09 +00:00
if ( cmp > 0 ) continue ;
if ( cmp < 0 ) return i ;
return i ;
2007-07-24 01:32:03 +00:00
}
return node - > u . n . n_children - 1 ;
2008-06-16 01:26:51 +00:00
# else
// give preference for appending to the dictionary. no change for
// random keys
for ( i = node - > u . n . n_children - 2 ; i > = 0 ; i - - ) {
int cmp = brt_compare_pivot ( t , k , d , node - > u . n . childkeys [ i ] ) ;
if ( cmp > 0 ) return i + 1 ;
}
return 0 ;
# endif
2007-07-24 01:32:03 +00:00
}
2008-01-08 21:03:17 +00:00
/* put a cmd into a nodes child */
2008-02-04 15:04:22 +00:00
static int brt_nonleaf_put_cmd_child_node ( BRT t , BRTNODE node , BRT_CMD cmd ,
2008-01-08 21:03:17 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk ,
2008-05-20 23:47:39 +00:00
TOKULOGGER logger , int childnum , int maybe ) {
2007-10-02 16:17:44 +00:00
int r ;
void * child_v ;
BRTNODE child ;
int child_did_split ;
BRTNODE childa , childb ;
DBT childsplitk ;
* did_split = 0 ;
2013-04-16 23:57:18 -04:00
BLOCKNUM childblocknum = BNC_BLOCKNUM ( node , childnum ) ;
2008-06-18 00:30:36 +00:00
u_int32_t fullhash = compute_child_fullhash ( t - > cf , node , childnum ) ;
2007-10-02 16:17:44 +00:00
if ( maybe )
2013-04-16 23:57:18 -04:00
r = toku_cachetable_maybe_get_and_pin ( t - > cf , childblocknum , fullhash , & child_v ) ;
2007-10-02 16:17:44 +00:00
else
2013-04-16 23:57:18 -04:00
r = toku_cachetable_get_and_pin ( t - > cf , childblocknum , fullhash , & child_v , NULL ,
2013-04-16 23:57:18 -04:00
toku_brtnode_flush_callback , toku_brtnode_fetch_callback , t - > h ) ;
2007-10-02 16:17:44 +00:00
if ( r ! = 0 )
return r ;
child = child_v ;
child_did_split = 0 ;
r = brtnode_put_cmd ( t , child , cmd ,
2008-05-20 23:47:39 +00:00
& child_did_split , & childa , & childb , & childsplitk , logger ) ;
2007-10-02 19:19:44 +00:00
if ( r ! = 0 ) {
/* putting to the child failed for some reason, so unpin the child and return the error code */
2008-03-05 18:34:32 +00:00
int rr = toku_unpin_brtnode ( t , child ) ;
2007-10-02 19:19:44 +00:00
assert ( rr = = 0 ) ;
return r ;
}
2007-10-02 16:17:44 +00:00
if ( child_did_split ) {
if ( 0 ) printf ( " brt_nonleaf_insert child_split %p \n " , child ) ;
r = handle_split_of_child ( t , node , childnum ,
childa , childb , & childsplitk ,
did_split , nodea , nodeb , splitk ,
2008-04-07 01:30:25 +00:00
logger ) ;
2007-10-02 16:17:44 +00:00
assert ( r = = 0 ) ;
} else {
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(child);
2008-02-08 19:54:00 +00:00
fixup_child_fingerprint ( node , childnum , child , t , logger ) ;
2008-03-05 18:34:32 +00:00
int rr = toku_unpin_brtnode ( t , child ) ;
2007-10-02 19:19:44 +00:00
assert ( rr = = 0 ) ;
2007-10-02 16:17:44 +00:00
}
return r ;
}
2007-07-13 19:37:47 +00:00
2007-11-28 19:00:21 +00:00
/* When nonzero, brt_nonleaf_put_cmd_child first tries to apply a command
 * directly to the child node (if that child's buffer is empty and the caller
 * permits pushing) before falling back to buffering it.  When zero, commands
 * are always buffered. */
int toku_brt_do_push_cmd = 1 ;
2007-10-03 14:51:23 +00:00
2008-01-08 21:03:17 +00:00
/* put a cmd into a node at childnum */
2008-02-04 15:04:22 +00:00
static int brt_nonleaf_put_cmd_child ( BRT t , BRTNODE node , BRT_CMD cmd ,
2008-01-08 21:03:17 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk ,
2008-05-20 23:47:39 +00:00
TOKULOGGER logger , unsigned int childnum , int can_push , int * do_push_down ) {
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(node);
2007-08-23 18:07:18 +00:00
2008-01-08 21:03:17 +00:00
/* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */
2008-01-31 22:05:43 +00:00
if ( BNC_NBYTESINBUF ( node , childnum ) = = 0 & & can_push & & toku_brt_do_push_cmd ) {
2008-05-20 23:47:39 +00:00
int r = brt_nonleaf_put_cmd_child_node ( t , node , cmd , did_split , nodea , nodeb , splitk , logger , childnum , 1 ) ;
2008-01-08 21:03:17 +00:00
if ( r = = 0 )
2007-11-19 00:46:09 +00:00
return r ;
}
//verify_local_fingerprint_nonleaf(node);
2008-01-08 21:03:17 +00:00
/* append the cmd to the child buffer */
2007-11-19 00:46:09 +00:00
{
2008-01-08 21:03:17 +00:00
int type = cmd - > type ;
DBT * k = cmd - > u . id . key ;
DBT * v = cmd - > u . id . val ;
2008-04-07 01:30:25 +00:00
int r = log_and_save_brtenq ( logger , t , node , childnum , cmd - > xid , type , k - > data , k - > size , v - > data , v - > size , & node - > local_fingerprint ) ;
2008-03-19 22:42:46 +00:00
if ( r ! = 0 ) return r ;
2007-11-19 00:46:09 +00:00
int diff = k - > size + v - > size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ;
2008-03-17 18:56:12 +00:00
r = toku_fifo_enq ( BNC_BUFFER ( node , childnum ) , k - > data , k - > size , v - > data , v - > size , type , cmd - > xid ) ;
2007-11-19 00:46:09 +00:00
assert ( r = = 0 ) ;
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffers + = diff ;
2008-01-31 22:05:43 +00:00
BNC_NBYTESINBUF ( node , childnum ) + = diff ;
2007-11-19 00:46:09 +00:00
node - > dirty = 1 ;
}
2008-01-08 21:03:17 +00:00
* do_push_down = 1 ;
2007-11-19 00:46:09 +00:00
return 0 ;
}
2008-04-07 01:30:25 +00:00
static int brt_nonleaf_cmd_once ( BRT t , BRTNODE node , BRT_CMD cmd ,
int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk ,
2008-05-20 23:47:39 +00:00
TOKULOGGER logger ) {
2007-11-19 00:46:09 +00:00
//verify_local_fingerprint_nonleaf(node);
2008-01-08 21:03:17 +00:00
unsigned int childnum ;
int r ;
2007-11-19 00:46:09 +00:00
2008-01-08 21:03:17 +00:00
/* find the right subtree */
2008-03-05 18:34:32 +00:00
childnum = toku_brtnode_which_child ( node , cmd - > u . id . key , cmd - > u . id . val , t ) ;
2008-01-08 21:03:17 +00:00
/* put the cmd in the subtree */
int do_push_down = 0 ;
2008-05-20 23:47:39 +00:00
r = brt_nonleaf_put_cmd_child ( t , node , cmd , did_split , nodea , nodeb , splitk , logger , childnum , 1 , & do_push_down ) ;
2008-01-08 21:03:17 +00:00
if ( r ! = 0 ) return r ;
/* maybe push down */
if ( do_push_down ) {
2008-05-20 23:47:39 +00:00
//if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
2008-01-08 21:03:17 +00:00
//verify_local_fingerprint_nonleaf(node);
2008-05-20 23:47:39 +00:00
r = brtnode_maybe_push_down ( t , node , did_split , nodea , nodeb , splitk , logger ) ;
2008-01-08 21:03:17 +00:00
if ( r ! = 0 ) return r ;
2008-05-20 23:47:39 +00:00
//if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
2008-01-08 21:03:17 +00:00
if ( * did_split ) {
assert ( toku_serialize_brtnode_size ( * nodea ) < = ( * nodea ) - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( * nodeb ) < = ( * nodeb ) - > nodesize ) ;
assert ( ( * nodea ) - > u . n . n_children > 0 ) ;
assert ( ( * nodeb ) - > u . n . n_children > 0 ) ;
2013-04-16 23:57:18 -04:00
assert ( BNC_BLOCKNUM ( * nodea , ( * nodea ) - > u . n . n_children - 1 ) . b ! = 0 ) ;
assert ( BNC_BLOCKNUM ( * nodeb , ( * nodeb ) - > u . n . n_children - 1 ) . b ! = 0 ) ;
2008-01-08 21:03:17 +00:00
} else {
assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
}
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
2007-07-13 19:37:47 +00:00
}
return 0 ;
}
2007-11-19 00:46:09 +00:00
/* delete in all subtrees starting from the left most one which contains the key */
2008-04-07 01:30:25 +00:00
static int brt_nonleaf_cmd_many ( BRT t , BRTNODE node , BRT_CMD cmd ,
int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk ,
TOKULOGGER logger ) {
2007-11-19 00:46:09 +00:00
int r ;
2008-04-07 01:30:25 +00:00
/* find all children that need a copy of the command */
int sendchild [ TREE_FANOUT ] , delidx = 0 ;
2013-04-16 23:57:19 -04:00
# define sendchild_append(i) \
if ( delidx = = 0 | | sendchild [ delidx - 1 ] ! = i ) sendchild [ delidx + + ] = i ;
2007-11-19 00:46:09 +00:00
int i ;
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) {
2007-12-06 13:52:52 +00:00
int cmp = brt_compare_pivot ( t , cmd - > u . id . key , 0 , node - > u . n . childkeys [ i ] ) ;
2007-11-19 00:46:09 +00:00
if ( cmp > 0 ) {
continue ;
} else if ( cmp < 0 ) {
2008-04-07 01:30:25 +00:00
sendchild_append ( i ) ;
2007-11-19 00:46:09 +00:00
break ;
2007-11-19 20:22:56 +00:00
} else if ( t - > flags & TOKU_DB_DUPSORT ) {
2008-04-07 01:30:25 +00:00
sendchild_append ( i ) ;
sendchild_append ( i + 1 ) ;
2007-11-19 00:46:09 +00:00
} else {
2008-04-07 01:30:25 +00:00
sendchild_append ( i ) ;
2007-11-19 00:46:09 +00:00
break ;
}
}
if ( delidx = = 0 )
2008-04-07 01:30:25 +00:00
sendchild_append ( node - > u . n . n_children - 1 ) ;
2007-11-19 00:46:09 +00:00
2008-04-07 01:30:25 +00:00
/* issue the to all of the children found previously */
2008-01-08 21:03:17 +00:00
int do_push_down = 0 ;
2007-11-19 00:46:09 +00:00
for ( i = 0 ; i < delidx ; i + + ) {
2008-05-20 23:47:39 +00:00
r = brt_nonleaf_put_cmd_child ( t , node , cmd , did_split , nodea , nodeb , splitk , logger , sendchild [ i ] , delidx = = 1 , & do_push_down ) ;
2007-11-19 00:46:09 +00:00
assert ( r = = 0 ) ;
}
2008-01-08 21:03:17 +00:00
if ( do_push_down ) {
/* maybe push down */
//verify_local_fingerprint_nonleaf(node);
2008-05-20 23:47:39 +00:00
r = brtnode_maybe_push_down ( t , node , did_split , nodea , nodeb , splitk , logger ) ;
2008-01-08 21:03:17 +00:00
if ( r ! = 0 ) return r ;
if ( * did_split ) {
assert ( toku_serialize_brtnode_size ( * nodea ) < = ( * nodea ) - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( * nodeb ) < = ( * nodeb ) - > nodesize ) ;
assert ( ( * nodea ) - > u . n . n_children > 0 ) ;
assert ( ( * nodeb ) - > u . n . n_children > 0 ) ;
2013-04-16 23:57:18 -04:00
assert ( BNC_BLOCKNUM ( * nodea , ( * nodea ) - > u . n . n_children - 1 ) . b ! = 0 ) ;
assert ( BNC_BLOCKNUM ( * nodeb , ( * nodeb ) - > u . n . n_children - 1 ) . b ! = 0 ) ;
2008-01-08 21:03:17 +00:00
} else {
assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
}
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
2007-11-19 00:46:09 +00:00
}
return 0 ;
}
2008-02-04 15:04:22 +00:00
static int brt_nonleaf_put_cmd ( BRT t , BRTNODE node , BRT_CMD cmd ,
2007-11-19 00:46:09 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb ,
DBT * splitk ,
2008-04-07 01:30:25 +00:00
TOKULOGGER logger ) {
switch ( cmd - > type ) {
case BRT_INSERT :
case BRT_DELETE_BOTH :
case BRT_ABORT_BOTH :
case BRT_COMMIT_BOTH :
do_once :
2008-05-20 23:47:39 +00:00
return brt_nonleaf_cmd_once ( t , node , cmd , did_split , nodea , nodeb , splitk , logger ) ;
2008-04-07 01:30:25 +00:00
case BRT_DELETE_ANY :
case BRT_ABORT_ANY :
case BRT_COMMIT_ANY :
if ( 0 = = ( node - > flags & TOKU_DB_DUPSORT ) ) goto do_once ; // nondupsort delete_any is just do once.
2008-05-20 23:47:39 +00:00
return brt_nonleaf_cmd_many ( t , node , cmd , did_split , nodea , nodeb , splitk , logger ) ;
2008-04-07 01:30:25 +00:00
case BRT_NONE :
break ;
}
return EINVAL ;
2007-11-19 00:46:09 +00:00
}
2007-07-13 19:37:47 +00:00
2008-01-29 21:43:08 +00:00
/* Debugging check for a nonleaf node: recompute the fingerprint of every command
 * buffered for every child and assert that the sum equals node->local_fingerprint.
 * Leaf nodes (height 0) are not checked. */
static void verify_local_fingerprint_nonleaf (BRTNODE node) {
    if (node->height == 0) {
        return;
    }

    u_int32_t recomputed = 0;
    int childnum;
    for (childnum = 0; childnum < node->u.n.n_children; childnum++) {
        FIFO_ITERATE(BNC_BUFFER(node, childnum), key, keylen, data, datalen, type, xid,
                     ({
                         recomputed += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
                     }));
    }
    assert(recomputed == node->local_fingerprint);
}
2007-11-14 17:58:38 +00:00
2008-02-04 15:04:22 +00:00
static int brtnode_put_cmd ( BRT t , BRTNODE node , BRT_CMD cmd ,
2007-09-28 17:11:22 +00:00
int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk ,
2008-04-07 01:30:25 +00:00
TOKULOGGER logger ) {
2007-11-14 17:58:38 +00:00
//static int counter=0; // FOO
//static int oldcounter=0;
//int tmpcounter;
//u_int32_t oldfingerprint=node->local_fingerprint;
int r ;
//counter++; tmpcounter=counter;
2007-07-13 19:37:47 +00:00
if ( node - > height = = 0 ) {
2007-11-20 00:32:25 +00:00
// toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
2007-11-14 17:58:38 +00:00
r = brt_leaf_put_cmd ( t , node , cmd ,
2008-02-08 19:54:00 +00:00
did_split , nodea , nodeb , splitk ,
2008-05-20 23:47:39 +00:00
logger ) ;
2007-07-13 19:37:47 +00:00
} else {
2007-11-14 17:58:38 +00:00
r = brt_nonleaf_put_cmd ( t , node , cmd ,
2008-02-08 19:54:00 +00:00
did_split , nodea , nodeb , splitk ,
2008-05-20 23:47:39 +00:00
logger ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-14 17:58:38 +00:00
//oldcounter=tmpcounter;
// Watch out. If did_split then the original node is no longer allocated.
if ( * did_split ) {
2007-11-19 23:54:17 +00:00
assert ( toku_serialize_brtnode_size ( * nodea ) < = ( * nodea ) - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( * nodeb ) < = ( * nodeb ) - > nodesize ) ;
2007-11-14 17:58:38 +00:00
// if ((*nodea)->height==0) {
2007-11-20 00:32:25 +00:00
// toku_pma_verify_fingerprint((*nodea)->u.l.buffer, (*nodea)->rand4fingerprint, (*nodea)->subtree_fingerprint);
// toku_pma_verify_fingerprint((*nodeb)->u.l.buffer, (*nodeb)->rand4fingerprint, (*nodeb)->subtree_fingerprint);
2007-11-14 17:58:38 +00:00
// }
} else {
2007-11-19 23:54:17 +00:00
assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
2007-11-14 17:58:38 +00:00
// if (node->height==0) {
2007-11-20 00:32:25 +00:00
// toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->local_fingerprint);
2007-11-14 17:58:38 +00:00
// } else {
// verify_local_fingerprint_nonleaf(node);
// }
}
//if (node->local_fingerprint==3522421844U) {
// if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
// }
return r ;
2007-07-13 19:37:47 +00:00
}
2007-11-29 14:44:03 +00:00
/* Create a cachetable for use by BRTs.
 * A CACHESIZE of 0 selects the default of 128 MiB.
 * Returns the result of toku_create_cachetable(). */
int toku_brt_create_cachetable (CACHETABLE *ct, long cachesize, LSN initial_lsn, TOKULOGGER logger) {
    const long default_cachesize = 128*1024*1024;
    long effective_size = (cachesize == 0) ? default_cachesize : cachesize;
    return toku_create_cachetable(ct, effective_size, initial_lsn, logger);
}
2013-04-16 23:57:18 -04:00
/* Create an empty root node (height 0) for tree T at BLOCKNUM, put it into the
 * cachetable, log its creation, and unpin it.
 *
 * Returns 0 on success, or the error from toku_cachetable_put() or
 * toku_unpin_brtnode().
 *
 * Ownership: if toku_cachetable_put() fails the node struct is freed here.  Once
 * the put has succeeded the cachetable holds the node pointer (together with the
 * flush callback), so the node is NOT freed here if the later unpin fails --
 * freeing it would leave the cachetable with a dangling pointer.
 */
static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum, TOKULOGGER logger) {
    int r;
    TAGMALLOC(BRTNODE, node);
    assert(node);
    node->ever_been_written = 0;
    initialize_brtnode(t, node, blocknum, 0);

    u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum);
    node->fullhash = fullhash;
    r = toku_cachetable_put(t->cf, blocknum, fullhash,
                            node, brtnode_memory_size(node),
                            toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t->h);
    if (r != 0) {
        // NOTE(review): only the node struct is released; anything initialize_brtnode allocated
        // inside it is presumably leaked on this path -- confirm.
        toku_free(node);
        return r;
    }

    // NOTE(review): the result of toku_log_newbrtnode is ignored here, as it was before.
    toku_log_newbrtnode(logger, &node->log_lsn, 0, toku_cachefile_filenum(t->cf), blocknum, 0, t->h->nodesize, (t->flags & TOKU_DB_DUPSORT) != 0, node->rand4fingerprint);

    // The cachetable owns the node from here on; just report whether the unpin worked.
    r = toku_unpin_brtnode(t, node);
    return r;
}
2007-07-20 14:20:58 +00:00
/* Tracing hook.  Define BRT_TRACE (for example by uncommenting the line below) to make
 * WHEN_BRTTRACE(x) expand to x; otherwise it expands to a no-op expression. */
//#define BRT_TRACE
#ifndef BRT_TRACE
#define WHEN_BRTTRACE(x) ((void)0)
#else
#define WHEN_BRTTRACE(x) x
#endif
2007-11-29 14:44:03 +00:00
/* Allocate a BRT handle with default settings and store it in *brt_ptr.
 *
 * Defaults: all fields zeroed, empty cursor list, flags 0 (and not marked as
 * explicitly set), node size BRT_DEFAULT_NODE_SIZE, and toku_default_compare_fun
 * for both key and duplicate comparison.
 *
 * Returns 0 on success, ENOMEM if the handle cannot be allocated, or the error
 * from toku_omt_create().  *brt_ptr is written only on success.
 */
int toku_brt_create (BRT *brt_ptr) {
    BRT fresh = toku_malloc(sizeof *fresh);
    if (fresh == 0) {
        return ENOMEM;
    }
    memset(fresh, 0, sizeof *fresh);

    fresh->nodesize      = BRT_DEFAULT_NODE_SIZE;
    fresh->compare_fun   = toku_default_compare_fun;
    fresh->dup_compare   = toku_default_compare_fun;
    fresh->flags         = 0;
    fresh->did_set_flags = 0;
    list_init(&fresh->cursors);

    int r = toku_omt_create(&fresh->txns);
    if (r == 0) {
        *brt_ptr = fresh;
    } else {
        toku_free(fresh);
    }
    return r;
}
2007-11-29 19:32:53 +00:00
/* Store FLAGS in the handle and record that the flags were set explicitly
 * (toku_brt_open uses did_set_flags to decide whether to inherit or verify flags).
 * Always returns 0. */
int toku_brt_set_flags (BRT brt, unsigned int flags) {
    brt->flags = flags;
    brt->did_set_flags = 1;
    return 0;
}
2007-11-29 19:32:53 +00:00
/* Copy the handle's current flags into *flags.  Always returns 0. */
int toku_brt_get_flags (BRT brt, unsigned int *flags) {
    unsigned int current_flags = brt->flags;
    *flags = current_flags;
    return 0;
}
2007-11-29 19:32:53 +00:00
/* Set the node size stored in the handle.  No validation is done here
 * (toku_brt_open asserts that the node size is positive).  Always returns 0. */
int toku_brt_set_nodesize (BRT brt, unsigned int nodesize) {
    brt->nodesize = nodesize;
    return 0;
}
2007-11-29 19:32:53 +00:00
/* Copy the handle's node size into *nodesize.  Always returns 0. */
int toku_brt_get_nodesize (BRT brt, unsigned int *nodesize) {
    unsigned int current_size = brt->nodesize;
    *nodesize = current_size;
    return 0;
}
2007-11-29 15:09:14 +00:00
/* Install BT_COMPARE as the handle's key comparison function.  Always returns 0. */
int toku_brt_set_bt_compare (BRT brt, int (*bt_compare)(DB *, const DBT *, const DBT *)) {
    brt->compare_fun = bt_compare;
    return 0;
}
2007-11-29 15:09:14 +00:00
/* Install DUP_COMPARE as the handle's duplicate comparison function.  Always returns 0. */
int toku_brt_set_dup_compare (BRT brt, int (*dup_compare)(DB *, const DBT *, const DBT *)) {
    brt->dup_compare = dup_compare;
    return 0;
}
2008-01-11 14:38:49 +00:00
/* Store the file descriptor of the handle's cachefile in *fdp.  Always returns 0. */
int toku_brt_get_fd (BRT brt, int *fdp) {
    int fd = toku_cachefile_fd(brt->cf);
    *fdp = fd;
    return 0;
}
2008-06-18 21:38:01 +00:00
/* Recompute the cached hash entry for root number ROOTNUM: record the cachefile's
 * file number, the root block number, and the cachetable hash of that block, and
 * mark the entry valid.  Requires brt->cf to be set. */
static void compute_and_fill_remembered_hash (BRT brt, int rootnum) {
    struct remembered_hash *entry = &brt->h->root_hashes[rootnum];
    assert(brt->cf); // if cf is null, we'll be hosed.
    entry->root     = brt->h->roots[rootnum];
    entry->fnum     = toku_cachefile_filenum(brt->cf);
    entry->fullhash = toku_cachetable_hash(brt->cf, entry->root);
    entry->valid    = TRUE;
}
/* Return the cachetable hash of root number ROOTNUM.
 * The remembered value is reused only if it is marked valid and was computed for
 * the same file id and the same root block; otherwise it is recomputed first. */
static u_int32_t get_roothash (BRT brt, int rootnum) {
    struct remembered_hash *cached = &brt->h->root_hashes[rootnum];
    BLOCKNUM current_root = brt->h->roots[rootnum];
    assert(cached);

    int still_good = cached->valid
        && cached->fnum.fileid == toku_cachefile_filenum(brt->cf).fileid
        && cached->root.b == current_root.b;
    if (!still_good) {
        compute_and_fill_remembered_hash(brt, rootnum);
    }
    return cached->fullhash;
}
2008-07-21 02:34:13 +00:00
// Open FNAME read/write for use by the brt and store the descriptor in *fdp.
// If the file does not exist and IS_CREATE is set, create it (mode 0777) and log the
// creation under FNAME_IN_ENV via toku_logger_log_fcreate.
// Returns 0 on success; otherwise the errno from open() or the error from the logger
// (in which case the newly opened descriptor is closed).  *fdp is written only on success.
// The BRT argument is unused.
static int brt_open_file (BRT brt, const char *fname, const char *fname_in_env, int is_create, TOKUTXN txn, int *fdp) {
    (void) brt;
    mode_t mode = 0777;

    int fd = open(fname, O_RDWR, mode);
    if (fd == -1) {
        int open_errno = errno;
        if (open_errno != ENOENT || !is_create) {
            return open_errno;
        }
        // The file is missing and the caller asked for creation.
        fd = open(fname, O_RDWR | O_CREAT, mode);
        if (fd == -1) {
            return errno;
        }
        int r = toku_logger_log_fcreate(txn, fname_in_env, mode);
        if (r != 0) {
            close(fd);
            return r;
        }
    }
    *fdp = fd;
    return 0;
}
// allocate and initialize a brt header.
2013-04-16 23:57:18 -04:00
// t->cf is not set to anything.
2008-07-21 02:34:13 +00:00
static int brt_alloc_init_header ( BRT t , const char * dbname , TOKUTXN txn ) {
int r ;
2013-04-16 23:57:18 -04:00
BLOCKNUM root = make_blocknum ( 1 ) ;
2008-07-21 02:34:13 +00:00
assert ( t - > h = = 0 ) ;
if ( ( MALLOC ( t - > h ) ) = = 0 ) {
assert ( errno = = ENOMEM ) ;
r = ENOMEM ;
if ( 0 ) { died2 : toku_free ( t - > h ) ; }
t - > h = 0 ;
return r ;
}
t - > h - > dirty = 1 ;
if ( ( MALLOC_N ( 1 , t - > h - > flags_array ) ) = = 0 ) { r = errno ; if ( 0 ) { died3 : toku_free ( t - > h - > flags_array ) ; } goto died2 ; }
t - > h - > flags_array [ 0 ] = t - > flags ;
t - > h - > nodesize = t - > nodesize ;
2013-04-16 23:57:18 -04:00
t - > h - > free_blocks = make_blocknum ( - 1 ) ;
t - > h - > unused_blocks = make_blocknum ( 2 ) ;
2013-04-16 23:57:18 -04:00
t - > h - > translated_blocknum_limit = 0 ;
2013-04-16 23:57:18 -04:00
t - > h - > block_translation = 0 ;
t - > h - > block_translation_size_on_disk = 0 ;
t - > h - > block_translation_address_on_disk = 0 ;
2013-04-16 23:57:18 -04:00
// printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, t->h->translated_blocknum_limit, t->h->block_translation_address_on_disk);
2013-04-16 23:57:18 -04:00
create_block_allocator ( & t - > h - > block_allocator , t - > nodesize , BLOCK_ALLOCATOR_ALIGNMENT ) ;
2008-07-21 02:34:13 +00:00
toku_fifo_create ( & t - > h - > fifo ) ;
2013-04-16 23:57:18 -04:00
t - > h - > root_put_counter = global_root_put_counter + + ;
2008-07-21 02:34:13 +00:00
if ( dbname ) {
t - > h - > n_named_roots = 1 ;
if ( ( MALLOC_N ( 1 , t - > h - > names ) ) = = 0 ) { assert ( errno = = ENOMEM ) ; r = ENOMEM ; if ( 0 ) { died4 : if ( dbname ) toku_free ( t - > h - > names ) ; } goto died3 ; }
if ( ( MALLOC_N ( 1 , t - > h - > roots ) ) = = 0 ) { assert ( errno = = ENOMEM ) ; r = ENOMEM ; if ( 0 ) { died5 : if ( dbname ) toku_free ( t - > h - > roots ) ; } goto died4 ; }
if ( ( MALLOC_N ( 1 , t - > h - > root_hashes ) ) = = 0 ) { assert ( errno = = ENOMEM ) ; r = ENOMEM ; if ( 0 ) { died6 : if ( dbname ) toku_free ( t - > h - > root_hashes ) ; } goto died5 ; }
if ( ( t - > h - > names [ 0 ] = toku_strdup ( dbname ) ) = = 0 ) { assert ( errno = = ENOMEM ) ; r = ENOMEM ; if ( 0 ) { died7 : if ( dbname ) toku_free ( t - > h - > names [ 0 ] ) ; } goto died6 ; }
2013-04-16 23:57:18 -04:00
t - > h - > roots [ 0 ] = root ; // Block 0 is the header. Block 1 is the root.
2008-07-21 02:34:13 +00:00
compute_and_fill_remembered_hash ( t , 0 ) ;
} else {
MALLOC_N ( 1 , t - > h - > roots ) ; assert ( t - > h - > roots ) ;
MALLOC_N ( 1 , t - > h - > root_hashes ) ; assert ( t - > h - > root_hashes ) ;
2013-04-16 23:57:18 -04:00
t - > h - > roots [ 0 ] = root ;
2008-07-21 02:34:13 +00:00
compute_and_fill_remembered_hash ( t , 0 ) ;
t - > h - > n_named_roots = - 1 ;
t - > h - > names = 0 ;
}
{
LOGGEDBRTHEADER lh = { . size = toku_serialize_brt_header_size ( t - > h ) ,
. flags = t - > flags ,
. nodesize = t - > h - > nodesize ,
2013-04-16 23:57:18 -04:00
. free_blocks = t - > h - > free_blocks ,
. unused_blocks = t - > h - > unused_blocks ,
2008-07-21 02:34:13 +00:00
. n_named_roots = t - > h - > n_named_roots } ;
if ( t - > h - > n_named_roots > = 0 ) {
lh . u . many . names = t - > h - > names ;
lh . u . many . roots = t - > h - > roots ;
} else {
lh . u . one . root = t - > h - > roots [ 0 ] ;
}
if ( ( r = toku_log_fheader ( toku_txn_logger ( txn ) , ( LSN * ) 0 , 0 , toku_txn_get_txnid ( txn ) , toku_cachefile_filenum ( t - > cf ) , lh ) ) ) { goto died7 ; }
}
2013-04-16 23:57:18 -04:00
if ( ( r = setup_initial_brt_root_node ( t , root , toku_txn_logger ( txn ) ) ) ! = 0 ) { goto died7 ; }
2008-07-21 02:34:13 +00:00
//printf("%s:%d putting %p (%d)\n", __FILE__, __LINE__, t->h, 0);
2013-04-16 23:57:18 -04:00
assert ( t - > h - > free_blocks . b = = - 1 ) ;
2013-04-16 23:57:18 -04:00
toku_cachefile_set_userdata ( t - > cf , t - > h , toku_brtheader_close ) ;
2008-07-21 02:34:13 +00:00
return r ;
}
2008-05-22 21:28:00 +00:00
/* Open (and optionally create) the tree DBNAME inside file FNAME using the handle T.
 *
 * If DBNAME is NULL the file holds a single unnamed tree; otherwise the file holds
 * an array of named trees (sub-databases).
 *
 * fname         path handed to open().
 * fname_in_env  name recorded in the handle, the cachetable and the log.
 * is_create     create the file and/or the sub-database if missing.
 * only_create   with is_create: fail with EEXIST if the sub-database already exists.
 *
 * When an existing tree is opened, T inherits the node size from the file header.
 * Flags are inherited too unless they were set explicitly on T, in which case they
 * must match the stored flags (EINVAL otherwise).
 *
 * Returns 0 on success.  Errors include ENOMEM, EINVAL (named/unnamed mismatch or
 * flag mismatch), EEXIST, ENOENT (no such sub-database), and anything returned by the
 * file, cachetable, header or logging helpers.  On failure the cachefile (if opened)
 * is closed, and t->fname and t->database_name are released and reset to NULL.
 */
int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char *dbname, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db) {
    int r;
    char *malloced_name = 0;
    int db_index = 0;
    int check_against_header = 0; // set when an already existing tree is being opened
    int fd = -1;

    //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); toku_print_malloced_items();
    WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE: %s:%d toku_brt_open(%s, \"%s\", %d, %p, %d, %p)\n",
                          __FILE__, __LINE__, fname, dbname, is_create, t, (int)t->nodesize, cachetable));

    assert(is_create || !only_create);

    t->fname = toku_strdup(fname_in_env);
    if (t->fname == 0) {
        r = errno;
        goto died0;
    }
    if (dbname) {
        malloced_name = toku_strdup(dbname);
        if (malloced_name == 0) {
            r = ENOMEM;
            goto died00;
        }
    }
    t->database_name = malloced_name;
    t->db = db;
    t->txn_that_created = 0; // Uses 0 for no transaction.

    r = brt_open_file(t, fname, fname_in_env, is_create, txn, &fd);
    if (r != 0) goto died0a;
    r = toku_cachetable_openfd(&t->cf, cachetable, fd, fname_in_env);
    if (r != 0) goto died0a;
    toku_logger_log_fopen(txn, fname_in_env, toku_cachefile_filenum(t->cf));

    assert(t->nodesize > 0);
    //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); toku_print_malloced_items();

    if (is_create) {
        r = toku_read_brt_header_and_store_in_cachefile(t->cf, &t->h);
        if (r == -1) {
            /* No header yet: this is a brand new file. */
            r = brt_alloc_init_header(t, dbname, txn);
            if (r != 0) goto died_after_open;
        } else if (r != 0) {
            goto died_after_open;
        } else {
            /* The file already has a header: find the sub-database or append a new one. */
            int i;
            assert(dbname);
            if (t->h->n_named_roots < 0) { r = EINVAL; goto died_after_open; } // Cannot create a subdb in a file that is not enabled for subdbs
            for (i = 0; i < t->h->n_named_roots; i++) {
                if (strcmp(t->h->names[i], dbname) == 0) break;
            }
            if (i < t->h->n_named_roots) {
                if (only_create) {
                    r = EEXIST;
                    goto died_after_open;
                }
                db_index = i;
                check_against_header = 1;
            } else {
                /* Grow each per-root array by one slot.  The result of toku_realloc goes through
                 * a temporary so that a failed call does not overwrite the header's pointer. */
                void *grown;
                if ((grown = toku_realloc(t->h->names, (1+t->h->n_named_roots)*sizeof(*t->h->names))) == 0)             { assert(errno == ENOMEM); r = ENOMEM; goto died_after_open; }
                t->h->names = grown;
                if ((grown = toku_realloc(t->h->roots, (1+t->h->n_named_roots)*sizeof(*t->h->roots))) == 0)             { assert(errno == ENOMEM); r = ENOMEM; goto died_after_open; }
                t->h->roots = grown;
                if ((grown = toku_realloc(t->h->root_hashes, (1+t->h->n_named_roots)*sizeof(*t->h->root_hashes))) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died_after_open; }
                t->h->root_hashes = grown;
                if ((grown = toku_realloc(t->h->flags_array, (1+t->h->n_named_roots)*sizeof(*t->h->flags_array))) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died_after_open; }
                t->h->flags_array = grown;

                t->h->flags_array[t->h->n_named_roots] = t->flags;
                t->h->n_named_roots++;
                if ((t->h->names[t->h->n_named_roots-1] = toku_strdup(dbname)) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died_after_open; }
                //printf("%s:%d t=%p\n", __FILE__, __LINE__, t);
                r = allocate_diskblocknumber(&t->h->roots[t->h->n_named_roots-1], t, toku_txn_logger(txn));
                if (r != 0) goto died_after_open;
                t->h->dirty = 1;
                compute_and_fill_remembered_hash(t, t->h->n_named_roots-1);
                if ((r = setup_initial_brt_root_node(t, t->h->roots[t->h->n_named_roots-1], toku_txn_logger(txn))) != 0) goto died_after_open;
            }
        }
    } else {
        if ((r = toku_read_brt_header_and_store_in_cachefile(t->cf, &t->h)) != 0) goto died_after_open;
        if (!dbname) {
            if (t->h->n_named_roots != -1) { r = EINVAL; goto died_after_open; } // requires a subdb
            db_index = 0;
        } else {
            int i;
            int found = 0;
            if (t->h->n_named_roots == -1) { r = EINVAL; goto died_after_open; } // no subdbs in the db
            // printf("%s:%d n_roots=%d\n", __FILE__, __LINE__, t->h->n_named_roots);
            for (i = 0; i < t->h->n_named_roots; i++) {
                if (strcmp(t->h->names[i], dbname) == 0) {
                    db_index = i;
                    found = 1;
                    break;
                }
            }
            if (!found) {
                r = ENOENT; /* the database doesn't exist */
                goto died_after_open;
            }
        }
        check_against_header = 1;
    }

    if (check_against_header) {
        t->nodesize = t->h->nodesize; /* inherit the pagesize from the file */
        if (!t->did_set_flags) {
            t->flags = t->h->flags_array[db_index];
        } else if (t->flags != t->h->flags_array[db_index]) { /* if flags have been set then flags must match */
            r = EINVAL;
            goto died_after_open;
        }
    }

    assert(t->h);
    WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE -> %p\n", t));
    return 0;

    /* Error exits, from the latest acquired resource to the earliest. */
 died_after_open:
    toku_cachefile_close(&t->cf, toku_txn_logger(txn));
 died0a:
    /* Clear the handle's copy before freeing the name so it is never left dangling. */
    t->database_name = 0;
    if (malloced_name) toku_free(malloced_name);
 died00:
    if (t->fname) toku_free(t->fname);
    t->fname = 0;
 died0:
    assert(r);
    return r;
}
2008-07-21 02:34:13 +00:00
/* Point an already opened BRT at the file FNAME, creating the file if necessary.
 *
 * The cachefile is switched to the new descriptor, the open is logged, and the
 * header is read from the new file; if the file has no header yet (the read
 * returns -1) a fresh header with a single unnamed root is created.
 *
 * Returns 0 on success, or the error from opening the file, reading the header,
 * or creating the header.  Failure to switch the cachefile's descriptor is fatal
 * (assert).
 */
int toku_brt_reopen (BRT brt, const char *fname, const char *fname_in_env, TOKUTXN txn) {
    int fd = -1;

    // create a new file
    int r = brt_open_file(brt, fname, fname_in_env, TRUE, txn, &fd);
    if (r != 0) {
        return r;
    }

    // set the cachefile
    r = toku_cachefile_set_fd(brt->cf, fd, fname_in_env);
    assert(r == 0);
    brt->h = 0; // set_fd should close the header
    toku_logger_log_fopen(txn, fname_in_env, toku_cachefile_filenum(brt->cf));

    // init the tree header
    r = toku_read_brt_header_and_store_in_cachefile(brt->cf, &brt->h);
    return (r == -1) ? brt_alloc_init_header(brt, NULL, txn) : r;
}
2007-11-29 15:09:14 +00:00
/* Remove the named root DBNAME from the header of BRT's file.
 *
 * FLAGS must be 0.  The header must exist and must be in named-root mode.
 * The entry is removed from all of the header's per-root arrays (names, roots,
 * root_hashes and flags_array) by shifting the later entries down one slot, so that
 * the arrays stay index-aligned; the header is then marked dirty.
 *
 * Returns 0 on success, ENOENT if no root with that name exists.
 * The disk blocks of the removed tree are not freed (see TODO below).
 */
int toku_brt_remove_subdb (BRT brt, const char *dbname, u_int32_t flags) {
    int i;
    int found = -1;

    assert(flags == 0);
    assert(brt->h);
    assert(brt->h->n_named_roots >= 0);
    for (i = 0; i < brt->h->n_named_roots; i++) {
        if (strcmp(brt->h->names[i], dbname) == 0) {
            found = i;
            break;
        }
    }
    if (found == -1) {
        //Should not be possible.
        return ENOENT;
    }
    //Free old db name
    toku_free(brt->h->names[found]);
    //TODO: Free Diskblocks including root
    for (i = found+1; i < brt->h->n_named_roots; i++) {
        brt->h->names[i-1]       = brt->h->names[i];
        brt->h->roots[i-1]       = brt->h->roots[i];
        brt->h->root_hashes[i-1] = brt->h->root_hashes[i];
        // flags_array is looked up with the same index as names (see toku_brt_open),
        // so it has to move together with the other arrays.
        brt->h->flags_array[i-1] = brt->h->flags_array[i];
    }
    brt->h->n_named_roots--;
    brt->h->dirty = 1;
    // Q: What if n_named_roots becomes 0?  A:  Don't do anything.  an empty list of named roots is OK.
    XREALLOC_N(brt->h->n_named_roots, brt->h->names);
    XREALLOC_N(brt->h->n_named_roots, brt->h->roots);
    XREALLOC_N(brt->h->n_named_roots, brt->h->root_hashes);
    // flags_array is deliberately not shrunk: toku_brt_open reallocs it to the needed size
    // whenever a root is added, so a spare trailing slot is harmless.
    return 0;
}
2007-11-20 21:20:05 +00:00
// This one has no env
2007-11-29 15:09:14 +00:00
int toku_open_brt ( const char * fname , const char * dbname , int is_create , BRT * newbrt , int nodesize , CACHETABLE cachetable , TOKUTXN txn ,
2008-04-25 14:22:05 +00:00
int ( * compare_fun ) ( DB * , const DBT * , const DBT * ) , DB * db ) {
2007-11-14 17:58:38 +00:00
BRT brt ;
int r ;
2007-12-11 19:34:21 +00:00
const int only_create = 0 ;
2007-11-14 17:58:38 +00:00
2007-11-29 14:44:03 +00:00
r = toku_brt_create ( & brt ) ;
2007-11-14 17:58:38 +00:00
if ( r ! = 0 )
return r ;
2007-11-29 15:09:14 +00:00
toku_brt_set_nodesize ( brt , nodesize ) ;
toku_brt_set_bt_compare ( brt , compare_fun ) ;
2007-11-14 17:58:38 +00:00
2008-05-22 21:28:00 +00:00
r = toku_brt_open ( brt , fname , fname , dbname , is_create , only_create , cachetable , txn , db ) ;
2007-11-14 17:58:38 +00:00
if ( r ! = 0 ) {
return r ;
}
* newbrt = brt ;
return r ;
}
2008-04-17 03:11:55 +00:00
/* Close a BRT handle and free it.
 *
 * Steps, in order: close every cursor still attached to the handle; notify the
 * transaction layer (toku_txn_note_close_brt) and destroy the handle's txn OMT;
 * if a cachefile is attached, log the close (when LOGGER is non-NULL), check that
 * nothing is pinned, and close the cachefile; finally free the strings and
 * buffers owned by the handle and the handle itself.
 *
 * Returns 0 on success.  If closing a cursor, logging the close, or closing the
 * cachefile fails, that error is returned immediately and the handle is not freed.
 */
int toku_close_brt (BRT brt, TOKULOGGER logger) {
    int r;

    while (!list_empty(&brt->cursors)) {
        BRT_CURSOR cursor = list_struct(list_pop(&brt->cursors), struct brt_cursor, cursors_link);
        r = toku_brt_cursor_close(cursor);
        if (r != 0) {
            return r;
        }
    }

    // Must do this work before closing the cf
    r = toku_txn_note_close_brt(brt);
    assert(r == 0);
    toku_omt_destroy(&brt->txns);

    if (brt->cf) {
        if (logger) {
            assert(brt->fname);
            BYTESTRING bs = {.len = strlen(brt->fname), .data = brt->fname};
            LSN lsn;
            // flush the log on close, otherwise it might not make it out.
            r = toku_log_brtclose(logger, &lsn, 1, bs, toku_cachefile_filenum(brt->cf));
            if (r != 0) {
                return r;
            }
        }
        // For the brt, the pinned count should be zero.
        assert(0 == toku_cachefile_count_pinned(brt->cf, 1));
        r = toku_cachefile_close(&brt->cf, logger);
        if (r != 0) {
            return r;
        }
    }

    if (brt->database_name) { toku_free(brt->database_name); }
    if (brt->fname)         { toku_free(brt->fname); }
    if (brt->skey)          { toku_free(brt->skey); }
    if (brt->sval)          { toku_free(brt->sval); }
    toku_free(brt);
    return 0;
}
2008-07-21 02:34:13 +00:00
/* Flush the BRT's cachefile.  Returns the result of toku_cachefile_flush(). */
int toku_brt_flush (BRT brt) {
    int r = toku_cachefile_flush(brt->cf);
    return r;
}
2007-11-29 14:44:03 +00:00
/* Global debug switch with external linkage, initialised to 0 (off).
 * NOTE(review): its readers are not visible in this part of the file; the trailing
 * comment looks like a leftover example of enabling it for one specific key. */
int toku_brt_debug_mode = 0 ; //strcmp(key,"hello387")==0;
2007-07-13 19:37:47 +00:00
2008-06-18 21:38:01 +00:00
/* Locate the root of the tree this handle refers to.
 *
 * For an unnamed tree (database_name == NULL) that is root 0, and the header must
 * be in unnamed mode.  For a named tree it is the root whose name equals
 * brt->database_name.
 *
 * Stores the cachetable hash of the root block in *roothash and returns a pointer
 * to the root's slot in the header.  Aborts if a named tree is not present in the
 * header.
 */
CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *roothash) {
    int rootnum = -1;

    if (brt->database_name == 0) {
        assert(brt->h->n_named_roots == -1);
        rootnum = 0;
    } else {
        int i;
        for (i = 0; i < brt->h->n_named_roots; i++) {
            if (strcmp(brt->database_name, brt->h->names[i]) == 0) {
                rootnum = i;
                break;
            }
        }
    }

    if (rootnum < 0) {
        abort();
    }
    *roothash = get_roothash(brt, rootnum);
    return &brt->h->roots[rootnum];
}
2008-02-08 19:54:00 +00:00
static int brt_init_new_root ( BRT brt , BRTNODE nodea , BRTNODE nodeb , DBT splitk , CACHEKEY * rootp , TOKULOGGER logger , BRTNODE * newrootp ) {
2007-09-18 16:09:55 +00:00
TAGMALLOC ( BRTNODE , newroot ) ;
2007-08-23 18:07:18 +00:00
int r ;
2008-01-17 15:41:42 +00:00
int new_height = nodea - > height + 1 ;
int new_nodesize = brt - > h - > nodesize ;
2013-04-16 23:57:18 -04:00
BLOCKNUM newroot_diskoff ;
r = allocate_diskblocknumber ( & newroot_diskoff , brt , logger ) ;
2008-01-18 21:28:27 +00:00
assert ( r = = 0 ) ;
2007-08-23 18:07:18 +00:00
assert ( newroot ) ;
2008-06-15 17:09:14 +00:00
newroot - > ever_been_written = 0 ;
2008-01-18 21:28:27 +00:00
if ( brt - > database_name = = 0 ) {
2008-03-21 21:02:30 +00:00
toku_log_changeunnamedroot ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( brt - > cf ) , * rootp , newroot_diskoff ) ;
2008-01-18 21:28:27 +00:00
} else {
BYTESTRING bs ;
bs . len = 1 + strlen ( brt - > database_name ) ;
bs . data = brt - > database_name ;
2008-03-21 21:02:30 +00:00
toku_log_changenamedroot ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( brt - > cf ) , bs , * rootp , newroot_diskoff ) ;
2008-01-18 21:28:27 +00:00
}
2007-08-23 18:07:18 +00:00
* rootp = newroot_diskoff ;
brt - > h - > dirty = 1 ;
2008-01-17 15:41:42 +00:00
initialize_brtnode ( brt , newroot , newroot_diskoff , new_height ) ;
2007-12-31 17:30:19 +00:00
//printf("new_root %lld %d %lld %lld\n", newroot_diskoff, newroot->height, nodea->thisnodename, nodeb->thisnodename);
2007-08-23 18:07:18 +00:00
newroot - > u . n . n_children = 2 ;
2008-03-06 21:46:57 +00:00
MALLOC_N ( 3 , newroot - > u . n . childinfos ) ;
MALLOC_N ( 2 , newroot - > u . n . childkeys ) ;
2007-08-23 18:07:18 +00:00
//printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey);
newroot - > u . n . childkeys [ 0 ] = splitk . data ;
newroot - > u . n . totalchildkeylens = splitk . size ;
2013-04-16 23:57:18 -04:00
BNC_BLOCKNUM ( newroot , 0 ) = nodea - > thisnodename ;
BNC_BLOCKNUM ( newroot , 1 ) = nodeb - > thisnodename ;
2008-06-18 00:30:36 +00:00
BNC_HAVE_FULLHASH ( newroot , 0 ) = FALSE ;
BNC_HAVE_FULLHASH ( newroot , 1 ) = FALSE ;
2008-01-31 22:05:43 +00:00
r = toku_fifo_create ( & BNC_BUFFER ( newroot , 0 ) ) ; if ( r ! = 0 ) return r ;
r = toku_fifo_create ( & BNC_BUFFER ( newroot , 1 ) ) ; if ( r ! = 0 ) return r ;
2008-03-06 21:46:57 +00:00
BNC_NBYTESINBUF ( newroot , 0 ) = 0 ;
BNC_NBYTESINBUF ( newroot , 1 ) = 0 ;
BNC_SUBTREE_FINGERPRINT ( newroot , 0 ) = 0 ;
BNC_SUBTREE_FINGERPRINT ( newroot , 1 ) = 0 ;
2008-04-30 13:23:04 +00:00
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( newroot , 0 ) = 0 ;
BNC_SUBTREE_LEAFENTRY_ESTIMATE ( newroot , 1 ) = 0 ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(nodea);
//verify_local_fingerprint_nonleaf(nodeb);
2008-03-21 21:02:30 +00:00
r = toku_log_newbrtnode ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( brt - > cf ) , newroot_diskoff , new_height , new_nodesize , ( brt - > flags & TOKU_DB_DUPSORT ) ! = 0 , newroot - > rand4fingerprint ) ;
2008-01-17 19:36:44 +00:00
if ( r ! = 0 ) return r ;
2008-03-21 21:02:30 +00:00
r = toku_log_addchild ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( brt - > cf ) , newroot_diskoff , 0 , nodea - > thisnodename , 0 ) ;
2008-01-17 15:41:42 +00:00
if ( r ! = 0 ) return r ;
2008-03-21 21:02:30 +00:00
r = toku_log_addchild ( logger , ( LSN * ) 0 , 0 , toku_cachefile_filenum ( brt - > cf ) , newroot_diskoff , 1 , nodeb - > thisnodename , 0 ) ;
2008-01-17 15:41:42 +00:00
if ( r ! = 0 ) return r ;
2008-02-08 19:54:00 +00:00
fixup_child_fingerprint ( newroot , 0 , nodea , brt , logger ) ;
fixup_child_fingerprint ( newroot , 1 , nodeb , brt , logger ) ;
2008-01-17 15:41:42 +00:00
{
2008-01-23 18:29:06 +00:00
BYTESTRING bs = { . len = kv_pair_keylen ( newroot - > u . n . childkeys [ 0 ] ) ,
. data = kv_pair_key ( newroot - > u . n . childkeys [ 0 ] ) } ;
2008-04-22 20:39:50 +00:00
r = toku_log_setpivot ( logger , & newroot - > log_lsn , 0 , toku_cachefile_filenum ( brt - > cf ) , newroot_diskoff , 0 , bs ) ;
2008-01-17 15:41:42 +00:00
if ( r ! = 0 ) return r ;
}
2008-03-05 18:34:32 +00:00
r = toku_unpin_brtnode ( brt , nodea ) ;
2007-09-21 17:55:49 +00:00
if ( r ! = 0 ) return r ;
2008-03-05 18:34:32 +00:00
r = toku_unpin_brtnode ( brt , nodeb ) ;
2007-09-21 17:55:49 +00:00
if ( r ! = 0 ) return r ;
2008-04-07 01:30:25 +00:00
//printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff);
2008-06-17 17:05:19 +00:00
u_int32_t fullhash = toku_cachetable_hash ( brt - > cf , newroot_diskoff ) ;
newroot - > fullhash = fullhash ;
toku_cachetable_put ( brt - > cf , newroot_diskoff , fullhash , newroot , brtnode_memory_size ( newroot ) ,
2013-04-16 23:57:18 -04:00
toku_brtnode_flush_callback , toku_brtnode_fetch_callback , brt - > h ) ;
2008-01-23 18:06:23 +00:00
* newrootp = newroot ;
2007-08-23 18:07:18 +00:00
return 0 ;
}
2008-04-09 02:45:27 +00:00
/* Queue CMD on the root FIFO of the brt header attached to CF, then log the enqueue.
 *   cf     - cachefile whose userdata is the struct brt_header (asserted non-null).
 *   cmd    - command to enqueue; its u.id.key and u.id.val are read for the log record.
 *   logger - passed through to toku_log_enqrootentry.
 * Returns 0 on success, otherwise the first nonzero code from the FIFO or the logger.
 * The enqueue happens before logging, so a logging failure leaves CMD in the FIFO. */
int toku_cachefile_root_put_cmd(CACHEFILE cf, BRT_CMD cmd, TOKULOGGER logger) {
    struct brt_header *header = toku_cachefile_get_userdata(cf);
    assert(header);

    int enq_r = toku_fifo_enq_cmdstruct(header->fifo, cmd);
    if (enq_r != 0) {
        return enq_r;
    }

    BYTESTRING keybs = { .len = cmd->u.id.key->size, .data = cmd->u.id.key->data };
    BYTESTRING valbs = { .len = cmd->u.id.val->size, .data = cmd->u.id.val->data };
    return toku_log_enqrootentry(logger, (LSN*)0, 0, toku_cachefile_filenum(cf),
                                 cmd->xid, cmd->type, keybs, valbs);
}
/* Apply CMD to the root node *NODEP and, if that splits the root, build a new root.
 *   nodep - in/out: the pinned root; handed to brt_init_new_root as the place to
 *           store the new root when a split happens.
 *   rootp - location of the root's cache key, handed to brt_init_new_root.
 * Returns whatever brtnode_put_cmd returned; a failure of brt_init_new_root is
 * asserted rather than returned. */
static int push_something(BRT brt, BRTNODE *nodep, CACHEKEY *rootp, BRT_CMD cmd, TOKULOGGER logger) {
    int did_split = 0;
    BRTNODE nodea = 0, nodeb = 0;
    DBT splitk;
    int put_result = brtnode_put_cmd(brt, *nodep, cmd,
                                     &did_split, &nodea, &nodeb, &splitk,
                                     logger);

    if (!did_split) {
        /* No split: a nonleaf root must still respect the fanout limit. */
        if ((*nodep)->height > 0) {
            assert((*nodep)->u.n.n_children <= TREE_FANOUT);
        }
        return put_result;
    }

    /* The old root was split into nodea/nodeb; install a new root above them. */
    if (nodeb->height > 0) {
        assert(BNC_BLOCKNUM(nodeb, nodeb->u.n.n_children - 1).b != 0);
    }
    assert(nodeb->nodesize > 0);
    int r = brt_init_new_root(brt, nodea, nodeb, splitk, rootp, logger, nodep);
    assert(r == 0);
    return put_result;
}
2008-04-07 01:30:25 +00:00
/* Apply CMD at the root of BRT.
 *
 * Steps:
 *   1. Stamp the header with a fresh value of global_root_put_counter.
 *   2. Pin the root node.
 *   3. Drain brt->h->fifo: each queued command is pushed into the root and only
 *      then dequeued.
 *   4. Push CMD itself, then unpin the root.
 *
 * push_something receives &node, so after a root split `node` names the new
 * root and that is the node unpinned here.
 *
 * Returns 0 on success, or the first nonzero code from pinning or pushing.
 * On a push failure the root is unpinned before returning; previously those
 * paths returned with the pin still held.
 * NOTE(review): this assumes push_something leaves *nodep pinned when it fails;
 * confirm against brtnode_put_cmd. */
int toku_brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
    void *node_v;
    BRTNODE node;
    CACHEKEY *rootp;
    u_int32_t fullhash;
    int r;

    //assert(0==toku_cachetable_assert_all_unpinned(brt->cachetable));
    assert(brt->h);

    brt->h->root_put_counter = global_root_put_counter++;

    rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
    //assert(fullhash==toku_cachetable_hash(brt->cf, *rootp));
    r = toku_cachetable_get_and_pin(brt->cf, *rootp, fullhash, &node_v, NULL,
                                    toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
    if (r != 0) {
        return r;
    }
    //printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
    node = node_v;
    assert(node->fullhash == fullhash);

    // push the fifo stuff
    {
        DBT okey, odata;
        BRT_CMD_S ocmd;
        while (0 == toku_fifo_peek_cmdstruct(brt->h->fifo, &ocmd, &okey, &odata)) {
            r = push_something(brt, &node, rootp, &ocmd, logger);
            if (r != 0) goto unpin_and_fail;
            r = toku_fifo_deq(brt->h->fifo);
            assert(r == 0);
        }
    }

    r = push_something(brt, &node, rootp, cmd, logger);
    if (r != 0) goto unpin_and_fail;

    r = toku_unpin_brtnode(brt, node);
    assert(r == 0);
    return 0;

 unpin_and_fail:
    {
        /* Keep the original error in r; the unpin result is only asserted. */
        int unpin_r = toku_unpin_brtnode(brt, node);
        assert(unpin_r == 0);
        (void) unpin_r;
    }
    return r;
}
2007-11-29 15:09:14 +00:00
/* Insert (KEY, VAL) into BRT on behalf of TXN.
 * When TXN is non-null and is not the transaction recorded in
 * brt->txn_that_created, a rollback record is saved first (with copies of the
 * key and value made by toku_memdup_in_rollback) and the brt is noted on the
 * transaction. The insert itself is sent to the root as a BRT_INSERT command.
 * Returns 0 on success or the first nonzero code encountered. */
int toku_brt_insert(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
    int r;
    const int need_rollback_entry = txn && (brt->txn_that_created != toku_txn_get_txnid(txn));

    if (need_rollback_entry) {
        toku_cachefile_refup(brt->cf);
        BYTESTRING keybs  = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
        BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
        r = toku_logger_save_rollback_cmdinsert(txn, toku_txn_get_txnid(txn),
                                                toku_cachefile_filenum(brt->cf), keybs, databs);
        if (r != 0) {
            return r;
        }
        r = toku_txn_note_brt(txn, brt);
        if (r != 0) {
            return r;
        }
    }

    BRT_CMD_S brtcmd = { BRT_INSERT, toku_txn_get_txnid(txn), .u.id = {key, val} };
    return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
}
2007-11-29 15:09:14 +00:00
/* Look up K in BRT through a temporary cursor.
 * The cursor operation is DB_GET_BOTH when the brt has TOKU_DB_DUPSORT set,
 * DB_SET otherwise. Returns the cursor-creation error if that fails; otherwise
 * returns the result of toku_brt_cursor_get. Closing the cursor is asserted
 * to succeed. */
int toku_brt_lookup(BRT brt, DBT *k, DBT *v) {
    BRT_CURSOR cursor;
    int create_r = toku_brt_cursor(brt, &cursor, 1);
    if (create_r != 0) {
        return create_r;
    }

    int op;
    if (brt->flags & TOKU_DB_DUPSORT) {
        op = DB_GET_BOTH;
    } else {
        op = DB_SET;
    }

    int get_r = toku_brt_cursor_get(cursor, k, v, op, 0);

    int close_r = toku_brt_cursor_close(cursor);
    assert(close_r == 0);
    return get_r;
}
2008-01-28 20:49:10 +00:00
/* Delete KEY from BRT on behalf of TXN by sending BRT_DELETE_ANY to the root.
 * When TXN is non-null and is not the transaction recorded in
 * brt->txn_that_created, a rollback record holding a copy of the key is saved
 * first and the brt is noted on the transaction.
 * The command's value slot is an empty DBT from toku_init_dbt.
 * Returns 0 on success or the first nonzero code encountered. */
int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) {
    int r;
    const int need_rollback_entry = txn && (brt->txn_that_created != toku_txn_get_txnid(txn));

    if (need_rollback_entry) {
        BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
        toku_cachefile_refup(brt->cf);
        r = toku_logger_save_rollback_cmddelete(txn, toku_txn_get_txnid(txn),
                                                toku_cachefile_filenum(brt->cf), keybs);
        if (r != 0) {
            return r;
        }
        r = toku_txn_note_brt(txn, brt);
        if (r != 0) {
            return r;
        }
    }

    DBT val;
    BRT_CMD_S brtcmd = { BRT_DELETE_ANY, toku_txn_get_txnid(txn), .u.id = {key, toku_init_dbt(&val)} };
    return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
}
2008-01-28 20:49:10 +00:00
/* Delete the exact (KEY, VAL) pair from BRT on behalf of TXN by sending
 * BRT_DELETE_BOTH to the root.
 * When TXN is non-null and is not the transaction recorded in
 * brt->txn_that_created, a rollback record holding copies of key and value is
 * saved first and the brt is noted on the transaction.
 * Returns 0 on success or the first nonzero code encountered. */
int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
    //{ unsigned i; printf("del %p keylen=%d key={", brt->db, key->size); for(i=0; i<key->size; i++) printf("%d,", ((char*)key->data)[i]); printf("} datalen=%d data={", val->size); for(i=0; i<val->size; i++) printf("%d,", ((char*)val->data)[i]); printf("}\n"); }
    int r;
    const int need_rollback_entry = txn && (brt->txn_that_created != toku_txn_get_txnid(txn));

    if (need_rollback_entry) {
        BYTESTRING keybs  = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
        BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
        toku_cachefile_refup(brt->cf);
        r = toku_logger_save_rollback_cmddeleteboth(txn, toku_txn_get_txnid(txn),
                                                    toku_cachefile_filenum(brt->cf), keybs, databs);
        if (r != 0) {
            return r;
        }
        r = toku_txn_note_brt(txn, brt);
        if (r != 0) {
            return r;
        }
    }

    BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, toku_txn_get_txnid(txn), .u.id = {key, val} };
    return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
}
2013-04-16 23:57:18 -04:00
/* Debugging aid: print the subtree rooted at BLOCKNUM to stdout, indented by DEPTH.
 *   lorange/lolen, hirange/hilen - bounds handed to toku_verify_brtnode and to
 *                                  the recursive calls on children.
 * The node is pinned for the duration of the call and unpinned at the end.
 * Returns the result of toku_verify_brtnode for this node only; the return
 * values of the recursive calls on children are not collected.
 * NOTE(review): toku_dump_brt passes 0 for lorange and hirange, and the nonleaf
 * branch prints both with %s without a null check (the leaf branch does check).
 * For children the bounds are childkeys[] entries (kv_pair pointers), which are
 * also printed via %s / read as an int. Confirm this is acceptable for a dump tool. */
int toku_dump_brtnode ( BRT brt , BLOCKNUM blocknum , int depth , bytevec lorange , ITEMLEN lolen , bytevec hirange , ITEMLEN hilen ) {
2007-07-13 19:37:47 +00:00
int result = 0 ;
BRTNODE node ;
void * node_v ;
2013-04-16 23:57:18 -04:00
u_int32_t fullhash = toku_cachetable_hash ( brt - > cf , blocknum ) ;
int r = toku_cachetable_get_and_pin ( brt - > cf , blocknum , fullhash ,
2008-06-17 17:05:19 +00:00
& node_v , NULL ,
2013-04-16 23:57:18 -04:00
toku_brtnode_flush_callback , toku_brtnode_fetch_callback , brt - > h ) ;
2007-07-13 19:37:47 +00:00
assert ( r = = 0 ) ;
2007-10-03 19:34:31 +00:00
printf ( " %s:%d pin %p \n " , __FILE__ , __LINE__ , node_v ) ;
2007-07-13 19:37:47 +00:00
node = node_v ;
2008-06-17 17:05:19 +00:00
assert ( node - > fullhash = = fullhash ) ;
2013-04-16 23:57:18 -04:00
/* Verify this node against the supplied key range; this is the function's return value. */
result = toku_verify_brtnode ( brt , blocknum , lorange , lolen , hirange , hilen , 0 ) ;
2007-07-13 19:37:47 +00:00
printf ( " %*sNode=%p \n " , depth , " " , node ) ;
if ( node - > height > 0 ) {
2013-04-16 23:57:18 -04:00
/* Nonleaf: header line, then each child's buffered commands, then recurse into each child. */
printf ( " %*sNode % " PRId64 " nodesize=%d height=%d n_children=%d n_bytes_in_buffers=%d keyrange=%s %s \n " ,
depth , " " , blocknum . b , node - > nodesize , node - > height , node - > u . n . n_children , node - > u . n . n_bytes_in_buffers , ( char * ) lorange , ( char * ) hirange ) ;
2007-07-13 19:37:47 +00:00
//printf("%s %s\n", lorange ? lorange : "NULL", hirange ? hirange : "NULL");
{
int i ;
2008-02-05 18:25:23 +00:00
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
printf ( " %*schild %d buffered (%d entries): \n " , depth + 1 , " " , i , toku_fifo_n_entries ( BNC_BUFFER ( node , i ) ) ) ;
2008-02-05 18:25:23 +00:00
/* The self-assignments in the macro body only mark data/datalen/keylen as used. */
/* Each key is printed as an int read from its first bytes and passed through ntohl. */
FIFO_ITERATE ( BNC_BUFFER ( node , i ) , key , keylen , data , datalen , type , xid ,
2007-07-13 19:37:47 +00:00
( {
2008-02-05 18:25:23 +00:00
data = data ; datalen = datalen ; keylen = keylen ;
2008-02-06 01:55:16 +00:00
printf ( " %*s xid=% " PRId64 " %d (type=%d) \n " , depth + 2 , " " , xid , ntohl ( * ( int * ) key ) , type ) ;
2008-02-05 18:25:23 +00:00
//assert(strlen((char*)key)+1==keylen);
//assert(strlen((char*)data)+1==datalen);
2007-07-13 19:37:47 +00:00
} ) ) ;
}
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
printf ( " %*schild %d \n " , depth , " " , i ) ;
if ( i > 0 ) {
2008-02-05 18:25:23 +00:00
printf ( " %*spivot %d len=%d %d \n " , depth + 1 , " " , i - 1 , node - > u . n . childkeys [ i - 1 ] - > keylen , ntohl ( * ( int * ) & node - > u . n . childkeys [ i - 1 ] - > key ) ) ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:18 -04:00
/* Child i is bounded below by pivot i-1 (or the caller's lower bound for the
 * first child) and above by pivot i (or the caller's upper bound for the last). */
toku_dump_brtnode ( brt , BNC_BLOCKNUM ( node , i ) , depth + 4 ,
2007-12-06 13:52:52 +00:00
( i = = 0 ) ? lorange : node - > u . n . childkeys [ i - 1 ] ,
( i = = 0 ) ? lolen : toku_brt_pivot_key_len ( brt , node - > u . n . childkeys [ i - 1 ] ) ,
( i = = node - > u . n . n_children - 1 ) ? hirange : node - > u . n . childkeys [ i ] ,
2008-05-04 16:56:15 +00:00
( i = = node - > u . n . n_children - 1 ) ? hilen : toku_brt_pivot_key_len ( brt , node - > u . n . childkeys [ i ] )
2007-12-06 13:52:52 +00:00
) ;
2007-07-13 19:37:47 +00:00
}
}
} else {
2013-04-16 23:57:18 -04:00
/* Leaf: one summary line; the bounds are printed as ints, or 0 when a bound is null. */
printf ( " %*sNode % " PRId64 " nodesize=%d height=%d n_bytes_in_buffer=%d keyrange=%d %d \n " ,
depth , " " , blocknum . b , node - > nodesize , node - > height , node - > u . l . n_bytes_in_buffer , lorange ? ntohl ( * ( int * ) lorange ) : 0 , hirange ? ntohl ( * ( int * ) hirange ) : 0 ) ;
2008-04-02 23:40:36 +00:00
//GPMA_ITERATE(node->u.l.buffer, idx, len, data,
2008-04-08 02:09:19 +00:00
// printf(" (%d)%u ", len, *(int*)le_any_key(data)));
2007-07-13 19:37:47 +00:00
printf ( " \n " ) ;
}
2013-04-16 23:57:18 -04:00
/* Release the pin taken at the top of the function. */
r = toku_cachetable_unpin ( brt - > cf , blocknum , fullhash , 0 , 0 ) ;
2007-07-13 19:37:47 +00:00
assert ( r = = 0 ) ;
return result ;
}
2007-11-29 15:09:14 +00:00
/* Debugging aid: print split_count, then dump the whole tree starting at the root.
 * Requires brt->h to be set (asserted).
 * The root is dumped at depth 0 with null/zero key bounds.
 * Returns the result of toku_dump_brtnode on the root. */
int toku_dump_brt ( BRT brt ) {
2007-07-13 19:37:47 +00:00
CACHEKEY * rootp ;
2013-04-16 23:57:18 -04:00
assert ( brt - > h ) ;
2008-06-18 21:38:01 +00:00
/* fullhash is filled in by the call below but not otherwise used in this function. */
u_int32_t fullhash ;
rootp = toku_calculate_root_offset_pointer ( brt , & fullhash ) ;
2007-07-13 19:37:47 +00:00
printf ( " split_count=%d \n " , split_count ) ;
2013-04-16 23:57:18 -04:00
return toku_dump_brtnode ( brt , * rootp , 0 , 0 , 0 , 0 , 0 ) ;
2007-07-13 19:37:47 +00:00
}
2008-05-12 18:39:21 +00:00
#if 0
2008-01-07 19:53:50 +00:00
static int show_brtnode_blocknumbers ( BRT brt , DISKOFF off ) {
2007-07-13 19:37:47 +00:00
BRTNODE node ;
void * node_v ;
int i , r ;
assert ( off % brt - > h - > nodesize = = 0 ) ;
2007-11-19 23:47:44 +00:00
if ( ( r = toku_cachetable_get_and_pin ( brt - > cf , off , & node_v , NULL ,
2013-04-16 23:57:18 -04:00
toku_brtnode_flush_callback , toku_brtnode_fetch_callback , brt - > h ) ) ) {
2007-11-19 23:47:44 +00:00
if ( 0 ) { died0 : toku_cachetable_unpin ( brt - > cf , off , 0 , 0 ) ; }
2007-07-13 19:37:47 +00:00
return r ;
}
2007-10-03 19:34:31 +00:00
printf ( " %s:%d pin %p \n " , __FILE__ , __LINE__ , node_v ) ;
2007-07-13 19:37:47 +00:00
node = node_v ;
printf ( " %lld " , off / brt - > h - > nodesize ) ;
if ( node - > height > 0 ) {
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2013-04-16 23:57:18 -04:00
if ( ( r = show_brtnode_blocknumbers ( brt , BNC_BLOCKNUM ( node , i ) ) ) ) goto died0 ;
2007-07-13 19:37:47 +00:00
}
}
2007-11-19 23:47:44 +00:00
r = toku_cachetable_unpin ( brt - > cf , off , 0 , 0 ) ;
2007-07-13 19:37:47 +00:00
return r ;
}
int show_brt_blocknumbers ( BRT brt ) {
int r ;
CACHEKEY * rootp ;
2007-11-14 17:58:38 +00:00
if ( ( r = toku_read_and_pin_brt_header ( brt - > cf , & brt - > h ) ) ) {
if ( 0 ) { died0 : toku_unpin_brt_header ( brt ) ; }
2007-07-13 19:37:47 +00:00
return r ;
}
2007-11-14 17:58:38 +00:00
rootp = toku_calculate_root_offset_pointer ( brt ) ;
2007-07-13 19:37:47 +00:00
printf ( " BRT %p has blocks: " , brt ) ;
2007-10-03 19:34:31 +00:00
if ( ( r = show_brtnode_blocknumbers ( brt , * rootp , 0 ) ) ) goto died0 ;
2007-07-13 19:37:47 +00:00
printf ( " \n " ) ;
2007-11-14 17:58:38 +00:00
if ( ( r = toku_unpin_brt_header ( brt ) ) ! = 0 ) return r ;
2007-07-13 19:37:47 +00:00
return 0 ;
}
2007-11-28 19:00:21 +00:00
# endif
2007-07-13 19:37:47 +00:00
2008-01-25 15:43:37 +00:00
/* Out-parameters describing a node split, bundled so the search code can pass
 * them around as one object. Initialized by brt_split_init. */
typedef struct brt_split {
int did_split ; /* nonzero when a split happened; callers test this after searching */
BRTNODE nodea ; /* first of the two nodes produced by the split */
BRTNODE nodeb ; /* second of the two nodes produced by the split */
DBT splitk ; /* key handed to brt_init_new_root / handle_split_of_child along with nodea and nodeb */
} BRT_SPLIT ;
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Reset SPLIT to the "no split happened" state: flag cleared, both node
 * pointers zero, split key initialized with toku_init_dbt. */
static inline void brt_split_init(BRT_SPLIT *split) {
    split->did_split = 0;
    split->nodeb = 0;
    split->nodea = 0;
    toku_init_dbt(&split->splitk);
}
2008-06-02 20:52:12 +00:00
static int brt_search_node ( BRT brt , BRTNODE node , brt_search_t * search , DBT * newkey , DBT * newval , BRT_SPLIT * split , TOKULOGGER logger , OMTCURSOR ) ;
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* search in a node's child */
2008-06-02 20:52:12 +00:00
static int brt_search_child ( BRT brt , BRTNODE node , int childnum , brt_search_t * search , DBT * newkey , DBT * newval , BRT_SPLIT * split , TOKULOGGER logger , OMTCURSOR omtcursor ) {
2008-01-25 15:43:37 +00:00
int r , rr ;
2007-08-24 12:10:49 +00:00
2008-01-25 15:43:37 +00:00
/* if the child's buffer is not empty then try to empty it */
2008-01-31 22:05:43 +00:00
if ( BNC_NBYTESINBUF ( node , childnum ) > 0 ) {
2008-05-20 23:47:39 +00:00
rr = push_some_brt_cmds_down ( brt , node , childnum , & split - > did_split , & split - > nodea , & split - > nodeb , & split - > splitk , logger ) ;
2008-01-25 15:43:37 +00:00
assert ( rr = = 0 ) ;
/* push down may cause a child split, so childnum may not be appropriate, and the node itself may split, so retry */
return EAGAIN ;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
void * node_v ;
2013-04-16 23:57:18 -04:00
BLOCKNUM childblocknum = BNC_BLOCKNUM ( node , childnum ) ;
2008-06-18 00:30:36 +00:00
u_int32_t fullhash = compute_child_fullhash ( brt - > cf , node , childnum ) ;
2013-04-16 23:57:18 -04:00
rr = toku_cachetable_get_and_pin ( brt - > cf , childblocknum , fullhash , & node_v , NULL , toku_brtnode_flush_callback , toku_brtnode_fetch_callback , brt - > h ) ;
2008-01-25 15:43:37 +00:00
assert ( rr = = 0 ) ;
for ( ; ; ) {
BRTNODE childnode = node_v ;
BRT_SPLIT childsplit ; brt_split_init ( & childsplit ) ;
2008-06-02 20:52:12 +00:00
r = brt_search_node ( brt , childnode , search , newkey , newval , & childsplit , logger , omtcursor ) ;
2008-01-25 15:43:37 +00:00
if ( childsplit . did_split ) {
rr = handle_split_of_child ( brt , node , childnum , childsplit . nodea , childsplit . nodeb , & childsplit . splitk ,
2008-04-07 01:30:25 +00:00
& split - > did_split , & split - > nodea , & split - > nodeb , & split - > splitk , logger ) ;
2008-01-25 15:43:37 +00:00
assert ( rr = = 0 ) ;
break ;
} else {
if ( r = = EAGAIN )
continue ;
2008-06-17 17:05:19 +00:00
rr = toku_cachetable_unpin ( brt - > cf , childnode - > thisnodename , childnode - > fullhash , childnode - > dirty , brtnode_memory_size ( childnode ) ) ;
2008-01-25 15:43:37 +00:00
assert ( rr = = 0 ) ;
break ;
2007-08-23 18:07:18 +00:00
}
}
2008-01-25 15:43:37 +00:00
return r ;
2007-08-23 18:07:18 +00:00
}
2008-06-02 20:52:12 +00:00
/* Search a nonleaf NODE by trying its children in search order.
 *
 * Children are visited left-to-right when search->direction has BRT_SEARCH_LEFT
 * set, right-to-left otherwise. For every child except the last in that order,
 * the pivot between it and the next child is passed to search->compare (with the
 * pivot's value included only for TOKU_DB_DUPSORT trees); when compare returns
 * nonzero the child is searched, and a result of 0 or EAGAIN is returned
 * immediately. If nothing was returned, the final child in the order is searched
 * unconditionally.
 *
 * NOTE(review): the visiting order is computed once up front, but the loop bound
 * re-reads node->u.n.n_children on every iteration; confirm that n_children
 * cannot change while this loop keeps running after a child search. */
static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, OMTCURSOR omtcursor) {
    int r;
    int c;

    /* binary search is overkill for a small array */
    int child[node->u.n.n_children];

    /* scan left to right or right to left depending on the search direction */
    for (c = 0; c < node->u.n.n_children; c++) {
        if (search->direction & BRT_SEARCH_LEFT) {
            child[c] = c;
        } else {
            child[c] = node->u.n.n_children - 1 - c;
        }
    }

    for (c = 0; c < node->u.n.n_children - 1; c++) {
        /* Pivot separating child[c] from the next child in visiting order. */
        int pivotnum;
        if (search->direction & BRT_SEARCH_LEFT) {
            pivotnum = child[c];
        } else {
            pivotnum = child[c] - 1;
        }
        struct kv_pair *pivot = node->u.n.childkeys[pivotnum];

        DBT pivotkey, pivotval;
        DBT *keyarg = toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot));
        DBT *valarg = 0;
        if (brt->flags & TOKU_DB_DUPSORT) {
            valarg = toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot));
        }

        if (!search->compare(search, keyarg, valarg)) {
            continue;
        }
        r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, omtcursor);
        if (r == 0 || r == EAGAIN) {
            return r;
        }
    }

    /* check the first (left) or last (right) node if nothing has been found */
    return brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, omtcursor);
}
2008-04-07 01:30:25 +00:00
/* Turn the 0/1 answer of search->compare for one key/value pair into the signed
 * value returned to the OMT find routine.
 * The key is passed to compare only when search->k is set, and the value only
 * when search->v is set; otherwise a null DBT pointer is passed.
 *   BRT_SEARCH_LEFT : compare==0 -> -1, otherwise +1
 *   BRT_SEARCH_RIGHT: compare==0 -> +1, otherwise -1
 * Any other direction trips assert(0). */
int pair_leafval_bessel_le_committed(u_int32_t klen, void *kval,
                                     u_int32_t dlen, void *dval,
                                     brt_search_t *search) {
    DBT x, y;
    DBT *keyarg = search->k ? toku_fill_dbt(&x, kval, klen) : 0;
    DBT *valarg = search->v ? toku_fill_dbt(&y, dval, dlen) : 0;
    int cmp = search->compare(search, keyarg, valarg);
    int matched_none = (cmp == 0);

    // The search->compare function returns only 0 or 1
    switch (search->direction) {
    case BRT_SEARCH_LEFT:
        return matched_none ? -1 : +1;
    case BRT_SEARCH_RIGHT:
        // Because the comparison runs backwards for right searches.
        return matched_none ? +1 : -1;
    }
    assert(0);
    return 0;
}
2008-04-07 01:30:25 +00:00
int pair_leafval_bessel_le_both ( TXNID xid __attribute__ ( ( __unused__ ) ) ,
u_int32_t klen , void * kval ,
2008-05-16 18:48:23 +00:00
u_int32_t clen __attribute__ ( ( __unused__ ) ) , void * cval __attribute__ ( ( __unused__ ) ) ,
u_int32_t plen , void * pval ,
2008-04-07 01:30:25 +00:00
brt_search_t * search ) {
2008-05-16 18:48:23 +00:00
return pair_leafval_bessel_le_committed ( klen , kval , plen , pval , search ) ;
2008-04-07 01:30:25 +00:00
}
int pair_leafval_bessel_le_provdel ( TXNID xid __attribute__ ( ( __unused__ ) ) ,
2008-04-08 02:09:19 +00:00
u_int32_t klen , void * kval ,
u_int32_t clen , void * cval ,
2008-04-07 01:30:25 +00:00
brt_search_t * be ) {
return pair_leafval_bessel_le_committed ( klen , kval , clen , cval , be ) ;
}
/* Variant for leaf entries with a single (plen, pval) value: the comparison uses
 * the key together with that value; xid is ignored. */
int pair_leafval_bessel_le_provpair(TXNID xid __attribute__((__unused__)),
                                    u_int32_t klen, void *kval,
                                    u_int32_t plen, void *pval,
                                    brt_search_t *be) {
    int verdict = pair_leafval_bessel_le_committed(klen, kval, plen, pval, be);
    return verdict;
}
2008-04-25 13:45:55 +00:00
/* Callback handed to toku_omt_find by brt_search_leaf_node.
 *   lev   - the OMT value, treated as a LEAFENTRY.
 *   extra - the brt_search_t describing the search.
 * The body is the LESWITCHCALL macro, given the pair_leafval_bessel name prefix;
 * presumably it dispatches on the leaf entry's kind to one of the
 * pair_leafval_bessel_le_* functions and returns its result (the macro is
 * defined elsewhere; confirm). */
static int bessel_from_search_t ( OMTVALUE lev , void * extra ) {
LEAFENTRY leafval = lev ;
2008-04-07 01:30:25 +00:00
brt_search_t * search = extra ;
LESWITCHCALL ( leafval , pair_leafval_bessel , search ) ;
}
2008-06-12 13:53:39 +00:00
/* Search a leaf NODE and copy out the key/value that was found.
 *
 * The search is run against node->u.l.buffer with toku_omt_find. If the entry
 * found is provisionally deleted (le_is_provdel), the function scans on in the
 * search direction until it reaches an entry that is not provisionally deleted.
 *
 *   newkey, newval - optional outputs; filled through toku_dbt_set_two_values
 *                    when at least one of them is non-null.
 *   omtcursor      - passed to toku_omt_find / toku_omt_fetch.
 *
 * Returns 0 on success, EINVAL for an unknown search direction, DB_NOTFOUND when
 * the scan runs off either end of the buffer, or a nonzero code from
 * toku_omt_find or toku_dbt_set_two_values. */
static int brt_search_leaf_node ( BRT brt , BRTNODE node , brt_search_t * search , DBT * newkey , DBT * newval , TOKULOGGER logger , OMTCURSOR omtcursor ) {
2008-04-02 23:40:36 +00:00
// Now we have to convert from brt_search_t to the bessel function with a direction. What a pain...
int direction ;
switch ( search - > direction ) {
case BRT_SEARCH_LEFT : direction = + 1 ; goto ok ;
case BRT_SEARCH_RIGHT : direction = - 1 ; goto ok ;
}
return EINVAL ; // This return and the goto are a hack to get both compile-time and run-time checking on enum
ok : ;
2008-04-25 13:45:55 +00:00
OMTVALUE datav ;
2008-06-02 20:52:12 +00:00
u_int32_t idx = 0 ;
2008-04-22 20:39:50 +00:00
int r = toku_omt_find ( node - > u . l . buffer ,
bessel_from_search_t ,
search ,
direction ,
2008-06-02 20:52:12 +00:00
& datav , & idx , omtcursor ) ;
2008-05-08 13:05:01 +00:00
if ( r ! = 0 ) return r ;
2008-04-02 23:40:36 +00:00
2008-04-25 13:45:55 +00:00
LEAFENTRY le = datav ;
2008-04-07 01:30:25 +00:00
if ( le_is_provdel ( le ) ) {
2008-06-12 13:53:39 +00:00
/* Look up the transaction that owns this entry; txn stays 0 if the lookup does not set it. */
/* The return value of toku_txn_find_by_xid is not checked. */
TXNID xid = le_any_xid ( le ) ;
TOKUTXN txn = 0 ;
toku_txn_find_by_xid ( brt , xid , & txn ) ;
2008-04-07 01:30:25 +00:00
// Provisionally deleted stuff is gone.
2008-04-08 02:09:19 +00:00
// So we need to scan in the direction to see if we can find something
while ( 1 ) {
2008-06-12 13:53:39 +00:00
// see if the transaction is alive
/* The lookup is repeated only when the current entry's xid differs from the previous one. */
TXNID newxid = le_any_xid ( le ) ;
if ( newxid ! = xid ) {
xid = newxid ;
txn = 0 ;
toku_txn_find_by_xid ( brt , xid , & txn ) ;
}
2008-04-08 02:09:19 +00:00
switch ( search - > direction ) {
case BRT_SEARCH_LEFT :
2008-06-12 13:53:39 +00:00
/* Transaction found: leave the entry alone and step to the next index. */
/* No transaction found: apply BRT_COMMIT_BOTH to this entry and keep idx unchanged */
/* (presumably the commit removes the entry so idx then names its successor; confirm). */
if ( txn ) {
// printf("xid %llu -> %p\n", (unsigned long long) xid, txn);
idx + + ;
} else {
// apply a commit message for this leafentry to the node
// printf("apply commit_both %llu\n", (unsigned long long) xid);
DBT key , val ;
BRT_CMD_S brtcmd = { BRT_COMMIT_BOTH , xid , . u . id = { toku_fill_dbt ( & key , le_latest_key ( le ) , le_latest_keylen ( le ) ) ,
toku_fill_dbt ( & val , le_latest_val ( le ) , le_latest_vallen ( le ) ) } } ;
r = brt_leaf_apply_cmd_once ( brt , node , & brtcmd , logger , idx , le ) ;
assert ( r = = 0 ) ;
}
2008-04-22 20:39:50 +00:00
if ( idx > = toku_omt_size ( node - > u . l . buffer ) ) return DB_NOTFOUND ;
2008-04-08 02:09:19 +00:00
break ;
case BRT_SEARCH_RIGHT :
/* Rightward scans only step backwards; no commit is applied on this path. */
if ( idx = = 0 ) return DB_NOTFOUND ;
idx - - ;
break ;
}
2008-04-22 20:39:50 +00:00
if ( idx > = toku_omt_size ( node - > u . l . buffer ) ) continue ;
2008-06-04 22:58:07 +00:00
r = toku_omt_fetch ( node - > u . l . buffer , idx , & datav , omtcursor ) ;
2008-04-08 02:09:19 +00:00
assert ( r = = 0 ) ; // we just validated the index
2008-04-25 13:45:55 +00:00
le = datav ;
2008-04-08 02:09:19 +00:00
if ( ! le_is_provdel ( le ) ) goto got_a_good_value ;
}
2008-04-07 01:30:25 +00:00
}
2008-05-24 17:22:14 +00:00
got_a_good_value :
/* Copy out only the parts the caller asked for; an unrequested part is passed as NULL with length 0. */
if ( newkey | | newval ) {
bytevec key = newkey ? le_latest_key ( le ) : NULL ;
u_int32_t key_len = newkey ? le_latest_keylen ( le ) : 0 ;
bytevec val = newval ? le_latest_val ( le ) : NULL ;
u_int32_t val_len = newval ? le_latest_vallen ( le ) : 0 ;
r = toku_dbt_set_two_values ( newkey , & key , key_len , & brt - > skey , FALSE ,
newval , & val , val_len , & brt - > sval , FALSE ) ;
if ( r ! = 0 ) return r ;
2008-04-02 23:40:36 +00:00
}
return 0 ;
2008-01-25 15:43:37 +00:00
}
2007-08-23 18:07:18 +00:00
2008-06-02 20:52:12 +00:00
/* Dispatch a search on NODE by height: nodes with height > 0 go to the nonleaf
 * search, everything else to the leaf search (which takes no SPLIT argument).
 * Returns whatever the chosen search returns. */
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, OMTCURSOR omtcursor) {
    if (node->height <= 0) {
        return brt_search_leaf_node(brt, node, search, newkey, newval, logger, omtcursor);
    }
    return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger, omtcursor);
}
2008-07-21 18:00:38 +00:00
int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval, TOKULOGGER logger, OMTCURSOR omtcursor, u_int64_t *root_put_counter)
// Effect: Perform a search.  Associate cursor with a leaf if possible.
//  Stores the header's current root_put_counter in *root_put_counter before searching.
//  Pins the root, drains the header FIFO into the tree, then searches from the root,
//  retrying while the search returns EAGAIN.  If the root splits during the search a
//  new root is installed and becomes the node that is unpinned at the end.
// Returns: the result of the final search attempt, or the nonzero code of a failed
//  push of a queued command.
//  The root is unpinned on every return path; previously a failed push returned
//  with the root still pinned.
//  NOTE(review): this assumes push_something leaves the root pinned when it fails;
//  confirm against brtnode_put_cmd.
{
    int r, rr;

    assert(brt->h);

    *root_put_counter = brt->h->root_put_counter;

    u_int32_t fullhash;
    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);

    void *node_v;

    //assert(fullhash == toku_cachetable_hash(brt->cf, *rootp));
    rr = toku_cachetable_get_and_pin(brt->cf, *rootp, fullhash,
                                     &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
    assert(rr == 0);

    BRTNODE node = node_v;

    // push the fifo stuff
    {
        DBT okey, odata;
        BRT_CMD_S ocmd;
        while (0 == toku_fifo_peek_cmdstruct(brt->h->fifo, &ocmd, &okey, &odata)) {
            r = push_something(brt, &node, rootp, &ocmd, logger);
            if (r != 0) goto unpin_root;   // report the push error, but do not leak the pin
            r = toku_fifo_deq(brt->h->fifo);
            assert(r == 0);
        }
    }

    for (;;) {
        BRT_SPLIT split; brt_split_init(&split);
        r = brt_search_node(brt, node, search, newkey, newval, &split, logger, omtcursor);

        if (split.did_split) {
            // The logger argument is 0 here, unlike the call made from push_something.
            rr = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node);
            assert(rr == 0);
        }

        if (r != EAGAIN)
            break;
    }

 unpin_root:
    rr = toku_unpin_brtnode(brt, node);
    assert(rr == 0);

    return r;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Release the buffer held by DBT when the DBT's flags include DB_DBT_REALLOC or
 * DB_DBT_MALLOC, and clear the data pointer afterwards. A DBT with a null data
 * pointer, or with neither flag set, is left untouched. */
static inline void dbt_cleanup(DBT *dbt) {
    if (!dbt->data) {
        return;
    }
    if (!(dbt->flags & DB_DBT_REALLOC) && !(dbt->flags & DB_DBT_MALLOC)) {
        return;
    }
    toku_free_n(dbt->data, dbt->size);
    dbt->data = 0;
}
2008-02-11 20:00:19 +00:00
/* Returns 1 when the cursor's key or value DBT has a null data pointer, 0 when
 * both are non-null. */
static inline int brt_cursor_not_set(BRT_CURSOR cursor) {
    if (cursor->key.data == 0) {
        return 1;
    }
    return cursor->val.data == 0;
}
2007-08-23 18:07:18 +00:00
2008-02-11 20:00:19 +00:00
/* Public form of brt_cursor_not_set: true when cursor C has no current key or
 * no current value. */
BOOL toku_brt_cursor_uninitialized(BRT_CURSOR c) {
    int not_set = brt_cursor_not_set(c);
    return not_set;
}
2008-07-21 05:37:55 +00:00
/* Point KEY and/or VAL at the latest key/value of the leaf entry that cursor C's
 * OMT cursor currently names. Only the data and size fields are written; no
 * copy is made, so the DBTs reference memory owned by the leaf entry.
 * Either output may be null, in which case it is skipped.
 * Fetching the current entry is asserted to succeed. */
static inline void load_dbts_from_omt(BRT_CURSOR c, DBT *key, DBT *val) {
    OMTVALUE le;
    int r = toku_omt_cursor_current(c->omtcursor, &le);
    assert(r == 0);

    if (key != 0) {
        key->data = le_latest_key(le);
        key->size = le_latest_keylen(le);
    }
    if (val != 0) {
        val->data = le_latest_val(le);
        val->size = le_latest_vallen(le);
    }
}
/* Invalidate callback registered on the cursor's OMT cursor; EXTRA is the
 * BRT_CURSOR.
 *
 * If the current key/value live in the OMT (current_in_omt), they are copied
 * into memory owned by the cursor: the OMT's pointers are captured first, the
 * cursor's DBTs are reset so that the copy routine cannot free OMT memory, and
 * then toku_dbt_set_two_values makes the copy. A failure of that copy is
 * asserted.
 *
 * If the previous key/value live in the OMT (prev_in_omt), they are not copied:
 * the cursor's prevkey/prevval are simply reset to empty DB_DBT_REALLOC DBTs. */
static void brt_cursor_invalidate_callback(OMTCURSOR UU(omt_c), void *extra) {
    BRT_CURSOR cursor = extra;

    if (cursor->current_in_omt) {
        assert(cursor->key.flags == DB_DBT_REALLOC);
        assert(cursor->val.flags == DB_DBT_REALLOC);

        /* Capture the OMT-owned key/value before touching the cursor's DBTs. */
        DBT omt_key, omt_val;
        toku_init_dbt(&omt_key);
        toku_init_dbt(&omt_val);
        load_dbts_from_omt(cursor, &omt_key, &omt_val);

        //Make certain not to try to free the omt's memory.
        toku_init_dbt(&cursor->key);
        cursor->key.flags = DB_DBT_REALLOC;
        toku_init_dbt(&cursor->val);
        cursor->val.flags = DB_DBT_REALLOC;

        int r = toku_dbt_set_two_values(&cursor->key, (bytevec*)&omt_key.data, omt_key.size, NULL, FALSE,
                                        &cursor->val, (bytevec*)&omt_val.data, omt_val.size, NULL, FALSE);
        //TODO: Find some way to deal with ENOMEM here.
        assert(r == 0);
        cursor->current_in_omt = FALSE;
    }

    if (cursor->prev_in_omt) {
        toku_init_dbt(&cursor->prevkey);
        cursor->prevkey.flags = DB_DBT_REALLOC;
        toku_init_dbt(&cursor->prevval);
        cursor->prevval.flags = DB_DBT_REALLOC;
        cursor->prev_in_omt = FALSE;
    }
}
2008-03-15 19:06:39 +00:00
/* Create a cursor on BRT and store it in *CURSORPTR.
 *   is_temporary_cursor - recorded on the cursor as given.
 * The new cursor starts with empty DB_DBT_REALLOC DBTs for key, val, prevkey and
 * prevval, owns a fresh OMT cursor whose invalidate callback is
 * brt_cursor_invalidate_callback, and is linked into brt->cursors.
 *
 * Returns 0 on success, ENOMEM if the cursor cannot be allocated, or the nonzero
 * code from toku_omt_cursor_create. On failure nothing is linked into
 * brt->cursors, the partially built cursor is freed, and *CURSORPTR is left
 * untouched. (Previously an OMT-cursor creation failure was only asserted, after
 * the cursor had already been linked into the list.) */
int toku_brt_cursor(BRT brt, BRT_CURSOR *cursorptr, int is_temporary_cursor) {
    BRT_CURSOR cursor = toku_malloc(sizeof *cursor);
    if (cursor == 0)
        return ENOMEM;

    cursor->brt = brt;
    toku_init_dbt(&cursor->key);     cursor->key.flags     = DB_DBT_REALLOC;
    toku_init_dbt(&cursor->val);     cursor->val.flags     = DB_DBT_REALLOC;
    toku_init_dbt(&cursor->prevkey); cursor->prevkey.flags = DB_DBT_REALLOC;
    toku_init_dbt(&cursor->prevval); cursor->prevval.flags = DB_DBT_REALLOC;
    cursor->current_in_omt = FALSE;
    cursor->prev_in_omt = FALSE;
    cursor->is_temporary_cursor = is_temporary_cursor;
    cursor->skey = cursor->sval = 0;
    cursor->root_put_counter = 0;

    /* Create the OMT cursor before publishing the brt cursor, so a failure here
     * cannot leave a half-built cursor reachable from brt->cursors. */
    int r = toku_omt_cursor_create(&cursor->omtcursor);
    if (r != 0) {
        toku_free_n(cursor, sizeof *cursor);
        return r;
    }
    toku_omt_cursor_set_invalidate_callback(cursor->omtcursor,
                                            brt_cursor_invalidate_callback, cursor);

    list_push(&brt->cursors, &cursor->cursors_link);
    *cursorptr = cursor;
    return 0;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Destroy a cursor created by toku_brt_cursor.
 * The key/val DBTs are cleaned up only when they are not flagged as living in
 * the OMT (current_in_omt / prev_in_omt).  The cursor's skey/sval buffers are
 * freed, the cursor is unlinked from its list, the invalidate callback is
 * cleared before the OMT cursor is destroyed, and finally the cursor itself is
 * freed.  Always returns 0. */
int toku_brt_cursor_close(BRT_CURSOR cursor) {
    if (!cursor->current_in_omt) {
        dbt_cleanup(&cursor->key);
        dbt_cleanup(&cursor->val);
    }
    if (!cursor->prev_in_omt) {
        dbt_cleanup(&cursor->prevkey);
        dbt_cleanup(&cursor->prevval);
    }

    if (cursor->skey)
        toku_free(cursor->skey);
    if (cursor->sval)
        toku_free(cursor->sval);

    list_remove(&cursor->cursors_link);

    // Detach the callback first so destroying the OMT cursor cannot call back
    // into a cursor that is being torn down.
    toku_omt_cursor_set_invalidate_callback(cursor->omtcursor, NULL, NULL);
    toku_omt_cursor_destroy(&cursor->omtcursor);

    toku_free_n(cursor, sizeof *cursor);
    return 0;
}
2007-08-23 18:07:18 +00:00
2008-06-12 20:32:22 +00:00
DBT *brt_cursor_peek_prev_key(BRT_CURSOR cursor)
// Effect: Hand back the address of the cursor's own prevkey DBT; nothing is copied.
// Requires: The caller may not modify that DBT or the memory at which it points.
{
    DBT *prev_key = &cursor->prevkey;
    return prev_key;
}
DBT *brt_cursor_peek_prev_val(BRT_CURSOR cursor)
// Effect: Hand back the address of the cursor's own prevval DBT; nothing is copied.
// Requires: The caller may not modify that DBT or the memory at which it points.
{
    DBT *prev_val = &cursor->prevval;
    return prev_val;
}
2008-07-28 17:55:14 +00:00
void brt_cursor_peek_current(BRT_CURSOR cursor, const DBT **pkey, const DBT **pval)
// Effect: Store pointers to the cursor's current key and value DBTs in *pkey and *pval.
//  When the current pair is flagged as living in the OMT, the DBTs are first
//  reloaded with load_dbts_from_omt.
// Requires: The caller may not modify the DBTs or the memory at which they point.
{
    DBT *cur_key = &cursor->key;
    DBT *cur_val = &cursor->val;
    if (cursor->current_in_omt)
        load_dbts_from_omt(cursor, cur_key, cur_val);
    *pkey = cur_key;
    *pval = cur_val;
}
2008-06-12 20:32:22 +00:00
DBT *brt_cursor_peek_current_key(BRT_CURSOR cursor)
// Effect: Return a pointer to the cursor's current key DBT.  When the current
//  pair is flagged as living in the OMT, the key DBT is reloaded first (only
//  the key is requested from load_dbts_from_omt).
// Requires: The caller may not modify that DBT or the memory at which it points.
{
    DBT *cur_key = &cursor->key;
    if (cursor->current_in_omt)
        load_dbts_from_omt(cursor, cur_key, NULL);
    return cur_key;
}
DBT *brt_cursor_peek_current_val(BRT_CURSOR cursor)
// Effect: Return a pointer to the cursor's current val DBT.  When the current
//  pair is flagged as living in the OMT, the val DBT is reloaded first (only
//  the val is requested from load_dbts_from_omt).
// Requires: The caller may not modify that DBT or the memory at which it points.
{
    DBT *cur_val = &cursor->val;
    if (cursor->current_in_omt)
        load_dbts_from_omt(cursor, NULL, cur_val);
    return cur_val;
}
2008-01-25 15:43:37 +00:00
/* Compare key k with key x using the brt's key comparison function.
 * Returns whatever brt->compare_fun returns. */
static inline int compare_k_x(BRT brt, DBT *k, DBT *x) {
    int key_order = brt->compare_fun(brt->db, k, x);
    return key_order;
}
2007-07-13 19:37:47 +00:00
2008-01-25 15:43:37 +00:00
/* Compare value v with value y using the brt's duplicate comparison function.
 * Returns whatever brt->dup_compare returns. */
static inline int compare_v_y(BRT brt, DBT *v, DBT *y) {
    int val_order = brt->dup_compare(brt->db, v, y);
    return val_order;
}
2007-07-13 19:37:47 +00:00
2008-01-25 15:43:37 +00:00
/* Compare the pair (k,v) with the pair (x,y).
 * Keys are compared first with brt->compare_fun.  Values are compared with
 * brt->dup_compare only when the keys are equal and both v and y are non-NULL;
 * otherwise the key comparison result is returned. */
static inline int compare_kv_xy(BRT brt, DBT *k, DBT *v, DBT *x, DBT *y) {
    int key_order = brt->compare_fun(brt->db, k, x);
    if (key_order != 0)
        return key_order;
    if (!v || !y)
        return key_order; // keys tie and there is no value to break the tie
    return brt->dup_compare(brt->db, v, y);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Copy the cursor's current key and value out to the caller's DBTs.
 *   key, val: destinations.  When both are NULL nothing is done and 0 is
 *             returned (used by light weight cursors, which retrieve the pair
 *             through the peek functions instead).
 * A temporary cursor uses the brt's skey/sval as the static buffers handed to
 * toku_dbt_set_two_values; any other cursor uses its own.
 * Returns the result of toku_dbt_set_two_values. */
static inline int brt_cursor_copyout(BRT_CURSOR cursor, DBT *key, DBT *val) {
    if (key == NULL && val == NULL)
        return 0;

    void **key_staticp;
    void **val_staticp;
    if (cursor->is_temporary_cursor) {
        key_staticp = &cursor->brt->skey;
        val_staticp = &cursor->brt->sval;
    } else {
        key_staticp = &cursor->skey;
        val_staticp = &cursor->sval;
    }

    // Refresh key/val from the OMT when that is where the current pair lives.
    if (cursor->current_in_omt)
        load_dbts_from_omt(cursor, &cursor->key, &cursor->val);

    return toku_dbt_set_two_values(key, (bytevec *)&cursor->key.data, cursor->key.size, key_staticp, FALSE,
                                   val, (bytevec *)&cursor->val.data, cursor->val.size, val_staticp, FALSE);
}
2008-05-24 17:22:14 +00:00
/* Set key from key_source's data and size via toku_dbt_set_value, passing no
 * static buffer (NULL) and FALSE for the disposable flag.
 * Returns the result of toku_dbt_set_value. */
int toku_brt_dbt_set(DBT *key, DBT *key_source) {
    return toku_dbt_set_value(key, (bytevec *)&key_source->data, key_source->size, NULL, FALSE);
}
2008-05-24 17:22:14 +00:00
/* Set key and val from key_source and val_source via toku_dbt_set_two_values.
 * A temporary cursor supplies the brt's skey/sval as static buffers; any other
 * cursor supplies its own.  The *_disposable flags are passed through.
 * Returns the result of toku_dbt_set_two_values. */
int toku_brt_cursor_dbts_set(BRT_CURSOR cursor,
                             DBT *key, DBT *key_source, BOOL key_disposable,
                             DBT *val, DBT *val_source, BOOL val_disposable) {
    void **key_staticp;
    void **val_staticp;
    if (cursor->is_temporary_cursor) {
        key_staticp = &cursor->brt->skey;
        val_staticp = &cursor->brt->sval;
    } else {
        key_staticp = &cursor->skey;
        val_staticp = &cursor->sval;
    }
    return toku_dbt_set_two_values(key, (bytevec *)&key_source->data, key_source->size, key_staticp, key_disposable,
                                   val, (bytevec *)&val_source->data, val_source->size, val_staticp, val_disposable);
}
2008-05-24 17:22:14 +00:00
/* Set key, val and dat from their respective sources via
 * toku_dbt_set_three_values.  key and val use the brt's skey/sval for a
 * temporary cursor and the cursor's own otherwise; dat always uses pdb->sval.
 * The *_disposable flags are passed through.
 * Returns the result of toku_dbt_set_three_values. */
int toku_brt_cursor_dbts_set_with_dat(BRT_CURSOR cursor, BRT pdb,
                                      DBT *key, DBT *key_source, BOOL key_disposable,
                                      DBT *val, DBT *val_source, BOOL val_disposable,
                                      DBT *dat, DBT *dat_source, BOOL dat_disposable) {
    void **key_staticp;
    void **val_staticp;
    if (cursor->is_temporary_cursor) {
        key_staticp = &cursor->brt->skey;
        val_staticp = &cursor->brt->sval;
    } else {
        key_staticp = &cursor->skey;
        val_staticp = &cursor->sval;
    }
    void **dat_staticp = &pdb->sval;
    return toku_dbt_set_three_values(key, (bytevec *)&key_source->data, key_source->size, key_staticp, key_disposable,
                                     val, (bytevec *)&val_source->data, val_source->size, val_staticp, val_disposable,
                                     dat, (bytevec *)&dat_source->data, dat_source->size, dat_staticp, dat_disposable);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: nonzero when the search pair (k,v) compares <= (x,y).
 * search->context holds the BRT whose comparison functions are used. */
static int brt_cursor_compare_set(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int order = compare_kv_xy(brt, search->k, search->v, x, y);
    return order <= 0; /* return min xy: kv <= xy */
}
2007-08-23 18:07:18 +00:00
2008-02-08 19:54:00 +00:00
/* Return the pair the cursor currently points at.
 *   op: DB_CURRENT first re-searches the tree for the cursor's (key,val) and
 *       yields DB_KEYEMPTY when the search fails or finds a different pair;
 *       any other op skips that check.
 * Returns EINVAL when the cursor is not set, DB_KEYEMPTY as described above,
 * otherwise the result of brt_cursor_copyout. */
static int brt_cursor_current(BRT_CURSOR cursor, int op, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    if (brt_cursor_not_set(cursor))
        return EINVAL;

    if (op == DB_CURRENT) {
        DBT found_key;
        DBT found_val;
        toku_init_dbt(&found_key); found_key.flags = DB_DBT_REALLOC;
        toku_init_dbt(&found_val); found_val.flags = DB_DBT_REALLOC;

        brt_cursor_invalidate_callback(cursor->omtcursor, cursor);

        brt_search_t search;
        brt_search_init(&search, brt_cursor_compare_set, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
        int r = toku_brt_search(cursor->brt, &search, &found_key, &found_val, logger,
                                cursor->omtcursor, &cursor->root_put_counter);
        // The pair only counts as present when the search succeeded and landed
        // on exactly the cursor's pair.
        if (!(r == 0 && compare_kv_xy(cursor->brt, &cursor->key, &cursor->val, &found_key, &found_val) == 0))
            r = DB_KEYEMPTY;

        dbt_cleanup(&found_key);
        dbt_cleanup(&found_val);
        if (r != 0)
            return r;
    }

    return brt_cursor_copyout(cursor, outkey, outval);
}
2008-06-12 20:32:22 +00:00
/* Exchange the contents of two DBT structs by value. */
static void swap_dbts(DBT *a, DBT *b) {
    DBT saved_b = *b;
    *b = *a;
    *a = saved_b;
}
/* Exchange the cursor's current pair (key,val) with its prev pair
 * (prevkey,prevval). */
static void swap_cursor_dbts(BRT_CURSOR cursor) {
    swap_dbts(&cursor->key, &cursor->prevkey);
    swap_dbts(&cursor->val, &cursor->prevval);
}
/* Invalidate the cursor's OMT cursor, then swap the prev pair back into the
 * current pair.  The invalidation must come first. */
void brt_cursor_restore_state_from_prev(BRT_CURSOR cursor) {
    OMTCURSOR omtc = cursor->omtcursor;
    toku_omt_cursor_invalidate(omtc);
    swap_cursor_dbts(cursor);
}
2008-01-25 15:43:37 +00:00
/* search for the first kv pair that matches the search object */
2008-02-08 19:54:00 +00:00
static int brt_cursor_search ( BRT_CURSOR cursor , brt_search_t * search , DBT * outkey , DBT * outval , TOKULOGGER logger ) {
2008-06-12 20:32:22 +00:00
assert ( cursor - > prevkey . flags = = DB_DBT_REALLOC ) ;
assert ( cursor - > prevval . flags = = DB_DBT_REALLOC ) ;
2007-08-23 18:07:18 +00:00
2008-07-21 05:37:55 +00:00
brt_cursor_invalidate_callback ( cursor - > omtcursor , cursor ) ;
2008-06-12 20:32:22 +00:00
int r = toku_brt_search ( cursor - > brt , search , & cursor - > prevkey , & cursor - > prevval , logger , cursor - > omtcursor , & cursor - > root_put_counter ) ;
2008-01-25 15:43:37 +00:00
if ( r = = 0 ) {
2008-06-12 20:32:22 +00:00
swap_cursor_dbts ( cursor ) ;
2008-01-25 15:43:37 +00:00
r = brt_cursor_copyout ( cursor , outkey , outval ) ;
2007-08-23 18:07:18 +00:00
}
2008-01-25 15:43:37 +00:00
return r ;
2007-08-23 18:07:18 +00:00
}
2008-01-25 15:43:37 +00:00
/* search for the kv pair that matches the search object and is equal to kv */
2008-02-08 19:54:00 +00:00
static int brt_cursor_search_eq_kv_xy ( BRT_CURSOR cursor , brt_search_t * search , DBT * outkey , DBT * outval , TOKULOGGER logger ) {
2008-06-12 20:32:22 +00:00
assert ( cursor - > prevkey . flags = = DB_DBT_REALLOC ) ;
assert ( cursor - > prevval . flags = = DB_DBT_REALLOC ) ;
2008-01-08 21:03:17 +00:00
2008-07-21 05:37:55 +00:00
brt_cursor_invalidate_callback ( cursor - > omtcursor , cursor ) ;
2008-06-12 20:32:22 +00:00
int r = toku_brt_search ( cursor - > brt , search , & cursor - > prevkey , & cursor - > prevval , logger , cursor - > omtcursor , & cursor - > root_put_counter ) ;
2008-01-25 15:43:37 +00:00
if ( r = = 0 ) {
2008-06-12 20:32:22 +00:00
if ( compare_kv_xy ( cursor - > brt , search - > k , search - > v , & cursor - > prevkey , & cursor - > prevval ) = = 0 ) {
swap_cursor_dbts ( cursor ) ;
2008-01-25 15:43:37 +00:00
r = brt_cursor_copyout ( cursor , outkey , outval ) ;
2008-06-12 20:32:22 +00:00
} else {
2008-01-25 15:43:37 +00:00
r = DB_NOTFOUND ;
2008-06-12 20:32:22 +00:00
}
2008-01-25 15:43:37 +00:00
}
return r ;
2007-07-13 19:37:47 +00:00
}
2008-01-25 15:43:37 +00:00
/* search for the kv pair that matches the search object and is equal to k */
2008-02-08 19:54:00 +00:00
static int brt_cursor_search_eq_k_x ( BRT_CURSOR cursor , brt_search_t * search , DBT * outkey , DBT * outval , TOKULOGGER logger ) {
2008-06-12 20:32:22 +00:00
assert ( cursor - > prevkey . flags = = DB_DBT_REALLOC ) ;
assert ( cursor - > prevval . flags = = DB_DBT_REALLOC ) ;
2007-08-23 18:07:18 +00:00
2008-07-21 05:37:55 +00:00
brt_cursor_invalidate_callback ( cursor - > omtcursor , cursor ) ;
2008-06-12 20:32:22 +00:00
int r = toku_brt_search ( cursor - > brt , search , & cursor - > prevkey , & cursor - > prevval , logger , cursor - > omtcursor , & cursor - > root_put_counter ) ;
2008-01-25 15:43:37 +00:00
if ( r = = 0 ) {
2008-06-12 20:32:22 +00:00
if ( compare_k_x ( cursor - > brt , search - > k , & cursor - > prevkey ) = = 0 ) {
swap_cursor_dbts ( cursor ) ;
2008-01-25 15:43:37 +00:00
r = brt_cursor_copyout ( cursor , outkey , outval ) ;
} else
r = DB_NOTFOUND ;
}
return r ;
}
2008-01-08 21:03:17 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate that accepts every pair: always returns 1.
 * All three parameters are intentionally unused. */
static int brt_cursor_compare_one(brt_search_t *search, DBT *x, DBT *y) {
    (void)search;
    (void)x;
    (void)y;
    return 1;
}
2007-08-23 18:07:18 +00:00
2008-02-08 19:54:00 +00:00
/* Position the cursor with a BRT_SEARCH_LEFT search whose predicate accepts
 * every pair (brt_cursor_compare_one) and copy the result out.
 * Returns the result of brt_cursor_search. */
static int brt_cursor_first(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_one, BRT_SEARCH_LEFT, 0, 0, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2008-02-08 19:54:00 +00:00
/* Position the cursor with a BRT_SEARCH_RIGHT search whose predicate accepts
 * every pair (brt_cursor_compare_one) and copy the result out.
 * Returns the result of brt_cursor_search. */
static int brt_cursor_last(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_one, BRT_SEARCH_RIGHT, 0, 0, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate: nonzero when the search pair (k,v) compares strictly less
 * than (x,y).  search->context holds the BRT. */
static int brt_cursor_compare_next(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int order = compare_kv_xy(brt, search->k, search->v, x, y);
    return order < 0; /* return min xy: kv < xy */
}
2007-08-23 18:07:18 +00:00
2008-07-21 05:37:55 +00:00
/* Make prevkey/prevval refer to the pair loaded by load_dbts_from_omt.
 * If prev currently holds data that is not flagged as living in the OMT, that
 * data is freed first and prev_in_omt is set. */
static void save_omtcursor_current_in_prev(BRT_CURSOR cursor) {
    if (!cursor->prev_in_omt) {
        // prev is not flagged as OMT memory: release it before it is overwritten.
        if (cursor->prevkey.data)
            toku_free(cursor->prevkey.data);
        if (cursor->prevval.data)
            toku_free(cursor->prevval.data);
        cursor->prev_in_omt = TRUE;
    }
    load_dbts_from_omt(cursor, &cursor->prevkey, &cursor->prevval);
}
2008-06-02 20:52:12 +00:00
static int brt_cursor_next_shortcut(BRT_CURSOR cursor, DBT *outkey, DBT *outval)
// Effect: If possible, increment the cursor and return the key-value pair
//  (i.e., the next one from what the cursor pointed to before.)
//  That is, do DB_NEXT on DUP databases, and do DB_NEXT_NODUP on NODUP databases.
// Returns: -1 when the shortcut cannot be used (OMT cursor not valid, the
//  header's root_put_counter differs from the cursor's, or no further
//  non-provdel entry exists in the OMT); otherwise the result of
//  brt_cursor_copyout for the entry stepped to.
{
    OMTCURSOR omtc = cursor->omtcursor;
    if (!toku_omt_cursor_is_valid(omtc))
        return -1;

    u_int64_t header_counter = cursor->brt->h->root_put_counter;
    if (header_counter != cursor->root_put_counter)
        return -1;

    OMTVALUE le;
    //Save current value in prev.
    save_omtcursor_current_in_prev(cursor);

    u_int32_t start;
    u_int32_t n_entries = toku_omt_size(toku_omt_cursor_get_omt(omtc));
    int r = toku_omt_cursor_current_index(omtc, &start);
    assert(r == 0);

    for (u_int32_t pos = start; pos + 1 < n_entries; pos++) {
        r = toku_omt_cursor_next(omtc, &le);
        assert(r == 0);
        if (le_is_provdel(le))
            continue; // skip provdel entries
        //Free old current if necessary.
        if (!cursor->current_in_omt) {
            if (cursor->key.data)
                toku_free(cursor->key.data);
            if (cursor->val.data)
                toku_free(cursor->val.data);
            cursor->current_in_omt = TRUE;
        }
        return brt_cursor_copyout(cursor, outkey, outval);
    }

    // Nothing usable after the starting entry: put the OMT cursor back where it
    // started and invalidate it.
    toku_omt_cursor_set_index(omtc, start);
    toku_omt_cursor_invalidate(omtc);
    return -1;
}
2008-07-16 22:23:29 +00:00
/* Look at the nearest non-provdel entry before the OMT cursor's current index
 * without moving the cursor.
 * On success outkey/outval are filled (toku_fill_dbt) with the entry's latest
 * key and value and 0 is returned.  Returns -1 when the OMT cursor is not
 * valid, the header's root_put_counter differs from the cursor's, a fetch
 * fails, or there is no such entry. */
int toku_brt_cursor_peek_prev(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    if (!toku_omt_cursor_is_valid(cursor->omtcursor))
        return -1;

    assert(cursor->brt->h);
    u_int64_t header_counter = cursor->brt->h->root_put_counter;
    if (header_counter != cursor->root_put_counter)
        return -1;

    OMTVALUE le;
    u_int32_t index = 0;
    int r = toku_omt_cursor_current_index(cursor->omtcursor, &index);
    assert(r == 0);
    OMT omt = toku_omt_cursor_get_omt(cursor->omtcursor);

    while (index > 0) {
        r = toku_omt_fetch(omt, --index, &le, NULL);
        if (r != 0)
            break;
        if (le_is_provdel(le))
            continue; // keep walking backwards past provdel entries
        toku_fill_dbt(outkey, le_latest_key(le), le_latest_keylen(le));
        toku_fill_dbt(outval, le_latest_val(le), le_latest_vallen(le));
        return 0;
    }
    return -1;
}
/* Look at the nearest non-provdel entry after the OMT cursor's current index
 * without moving the cursor.
 * On success outkey/outval are filled (toku_fill_dbt) with the entry's latest
 * key and value and 0 is returned.  Returns -1 when the OMT cursor is not
 * valid, the header's root_put_counter differs from the cursor's, a fetch
 * fails, or there is no such entry. */
int toku_brt_cursor_peek_next(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    if (!toku_omt_cursor_is_valid(cursor->omtcursor))
        return -1;

    assert(cursor->brt->h);
    u_int64_t header_counter = cursor->brt->h->root_put_counter;
    if (header_counter != cursor->root_put_counter)
        return -1;

    OMTVALUE le;
    u_int32_t index = UINT32_MAX;
    int r = toku_omt_cursor_current_index(cursor->omtcursor, &index);
    assert(r == 0);
    OMT omt = toku_omt_cursor_get_omt(cursor->omtcursor);

    while (++index < toku_omt_size(omt)) {
        r = toku_omt_fetch(omt, index, &le, NULL);
        if (r != 0)
            break;
        if (le_is_provdel(le))
            continue; // keep walking forwards past provdel entries
        toku_fill_dbt(outkey, le_latest_key(le), le_latest_keylen(le));
        toku_fill_dbt(outval, le_latest_val(le), le_latest_vallen(le));
        return 0;
    }
    return -1;
}
2008-02-08 19:54:00 +00:00
/* Advance the cursor to the first pair strictly greater than its current
 * (key,val).  When the brt has TOKU_DB_DUP set, the OMT shortcut is tried
 * first; if it is not taken or does not return 0, a full search is performed.
 * Returns 0 from the shortcut, otherwise the result of brt_cursor_search. */
static int brt_cursor_next(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    int is_dup_db = (cursor->brt->flags & TOKU_DB_DUP) != 0;
    if (is_dup_db && brt_cursor_next_shortcut(cursor, outkey, outval) == 0)
        return 0;

    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_next, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2007-08-23 18:07:18 +00:00
2008-07-16 22:23:29 +00:00
/* Position the cursor on the first pair strictly greater than the given
 * (key,val), using a BRT_SEARCH_LEFT search with brt_cursor_compare_next.
 * The logger is obtained from txn.  Returns the result of brt_cursor_search. */
int toku_brt_cursor_after(BRT_CURSOR cursor, DBT *key, DBT *val, DBT *outkey, DBT *outval, TOKUTXN txn) {
    TOKULOGGER logger = toku_txn_logger(txn);
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_next, BRT_SEARCH_LEFT, key, val, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: nonzero when the search key k compares strictly less than
 * x.  The value y is ignored.  search->context holds the BRT. */
static int brt_cursor_compare_next_nodup(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    (void)y;
    int key_order = compare_k_x(brt, search->k, x);
    return key_order < 0; /* return min x: k < x */
}
2008-02-08 19:54:00 +00:00
/* Advance the cursor to the first pair whose key is strictly greater than the
 * current key.  When the brt does not have TOKU_DB_DUP set, the OMT shortcut is
 * tried first; if it is not taken or does not return 0, a full search is
 * performed.
 * Returns 0 from the shortcut, otherwise the result of brt_cursor_search. */
static int brt_cursor_next_nodup(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    int is_nodup_db = (cursor->brt->flags & TOKU_DB_DUP) == 0;
    if (is_nodup_db && brt_cursor_next_shortcut(cursor, outkey, outval) == 0)
        return 0;

    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_next_nodup, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: 1 when k < x; when k == x, 1 only if y is non-NULL and
 * v < y; otherwise 0.  search->context holds the BRT. */
static int brt_cursor_compare_next_dup(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int keycmp = compare_k_x(brt, search->k, x);
    if (keycmp < 0)
        return 1;
    if (keycmp != 0)
        return 0;
    if (!y)
        return 0;
    return compare_v_y(brt, search->v, y) < 0; /* return min xy: k <= x && v < y */
}
2007-09-07 20:25:54 +00:00
2008-02-08 19:54:00 +00:00
/* Advance to the next pair that has the same key as the current pair: a
 * BRT_SEARCH_LEFT search with brt_cursor_compare_next_dup whose result must
 * match the current key (brt_cursor_search_eq_k_x), else DB_NOTFOUND. */
static int brt_cursor_next_dup(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_next_dup, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search_eq_k_x(cursor, &search, outkey, outval, logger);
}
2007-09-07 20:25:54 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate: 1 when k < x; when k == x, 1 if y is NULL or v <= y;
 * otherwise 0.  search->context holds the BRT. */
static int brt_cursor_compare_get_both_range(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int keycmp = compare_k_x(brt, search->k, x);
    if (keycmp < 0)
        return 1;
    if (keycmp > 0)
        return 0;
    if (y == 0)
        return 1;
    return compare_v_y(brt, search->v, y) <= 0; /* return min xy: k <= x && v <= y */
}
2007-09-07 20:25:54 +00:00
2008-02-08 19:54:00 +00:00
/* Position the cursor using a BRT_SEARCH_LEFT search with
 * brt_cursor_compare_get_both_range on (key,val); the found pair must have a
 * key equal to `key` (brt_cursor_search_eq_k_x), else DB_NOTFOUND. */
static int brt_cursor_get_both_range(BRT_CURSOR cursor, DBT *key, DBT *val, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_get_both_range, BRT_SEARCH_LEFT, key, val, cursor->brt);
    return brt_cursor_search_eq_k_x(cursor, &search, outkey, outval, logger);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: nonzero when the search pair (k,v) compares strictly
 * greater than (x,y).  search->context holds the BRT. */
static int brt_cursor_compare_prev(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int order = compare_kv_xy(brt, search->k, search->v, x, y);
    return order > 0; /* return max xy: kv > xy */
}
2008-06-02 21:35:02 +00:00
static int brt_cursor_prev_shortcut(BRT_CURSOR cursor, DBT *outkey, DBT *outval)
// Effect: If possible, decrement the cursor and return the key-value pair
//  (i.e., the previous one from what the cursor pointed to before.)
//  That is, do DB_PREV on DUP databases, and do DB_PREV_NODUP on NODUP databases.
// Returns: -1 when the shortcut cannot be used (OMT cursor not valid, the
//  header's root_put_counter differs from the cursor's, or no earlier
//  non-provdel entry exists in the OMT); otherwise the result of
//  brt_cursor_copyout for the entry stepped to.
{
    OMTCURSOR omtc = cursor->omtcursor;
    if (!toku_omt_cursor_is_valid(omtc))
        return -1;

    u_int64_t header_counter = cursor->brt->h->root_put_counter;
    if (header_counter != cursor->root_put_counter)
        return -1;

    OMTVALUE le;
    //Save current value in prev.
    save_omtcursor_current_in_prev(cursor);

    u_int32_t start = 0;
    int r = toku_omt_cursor_current_index(omtc, &start);
    assert(r == 0);

    for (u_int32_t pos = start; pos > 0; pos--) {
        r = toku_omt_cursor_prev(omtc, &le);
        assert(r == 0);
        if (le_is_provdel(le))
            continue; // skip provdel entries
        //Free old current if necessary.
        if (!cursor->current_in_omt) {
            if (cursor->key.data)
                toku_free(cursor->key.data);
            if (cursor->val.data)
                toku_free(cursor->val.data);
            cursor->current_in_omt = TRUE;
        }
        return brt_cursor_copyout(cursor, outkey, outval);
    }

    // Nothing usable before the starting entry: put the OMT cursor back where
    // it started and invalidate it.
    toku_omt_cursor_set_index(omtc, start);
    toku_omt_cursor_invalidate(omtc);
    return -1;
}
2008-07-16 22:23:29 +00:00
/* Position the cursor on the last pair strictly less than the given (key,val),
 * using a BRT_SEARCH_RIGHT search with brt_cursor_compare_prev.
 * The logger is obtained from txn.  Returns the result of brt_cursor_search. */
int toku_brt_cursor_before(BRT_CURSOR cursor, DBT *key, DBT *val, DBT *outkey, DBT *outval, TOKUTXN txn) {
    TOKULOGGER logger = toku_txn_logger(txn);
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_prev, BRT_SEARCH_RIGHT, key, val, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2008-02-08 19:54:00 +00:00
/* Move the cursor to the last pair strictly less than its current (key,val).
 * When the brt has TOKU_DB_DUP set, the OMT shortcut is tried first; if it is
 * not taken or does not return 0, a full search is performed.
 * Returns 0 from the shortcut, otherwise the result of brt_cursor_search. */
static int brt_cursor_prev(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    int is_dup_db = (cursor->brt->flags & TOKU_DB_DUP) != 0;
    if (is_dup_db && brt_cursor_prev_shortcut(cursor, outkey, outval) == 0)
        return 0;

    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_prev, BRT_SEARCH_RIGHT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: nonzero when the search key k compares strictly greater
 * than x.  The value y is ignored.  search->context holds the BRT. */
static int brt_cursor_compare_prev_nodup(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    (void)y;
    int key_order = compare_k_x(brt, search->k, x);
    return key_order > 0; /* return max x: k > x */
}
2007-09-11 16:30:58 +00:00
2008-02-08 19:54:00 +00:00
/* Move the cursor to the last pair whose key is strictly less than the current
 * key.  When the brt does not have TOKU_DB_DUP set, the OMT shortcut is tried
 * first; if it is not taken or does not return 0, a full search is performed.
 * Returns 0 from the shortcut, otherwise the result of brt_cursor_search. */
static int brt_cursor_prev_nodup(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    int is_nodup_db = (cursor->brt->flags & TOKU_DB_DUP) == 0;
    if (is_nodup_db && brt_cursor_prev_shortcut(cursor, outkey, outval) == 0)
        return 0;

    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_prev_nodup, BRT_SEARCH_RIGHT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2007-09-11 16:30:58 +00:00
2008-01-25 15:43:37 +00:00
#ifdef DB_PREV_DUP
/* Search predicate: 1 when k > x; when k == x, 1 only if y is non-NULL and
 * v > y; otherwise 0.  search->context holds the BRT. */
static int brt_cursor_compare_prev_dup(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int keycmp = compare_k_x(brt, search->k, x);
    if (keycmp > 0)
        return 1;
    if (keycmp != 0)
        return 0;
    if (!y)
        return 0;
    return compare_v_y(brt, search->v, y) > 0; /* return max xy: k >= x && v > y */
}

/* Move to the previous pair that has the same key as the current pair: a
 * BRT_SEARCH_RIGHT search with brt_cursor_compare_prev_dup whose result must
 * match the current key (brt_cursor_search_eq_k_x), else DB_NOTFOUND. */
static int brt_cursor_prev_dup(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_prev_dup, BRT_SEARCH_RIGHT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search_eq_k_x(cursor, &search, outkey, outval, logger);
}
#endif
2007-08-24 12:10:49 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate: nonzero when the search pair (k,v) compares <= (x,y).
 * search->context holds the BRT. */
static int brt_cursor_compare_set_range(brt_search_t *search, DBT *x, DBT *y) {
    BRT brt = search->context;
    int order = compare_kv_xy(brt, search->k, search->v, x, y);
    return order <= 0; /* return kv <= xy */
}
2008-02-08 19:54:00 +00:00
/* Position the cursor on the pair equal to (key,val): a BRT_SEARCH_LEFT search
 * with brt_cursor_compare_set_range whose result must compare equal to the
 * search pair (brt_cursor_search_eq_kv_xy), else DB_NOTFOUND. */
static int brt_cursor_set(BRT_CURSOR cursor, DBT *key, DBT *val, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_set_range, BRT_SEARCH_LEFT, key, val, cursor->brt);
    return brt_cursor_search_eq_kv_xy(cursor, &search, outkey, outval, logger);
}
2008-02-08 19:54:00 +00:00
/* Position the cursor on the first pair whose key is >= key: a BRT_SEARCH_LEFT
 * search with brt_cursor_compare_set_range and no search value.
 * Returns the result of brt_cursor_search. */
static int brt_cursor_set_range(BRT_CURSOR cursor, DBT *key, DBT *outkey, DBT *outval, TOKULOGGER logger) {
    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_set_range, BRT_SEARCH_LEFT, key, 0, cursor->brt);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
/* Dispatch a cursor get operation.
 *   key, val:  used as search inputs and/or copy-out destinations, depending on
 *              the operation.
 *   get_flags: the operation, taken from the DB_OPFLAGS_MASK bits.  Any bit set
 *              outside that mask yields EINVAL.
 *   txn:       supplies the logger.
 * On an unset cursor, DB_NEXT/DB_NEXT_NODUP behave like DB_FIRST and
 * DB_PREV/DB_PREV_NODUP like DB_LAST, while DB_NEXT_DUP/DB_PREV_DUP yield
 * EINVAL.  Unknown operations yield EINVAL.  Otherwise returns the result of
 * the selected helper. */
int toku_brt_cursor_get(BRT_CURSOR cursor, DBT *key, DBT *val, int get_flags, TOKUTXN txn) {
    int op = get_flags & DB_OPFLAGS_MASK;
    TOKULOGGER logger = toku_txn_logger(txn);

    if ((get_flags & ~DB_OPFLAGS_MASK) != 0)
        return EINVAL;

    switch (op) {
    case DB_CURRENT:
    case DB_CURRENT_BINDING:
        return brt_cursor_current(cursor, op, key, val, logger);
    case DB_FIRST:
        return brt_cursor_first(cursor, key, val, logger);
    case DB_LAST:
        return brt_cursor_last(cursor, key, val, logger);
    case DB_NEXT:
        return brt_cursor_not_set(cursor)
            ? brt_cursor_first(cursor, key, val, logger)
            : brt_cursor_next(cursor, key, val, logger);
    case DB_NEXT_DUP:
        return brt_cursor_not_set(cursor)
            ? EINVAL
            : brt_cursor_next_dup(cursor, key, val, logger);
    case DB_NEXT_NODUP:
        return brt_cursor_not_set(cursor)
            ? brt_cursor_first(cursor, key, val, logger)
            : brt_cursor_next_nodup(cursor, key, val, logger);
    case DB_PREV:
        return brt_cursor_not_set(cursor)
            ? brt_cursor_last(cursor, key, val, logger)
            : brt_cursor_prev(cursor, key, val, logger);
#ifdef DB_PREV_DUP
    case DB_PREV_DUP:
        return brt_cursor_not_set(cursor)
            ? EINVAL
            : brt_cursor_prev_dup(cursor, key, val, logger);
#endif
    case DB_PREV_NODUP:
        return brt_cursor_not_set(cursor)
            ? brt_cursor_last(cursor, key, val, logger)
            : brt_cursor_prev_nodup(cursor, key, val, logger);
    case DB_SET:
        // Search by key only; only val is copied out.
        return brt_cursor_set(cursor, key, 0, 0, val, logger);
    case DB_SET_RANGE:
        // key is both the search key and the destination for the found key.
        return brt_cursor_set_range(cursor, key, key, val, logger);
    case DB_GET_BOTH:
        // Exact (key,val) match; nothing is copied out.
        return brt_cursor_set(cursor, key, val, 0, 0, logger);
    case DB_GET_BOTH_RANGE:
        // val is both the search value and the destination for the found value.
        return brt_cursor_get_both_range(cursor, key, val, 0, val, logger);
    default:
        return EINVAL;
    }
}
2007-12-10 18:54:12 +00:00
2008-07-16 22:23:29 +00:00
/* Search predicate that wraps a caller-supplied heaviside function.
 * search->context is a HEAVI_WRAPPER; wrapper->h is called with (x, y,
 * wrapper->extra_h).  For a search whose direction has BRT_SEARCH_LEFT set the
 * predicate is h >= 0, otherwise it is h <= 0. */
static int brt_cursor_compare_heavi(brt_search_t *search, DBT *x, DBT *y) {
    HEAVI_WRAPPER wrapper = search->context;
    int h_result = wrapper->h(x, y, wrapper->extra_h);
    // wrapper->r_h must have the same sign as the final chosen element.
    // It is initialized to -1 or 1.  0's are closer to the min (max) that we
    // want, so once we hit 0 we keep it.
    if (h_result == 0)
        wrapper->r_h = 0;
    if (search->direction & BRT_SEARCH_LEFT)
        return h_result >= 0;
    return h_result <= 0;
}
// toku_dbt_fake is a placeholder DBT handed to the search functions as the
// search key (or val): they will not pass the key (or val) on to the heaviside
// function if the search key (or val) is NULL.
// It is not used for anything else; the actual 'extra' information for the
// heaviside function is carried inside the wrapper.
static const DBT __toku_dbt_fake ;
static const DBT * const toku_dbt_fake = & __toku_dbt_fake ;
/* Position the cursor with a heaviside-function search.
 *   direction: negative selects BRT_SEARCH_RIGHT, anything else BRT_SEARCH_LEFT.
 *   wrapper:   becomes the search context read by brt_cursor_compare_heavi.
 * The placeholder toku_dbt_fake is used as the search key, and also as the
 * search val when the brt has TOKU_DB_DUPSORT set (NULL otherwise).
 * The logger is obtained from txn.  Returns the result of brt_cursor_search. */
int toku_brt_cursor_get_heavi(BRT_CURSOR cursor, DBT *outkey, DBT *outval, TOKUTXN txn, int direction, HEAVI_WRAPPER wrapper) {
    TOKULOGGER logger = toku_txn_logger(txn);
    DBT *fake_key = (DBT *)toku_dbt_fake;
    DBT *fake_val = cursor->brt->flags & TOKU_DB_DUPSORT ? (DBT *)toku_dbt_fake : NULL;

    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_heavi,
                    direction < 0 ? BRT_SEARCH_RIGHT : BRT_SEARCH_LEFT,
                    fake_key,
                    fake_val,
                    wrapper);
    return brt_cursor_search(cursor, &search, outkey, outval, logger);
}
2008-06-18 21:38:01 +00:00
/* Accumulate, for the subtree rooted at the node named by nodename, how many leaf entries
 * are less than, equal to, and greater than key.
 *  brt       tree being examined; supplies the cachefile, comparison function and flags.
 *  nodename  cache key of the subtree root.
 *  fullhash  hash for nodename; asserted to equal the pinned node's own fullhash.
 *  key       key being ranked.
 *  less, equal, greater  counters that are ADDED to (never reset here).
 * The node is pinned for the duration of the call and unpinned before returning.
 * Pin and unpin failures trip an assert.
 * For nonleaf nodes, children that lie wholly on one side of key (or wholly equal to it)
 * contribute their stored leafentry estimate; only children that straddle key are recursed into. */
static void toku_brt_keyrange_internal(BRT brt, CACHEKEY nodename, u_int32_t fullhash, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) {
    BRTNODE node;
    {
        void *node_v;
        int rr = toku_cachetable_get_and_pin(brt->cf, nodename, fullhash,
                                             &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
        assert(rr == 0);
        node = node_v;
        assert(node->fullhash == fullhash);
    }
    if (node->height > 0) {
        // Child i is bounded on the left by pivot i-1 and on the right by pivot i.
        // There are n_children-1 pivots; the leftmost child has no left pivot (treated as
        // "pivot < key", i.e. -1) and the rightmost has no right pivot (treated as
        // "pivot > key", i.e. +1).
        int n_keys = node->u.n.n_children - 1;
        // Comparison of the left-bounding pivot against key.  It is carried from one
        // iteration to the next, so no variable-length array is needed (and a
        // single-child node cannot produce a zero-length one).
        int prevcomp = -1;
        int i;
        for (i = 0; i < node->u.n.n_children; i++) {
            int nextcomp = 1;
            if (i < n_keys) {
                // Every child except the last has a real right-bounding pivot; use it.
                struct kv_pair *pivot = node->u.n.childkeys[i];
                DBT dbt;
                nextcomp = brt->compare_fun(brt->db, toku_fill_dbt(&dbt, kv_pair_key(pivot), kv_pair_keylen(pivot)), key);
            }
            // Held in 64 bits so the estimate is not narrowed before it is added to the
            // 64-bit counters.
            u_int64_t subest = BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i);
            if (nextcomp < 0) {
                // The right pivot is below key: the whole child is to the left of key.
                *less += subest;
            } else if (prevcomp > 0) {
                // The left pivot is above key: the whole child is to the right of key.
                *greater += subest;
            } else if (prevcomp == 0 && nextcomp == 0) {
                // Both bounding pivots equal key: the whole child matches.
                *equal += subest;
            } else {
                // nextcomp>=0 and prevcomp<=0, so something in the child could match,
                // but they are not both zero, so it is not the whole child: recurse.
                toku_brt_keyrange_internal(brt, BNC_BLOCKNUM(node, i), compute_child_fullhash(brt->cf, node, i), key, less, equal, greater);
            }
            prevcomp = nextcomp;
        }
    } else {
        // Leaf: look the key up in the leaf's OMT.
        BRT_CMD_S cmd = { BRT_INSERT, 0, .u.id = {key, 0} };
        struct cmd_leafval_bessel_extra be = {brt, &cmd, 0};
        u_int32_t idx;
        int r = toku_omt_find_zero(node->u.l.buffer, toku_cmd_leafval_bessel, &be, 0, &idx, NULL);
        // Everything before idx is counted as smaller than key.
        // NOTE(review): this relies on toku_omt_find_zero storing a position in idx even
        // when it reports no match (r != 0) -- confirm against the OMT API.
        *less += idx;
        if (r == 0 && (brt->flags & TOKU_DB_DUP)) {
            // There is a match and duplicates are allowed, so find the rightmost extent.
            u_int32_t idx2;
            r = toku_omt_find(node->u.l.buffer, toku_cmd_leafval_bessel, &be, +1, 0, &idx2, NULL);
            if (r == 0) {
                // idx2 is the first entry past the run of matches.
                *greater += toku_omt_size(node->u.l.buffer) - idx2;
                *equal += idx2 - idx;
            } else {
                // Nothing lies past the matches: they run to the end of the leaf.
                *equal += toku_omt_size(node->u.l.buffer) - idx;
            }
        } else {
            // No duplicates (or no match): everything from idx onward is greater,
            // except for the single matching entry when there is one.
            *greater += toku_omt_size(node->u.l.buffer) - idx;
            if (r == 0) {
                (*greater)--;
                (*equal)++;
            }
        }
    }
    {
        int rr = toku_unpin_brtnode(brt, node);
        assert(rr == 0);
    }
}
/* Count the leaf entries that are less than, equal to, and greater than key.
 *  brt    open tree; brt->h must be non-NULL (asserted).
 *  key    key being ranked.
 *  less, equal, greater  outputs; each is zeroed here and then filled in by walking
 *                        the tree from its root.
 * Always returns 0. */
int toku_brt_keyrange(BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) {
    assert(brt->h);

    u_int32_t root_hash;
    CACHEKEY *root = toku_calculate_root_offset_pointer(brt, &root_hash);

    // The recursive helper only adds to the counters, so start them from zero.
    *greater = 0;
    *equal = 0;
    *less = 0;

    toku_brt_keyrange_internal(brt, *root, root_hash, key, less, equal, greater);
    return 0;
}
2008-01-28 20:49:10 +00:00
/* Delete the key/value pair the cursor is positioned on.
 *  cursor  cursor identifying the pair; must already be set.
 *  flags   0 or DB_DELETE_ANY; any other bit yields EINVAL.
 *  txn     transaction under which the delete is performed.
 * Unless DB_DELETE_ANY is given, the cursor's current position is first re-read with
 * brt_cursor_current, and a failure there is returned without deleting anything.
 * Returns EINVAL for bad flags or an unset cursor, otherwise the result of
 * brt_cursor_current (on failure) or of toku_brt_delete_both. */
int toku_brt_cursor_delete(BRT_CURSOR cursor, int flags, TOKUTXN txn) {
    if (flags & ~DB_DELETE_ANY) {
        return EINVAL;
    }
    if (brt_cursor_not_set(cursor)) {
        return EINVAL;
    }

    if (!(flags & DB_DELETE_ANY)) {
        int rc = brt_cursor_current(cursor, DB_CURRENT, 0, 0, toku_txn_logger(txn));
        if (rc != 0) {
            return rc;
        }
    }

    // When the cursor's current pair still lives in the OMT, copy it into
    // cursor->key / cursor->val before they are handed to the delete.
    if (cursor->current_in_omt) {
        load_dbts_from_omt(cursor, &cursor->key, &cursor->val);
    }
    return toku_brt_delete_both(cursor->brt, &cursor->key, &cursor->val, txn);
}
2008-02-25 22:46:48 +00:00
/* Report the current height of the root node of an open brt.
 *  brt     open tree; brt->h must be non-NULL (asserted).
 *  height  receives the root node's height on success.
 * Returns 0 on success, or the error from toku_cachetable_get_and_pin if the root
 * cannot be pinned (in which case *height is left untouched).
 * A failure to unpin the root trips an assert. */
int toku_brt_height_of_root(BRT brt, int *height) {
    assert(brt->h);

    u_int32_t root_hash;
    CACHEKEY *root = toku_calculate_root_offset_pointer(brt, &root_hash);

    void *pinned;
    int r = toku_cachetable_get_and_pin(brt->cf, *root, root_hash, &pinned, NULL,
                                        toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
    if (r != 0) {
        return r;
    }

    BRTNODE root_node = pinned;
    *height = root_node->height;

    r = toku_unpin_brtnode(brt, root_node);
    assert(r == 0);
    return 0;
}
2008-03-19 19:23:45 +00:00
2008-07-21 02:34:13 +00:00
/* Count the entries on brt->cursors by walking the circular list once.
 * Returns the number of list elements between the head's next pointer and the head. */
int toku_brt_get_cursor_count(BRT brt) {
    struct list *head = &brt->cursors;
    struct list *it = head->next;
    int count = 0;
    // The list is circular: arriving back at the head means every element was visited.
    while (it != head) {
        count++;
        it = it->next;
    }
    return count;
}
2008-04-22 20:39:50 +00:00
// State handed to the move_it callback while an OMT's leaf entries are being copied
// into a fresh mempool.
struct omt_compressor_state {
struct mempool * new_kvspace ; // destination mempool the leaf entries are copied into
OMT omt ; // OMT whose slots are re-pointed at the copied entries
} ;
2008-04-07 01:30:25 +00:00
2008-04-25 13:45:55 +00:00
/* OMT iteration callback: copy one leaf entry into the new mempool and re-point the
 * OMT slot at the copy.
 *  lev  the leaf entry currently stored at position idx.
 *  idx  position of that entry within the OMT.
 *  v    a struct omt_compressor_state naming the destination mempool and the OMT.
 * Always returns 0. */
static int move_it(OMTVALUE lev, u_int32_t idx, void *v) {
    struct omt_compressor_state *state = v;
    LEAFENTRY old_le = lev;
    u_int32_t nbytes = leafentry_memsize(old_le);

    LEAFENTRY copy = toku_mempool_malloc(state->new_kvspace, nbytes, 1);
    // The destination is a fresh mempool, so this allocation is not expected to fail.
    assert(copy);

    memcpy(copy, old_le, nbytes);
    toku_omt_set_at(state->omt, copy, idx);
    return 0;
}
2008-05-29 03:12:59 +00:00
/* Compact the leaf entries of omt into a freshly allocated mempool, growing the pool
 * if needed.
 *  omt         OMT whose entries currently live in *memp.
 *  memp        mempool to compact; replaced by the new pool on success.
 *  added_size  extra bytes the caller is about to allocate from the pool.
 * The space needed is the bytes in use (free_offset - frag_size) plus added_size.  If
 * that amount plus 25% slack reaches the current pool size, the new pool is made that
 * large; otherwise the current size is kept.
 * Returns 0 on success.  Returns ENOMEM if the new buffer cannot be allocated, in
 * which case *memp is left completely unchanged (including its size field). */
static int omt_compress_kvspace(OMT omt, struct mempool *memp, size_t added_size) {
    // Kept in size_t, like added_size, so the sum is not truncated to 32 bits.
    size_t total_size_needed = memp->free_offset - memp->frag_size + added_size;
    size_t padded_size = total_size_needed + total_size_needed / 4;

    // Decide on the new size in a local first.  The pool's own size field must not
    // change until the allocation has succeeded, otherwise a failed allocation would
    // leave the pool claiming more bytes than its buffer actually holds.
    size_t new_size = memp->size;
    if (padded_size >= new_size) {
        new_size = padded_size;
    }

    void *newmem = toku_malloc(new_size);
    if (newmem == 0)
        return ENOMEM;

    struct mempool new_kvspace;
    toku_mempool_init(&new_kvspace, newmem, new_size);

    // Copy every leaf entry into the new pool and re-point the OMT at the copies.
    struct omt_compressor_state oc = { &new_kvspace, omt };
    toku_omt_iterate(omt, move_it, &oc);

    // The old buffer is no longer referenced by the OMT; release it and adopt the new pool.
    toku_free(memp->base);
    *memp = new_kvspace;
    return 0;
}
2008-04-22 20:39:50 +00:00
/* Allocate size bytes from mp, compacting (and possibly growing) the pool on demand.
 *  omt   OMT whose entries live in mp; needed so they can be relocated during compaction.
 *  mp    mempool to allocate from.
 *  size  number of bytes wanted.
 * Returns the allocation, or NULL if the first attempt fails and the pool cannot be
 * compacted.  After a successful compaction the retry is asserted to succeed. */
void *mempool_malloc_from_omt(OMT omt, struct mempool *mp, size_t size) {
    void *chunk = toku_mempool_malloc(mp, size, 1);
    if (chunk != 0) {
        return chunk;
    }

    // First attempt failed: try to make room, and give up (returning NULL) if that fails.
    if (omt_compress_kvspace(omt, mp, size) != 0) {
        return chunk;
    }

    chunk = toku_mempool_malloc(mp, size, 1);
    assert(chunk);
    return chunk;
}