2007-07-13 19:37:47 +00:00
/* -*- mode: C; c-basic-offset: 4 -*- */
2008-01-24 15:10:32 +00:00
# ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
2007-11-29 14:18:54 +00:00
2007-07-13 19:37:47 +00:00
/* Buffered repository tree.
* Observation : The in - memory representation of a node doesn ' t have to be the same as the on - disk representation .
* Goal for the in - memory representation : fast
* Goal for on - disk : small
*
* So to get this running fast , I ' ll make a version that doesn ' t do range queries :
* use a hash table for in - memory
* simply write the strings on disk .
* Later I ' ll do a PMA or a skiplist for the in - memory version .
* Also , later I ' ll convert the format to network order from host order .
* Later , for on disk , I ' ll compress it ( perhaps with gzip , perhaps with the bzip2 algorithm . )
*
* The collection of nodes forms a data structure like a B - tree . The complexities of keeping it balanced apply .
*
* We always write nodes to a new location on disk .
* The nodes themselves contain the information about the tree structure .
* Q : During recovery , how do we find the root node without looking at every block on disk ?
* A : The root node is the designated root recorded near the front of the freelist .
* The freelist is updated infrequently . Before updating the stable copy of the freelist , we make sure that
* the root is up - to - date . We can make the freelist - and - root update be an arbitrarily small fraction of disk bandwidth .
*
*/
# include <assert.h>
2007-11-26 18:47:44 +00:00
# include <errno.h>
# include <inttypes.h>
# include <stdio.h>
# include <stdlib.h>
2007-07-13 19:37:47 +00:00
# include <string.h>
# include <unistd.h>
2007-11-26 18:47:44 +00:00
# include "brt-internal.h"
# include "key.h"
# include "log_header.h"
2007-07-13 19:37:47 +00:00
/* Counter defined in another translation unit.  In the part of this file
 * visible here it is only mentioned by a commented-out debug printf. */
extern long long n_items_malloced ;
2008-01-18 21:28:27 +00:00
/* Forward declaration: reserves `size` bytes of file space for `brt`, stores
 * the resulting offset in *res and returns an int status (0 is treated as
 * success by the caller in this file).  The last parameter is the
 * transaction under which the change is logged. */
static int malloc_diskblock ( DISKOFF * res , BRT brt , int size , TOKUTXN ) ;
2007-11-14 17:58:38 +00:00
//static void verify_local_fingerprint_nonleaf (BRTNODE node);
2007-07-13 19:37:47 +00:00
/* Frees a node, including all the stuff in the hash table. */
2007-11-29 15:09:14 +00:00
void toku_brtnode_free ( BRTNODE * nodep ) {
2007-08-01 02:37:21 +00:00
BRTNODE node = * nodep ;
2007-07-13 19:37:47 +00:00
int i ;
//printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, node, node->mdicts[0]);
if ( node - > height > 0 ) {
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) {
2007-07-20 18:00:14 +00:00
toku_free ( ( void * ) node - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
}
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2008-01-11 14:03:33 +00:00
if ( node - > u . n . buffers [ i ] ) {
toku_fifo_free ( & node - > u . n . buffers [ i ] ) ;
2007-07-13 19:37:47 +00:00
}
2007-08-23 18:07:18 +00:00
assert ( node - > u . n . n_cursors [ i ] = = 0 ) ;
2007-07-13 19:37:47 +00:00
}
} else {
if ( node - > u . l . buffer ) // The buffer may have been freed already, in some cases.
2007-11-20 00:32:25 +00:00
toku_pma_free ( & node - > u . l . buffer ) ;
2007-07-13 19:37:47 +00:00
}
2007-07-20 18:00:14 +00:00
toku_free ( node ) ;
2007-08-01 02:37:21 +00:00
* nodep = 0 ;
2007-07-13 19:37:47 +00:00
}
2007-11-29 15:09:14 +00:00
/* Size figure reported to the cachetable for a node: the running total of
 * bytes held in all child buffers when height > 0, otherwise the byte count
 * of the leaf buffer.  The node must carry the TYP_BRTNODE tag. */
static long brtnode_size(BRTNODE node) {
    assert(node->tag == TYP_BRTNODE);
    if (node->height > 0) {
        return node->u.n.n_bytes_in_buffers;
    }
    return node->u.l.n_bytes_in_buffer;
}
2008-01-23 19:44:13 +00:00
/* Stamp the node's log_lsn with the transaction's last LSN.
 * Does nothing when there is no transaction. */
static void toku_update_brtnode_lsn(BRTNODE node, TOKUTXN txn) {
    if (txn == 0) {
        return;
    }
    node->log_lsn = toku_txn_get_last_lsn(txn);
}
2008-01-23 18:06:23 +00:00
/* Recompute the subtree fingerprint that `node` stores for child slot
 * `childnum_of_node`.
 *
 * The new value is the child's local fingerprint plus, when the child has
 * height > 0, the subtree fingerprints the child records for each of its own
 * children.  The node is marked dirty, the old and new values are logged
 * under `txn`, and the node's log LSN is refreshed.
 *
 * No attempt is made to skip the write when the value is unchanged: this is
 * only called when the child's fingerprint is believed to have changed. */
static void fixup_child_fingerprint(BRTNODE node, int childnum_of_node, BRTNODE child, BRT brt, TOKUTXN txn) {
    u_int32_t previous = BRTNODE_CHILD_SUBTREE_FINGERPRINTS(node, childnum_of_node);
    u_int32_t subtree  = child->local_fingerprint;

    if (child->height > 0) {
        for (int c = 0; c < child->u.n.n_children; c++) {
            subtree += BRTNODE_CHILD_SUBTREE_FINGERPRINTS(child, c);
        }
    }

    BRTNODE_CHILD_SUBTREE_FINGERPRINTS(node, childnum_of_node) = subtree;
    node->dirty = 1;

    toku_log_changechildfingerprint(txn,
                                    toku_txn_get_txnid(txn),
                                    toku_cachefile_filenum(brt->cf),
                                    node->thisnodename,
                                    childnum_of_node,
                                    previous,
                                    subtree);
    toku_update_brtnode_lsn(node, txn);
}
2007-12-06 13:52:52 +00:00
/* Compare (key, data) against the pivot `ck`, which is interpreted as a
 * struct kv_pair.
 *
 * The keys are always compared with brt->compare_fun.  Only when the tree
 * has TOKU_DB_DUPSORT set, the keys compare equal and `data` is non-NULL is
 * the tie broken by comparing `data` with the pivot's value through
 * brt->dup_compare.  Returns the comparison result. */
static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) {
    struct kv_pair *pivot = (struct kv_pair *) ck;
    DBT scratch;

    int cmp = brt->compare_fun(brt->db, key,
                               toku_fill_dbt(&scratch, kv_pair_key(pivot), kv_pair_keylen(pivot)));
    if (cmp != 0) {
        return cmp;
    }
    if (!(brt->flags & TOKU_DB_DUPSORT) || data == 0) {
        return cmp;
    }
    return brt->dup_compare(brt->db, data,
                            toku_fill_dbt(&scratch, kv_pair_val(pivot), kv_pair_vallen(pivot)));
}
2007-10-03 19:34:31 +00:00
2007-11-29 15:09:14 +00:00
/* Flush callback registered with the cachetable for BRT nodes.
 *
 *   write_me - serialize the node to the cachefile's descriptor at its own
 *              offset.
 *   keep_me  - when false, the in-memory node is freed afterwards.
 *
 * `size`, `modified_lsn` and `rename_p` are accepted but unused.  The node
 * must be the one stored under `nodename`. */
void toku_brtnode_flush_callback(CACHEFILE cachefile, DISKOFF nodename, void *brtnode_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
    BRTNODE node = brtnode_v;

    if (0) {
        /* Disabled debug trace. */
        printf(" %s:%d toku_brtnode_flush_callback %p keep_me=%d height=%d ", __FILE__, __LINE__, node, keep_me, node->height);
        if (node->height == 0) printf(" pma=%p ", node->u.l.buffer);
        printf(" \n ");
    }

    assert(node->thisnodename == nodename);

    if (write_me) {
        toku_serialize_brtnode_to(toku_cachefile_fd(cachefile), node->thisnodename, node->nodesize, node);
    }
    if (!keep_me) {
        toku_brtnode_free(&node);
    }
}
2007-11-29 15:09:14 +00:00
/* Fetch callback registered with the cachetable for BRT nodes.
 *
 * Deserializes the node stored at `nodename` into *brtnode_pv, using the
 * flags, node size, comparison functions and db handle of the BRT passed via
 * `extraargs`.  On success (*sizep) receives brtnode_size() of the node and
 * (*written_lsn) its disk_lsn.  On failure neither output is touched.
 * Returns the deserializer's status. */
int toku_brtnode_fetch_callback(CACHEFILE cachefile, DISKOFF nodename, void **brtnode_pv, long *sizep, void *extraargs, LSN *written_lsn) {
    BRT brt = (BRT) extraargs;
    BRTNODE *nodep = (BRTNODE *) brtnode_pv;

    int status = toku_deserialize_brtnode_from(toku_cachefile_fd(cachefile), nodename, nodep,
                                               brt->flags, brt->nodesize,
                                               brt->compare_fun, brt->dup_compare,
                                               brt->db, toku_cachefile_filenum(brt->cf));
    if (status != 0) {
        return status;
    }

    *sizep = brtnode_size(*nodep);
    *written_lsn = (*nodep)->disk_lsn;
    return status;
}
2007-11-29 15:09:14 +00:00
/* Flush callback registered with the cachetable for the BRT header, which
 * always lives at offset 0.
 *
 *   write_me - serialize the header to the cachefile's descriptor.
 *   keep_me  - when false, free the header, including the named-root name
 *              strings and the names/roots arrays if there are any named
 *              roots.
 *
 * The header must not be dirty by the time it gets here. */
void toku_brtheader_flush_callback(CACHEFILE cachefile, DISKOFF nodename, void *header_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
    struct brt_header *hdr = header_v;

    assert(nodename == 0);
    assert(!hdr->dirty); // shouldn't be dirty once it is unpinned.

    if (write_me) {
        toku_serialize_brt_header_to(toku_cachefile_fd(cachefile), hdr);
    }
    if (keep_me) {
        return;
    }

    if (hdr->n_named_roots > 0) {
        for (int r = 0; r < hdr->n_named_roots; r++) {
            toku_free(hdr->names[r]);
        }
        toku_free(hdr->names);
        toku_free(hdr->roots);
    }
    toku_free(hdr);
}
2007-11-29 15:09:14 +00:00
/* Fetch callback registered with the cachetable for the BRT header.
 * Deserializes the header (always at offset 0) into *headerp_v and returns
 * the deserializer's status.  `sizep` and `extraargs` are unused. */
int toku_brtheader_fetch_callback(CACHEFILE cachefile, DISKOFF nodename, void **headerp_v, long *sizep __attribute__((unused)), void *extraargs __attribute__((__unused__)), LSN *written_lsn) {
    struct brt_header **headerp = (struct brt_header **) headerp_v;

    assert(nodename == 0);

    int status = toku_deserialize_brtheader_from(toku_cachefile_fd(cachefile), nodename, headerp);

    /* !!! WRONG.  This should be stored or kept redundantly or something. */
    written_lsn->lsn = 0;

    return status;
}
2007-11-14 17:58:38 +00:00
/* Pin the BRT header (cachetable key 0) of `cf`, fetching it through the
 * header callbacks if needed.  On success stores the header in *header and
 * returns 0; on failure returns the cachetable's status and leaves *header
 * alone. */
int toku_read_and_pin_brt_header(CACHEFILE cf, struct brt_header **header) {
    void *pinned;
    int status = toku_cachetable_get_and_pin(cf, 0, &pinned, NULL,
                                             toku_brtheader_flush_callback,
                                             toku_brtheader_fetch_callback,
                                             0);
    if (status == 0) {
        *header = pinned;
    }
    return status;
}
2007-11-14 17:58:38 +00:00
/* Unpin the BRT header, handing its current dirty flag to the cachetable.
 * Afterwards the header's dirty flag is cleared and brt->h is dropped.
 * Returns the status of the unpin. */
int toku_unpin_brt_header(BRT brt) {
    int status = toku_cachetable_unpin(brt->cf, 0, brt->h->dirty, 0);

    brt->h->dirty = 0;
    brt->h = 0;

    return status;
}
2008-01-23 19:44:13 +00:00
/* Unpin `node` in the cachetable, reporting its dirty flag and its current
 * brtnode_size().  Returns the cachetable's status. */
static int unpin_brtnode(BRT brt, BRTNODE node) {
    long current_size = brtnode_size(node);
    return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, current_size);
}
2007-07-13 19:37:47 +00:00
/* A key/value pair described by two (pointer, length) pairs.
 * KVPAIR is a pointer to this struct.  In the part of the file visible here
 * it is only used by the disabled kvpair_compare(). */
typedef struct kvpair {
/* key bytes and their length */
bytevec key ;
unsigned int keylen ;
/* value bytes and their length */
bytevec val ;
unsigned int vallen ;
} * KVPAIR ;
2007-11-20 00:02:51 +00:00
#if 0
/* Disabled: qsort-style comparator ordering two KVPAIRs by key only,
 * using toku_keycompare(). */
int kvpair_compare(const void *av, const void *bv) {
    const KVPAIR lhs = (const KVPAIR) av;
    const KVPAIR rhs = (const KVPAIR) bv;
    return toku_keycompare(lhs->key, lhs->keylen, rhs->key, rhs->keylen);
}
#endif
2007-07-13 19:37:47 +00:00
/* Forgot to handle the case where there is something in the freelist. */
2008-01-18 21:28:27 +00:00
static int malloc_diskblock_header_is_in_memory ( DISKOFF * res , BRT brt , int size , TOKUTXN txn ) {
2007-11-14 17:58:38 +00:00
DISKOFF result = brt - > h - > unused_memory ;
2007-07-13 19:37:47 +00:00
brt - > h - > unused_memory + = size ;
2007-12-06 18:56:46 +00:00
brt - > h - > dirty = 1 ;
2008-01-18 21:28:27 +00:00
int r = toku_log_changeunusedmemory ( txn , toku_txn_get_txnid ( txn ) , toku_cachefile_filenum ( brt - > cf ) , result , brt - > h - > unused_memory ) ;
* res = result ;
return r ;
2007-07-13 19:37:47 +00:00
}
2008-01-18 21:28:27 +00:00
/* Reserve `size` bytes of file space for `brt`.
 *
 * Stores the offset of the new block in *res and returns the status of
 * logging the allocation under `txn`.  The header is expected to be resident
 * already (brt->h), so this simply forwards to
 * malloc_diskblock_header_is_in_memory().
 *
 * The former `#if 0` alternative that pinned and rewrote the header around
 * the allocation has been removed: it called helpers with signatures that no
 * longer match and returned a DISKOFF from an int function.  `static` is
 * spelled out to agree with the forward declaration. */
static int malloc_diskblock(DISKOFF *res, BRT brt, int size, TOKUTXN txn) {
    return malloc_diskblock_header_is_in_memory(res, brt, size, txn);
}
2007-11-14 17:58:38 +00:00
/* Fill in a freshly allocated node `n` that will live at offset `nodename`
 * with the given height (which must be >= 0).
 *
 * Common fields: tag, node size and flags copied from the tree header, both
 * LSNs zeroed, layout version 1, a fresh random fingerprint seed, a zero
 * local fingerprint, and the dirty bit set.
 *
 * height > 0: the node starts with zero children and zeroed byte counts,
 * cursor counts and subtree fingerprints for all TREE_FANOUT+1 slots.  The
 * child keys, child offsets and child buffers are NOT initialized here.
 *
 * height == 0: an empty PMA is created as the leaf buffer and configured
 * with the tree's duplicate mode and duplicate comparison function. */
static void initialize_brtnode(BRT t, BRTNODE n, DISKOFF nodename, int height) {
    n->tag              = TYP_BRTNODE;
    n->nodesize         = t->h->nodesize;
    n->flags            = t->h->flags;
    n->thisnodename     = nodename;
    n->disk_lsn.lsn     = 0; // a new one can always be 0.
    n->log_lsn          = n->disk_lsn;
    n->layout_version   = 1;
    n->height           = height;
    n->rand4fingerprint = random();
    n->local_fingerprint = 0;
    n->dirty            = 1;

    assert(height >= 0);

    if (height > 0) {
        int slot;

        n->u.n.n_children = 0;
        n->u.n.totalchildkeylens = 0;

        for (slot = 0; slot < TREE_FANOUT + 1; slot++) {
            BRTNODE_CHILD_SUBTREE_FINGERPRINTS(n, slot) = 0;
            n->u.n.n_bytes_in_buffer[slot] = 0;
            n->u.n.n_cursors[slot] = 0; // This one is simpler to initialize properly
        }
        n->u.n.n_bytes_in_buffers = 0;
    } else {
        /* Counts how many leaves have been initialized; only ever consulted
         * by debug output. */
        static int rcount = 0;

        int status = toku_pma_create(&n->u.l.buffer, t->compare_fun, t->db,
                                     toku_cachefile_filenum(t->cf), n->nodesize);
        assert(status == 0);

        toku_pma_set_dup_mode(n->u.l.buffer, t->flags & (TOKU_DB_DUP + TOKU_DB_DUPSORT));
        toku_pma_set_dup_compare(n->u.l.buffer, t->dup_compare);

        rcount++;
        n->u.l.n_bytes_in_buffer = 0;
    }
}
2008-01-15 21:50:45 +00:00
/* Allocate, initialize and register a brand-new node of the given height.
 *
 * A disk block of the tree's node size is reserved, the node is initialized
 * at that offset, handed to the caller through *result, put into the
 * cachetable with the node flush/fetch callbacks, and its creation is logged
 * under `txn`.  Every step is asserted to succeed. */
static void create_new_brtnode(BRT t, BRTNODE *result, int height, TOKUTXN txn) {
    TAGMALLOC(BRTNODE, n);
    DISKOFF blocknum;
    int status;

    status = malloc_diskblock(&blocknum, t, t->h->nodesize, txn);
    assert(status == 0);
    assert(n);
    assert(t->h->nodesize > 0);

    initialize_brtnode(t, n, blocknum, height);
    *result = n;
    assert(n->nodesize > 0);

    status = toku_cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n),
                                 toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t);
    assert(status == 0);

    status = toku_log_newbrtnode(txn,
                                 toku_txn_get_txnid(txn),
                                 toku_cachefile_filenum(t->cf),
                                 n->thisnodename,
                                 height,
                                 n->nodesize,
                                 (t->flags & TOKU_DB_DUPSORT) != 0,
                                 n->rand4fingerprint);
    assert(status == 0);

    toku_update_brtnode_lsn(n, txn);
}
2007-11-29 15:09:14 +00:00
/* Empty out `node` and remove it from the cachetable without writing it
 * back to disk.
 *
 * Leaf (height == 0): the buffer is freed if present and its byte count is
 * cleared.
 *
 * Nonleaf: every remaining child buffer is freed, each child's byte count is
 * cleared, and no cursor may still be attached to any child.  The totals and
 * the child count are then reset and the node is turned into an empty leaf
 * (height 0, NULL buffer).
 * NOTE(review): pivot keys are not freed here; the caller visible in this
 * file (brt_nonleaf_split) has already moved them out -- confirm for any
 * other callers.
 *
 * Fix: the per-child byte count was cleared with a constant index 0 inside
 * the loop, so only slot 0 was ever reset; it now uses the loop index. */
static void delete_node(BRT t, BRTNODE node) {
    int i;

    assert(node->height >= 0);

    if (node->height == 0) {
        if (node->u.l.buffer) {
            toku_pma_free(&node->u.l.buffer);
        }
        node->u.l.n_bytes_in_buffer = 0;
    } else {
        for (i = 0; i < node->u.n.n_children; i++) {
            if (node->u.n.buffers[i]) {
                toku_fifo_free(&node->u.n.buffers[i]);
            }
            node->u.n.n_bytes_in_buffer[i] = 0;
            assert(node->u.n.n_cursors[i] == 0);
        }
        node->u.n.n_bytes_in_buffers = 0;
        node->u.n.totalchildkeylens = 0;
        node->u.n.n_children = 0;
        node->height = 0;
        node->u.l.buffer = 0; /* It's a leaf now (height==0) so set the buffer to NULL. */
    }

    toku_cachetable_remove(t->cf, node->thisnodename, 0); /* Don't write it back to disk. */
}
2008-01-11 14:03:33 +00:00
/* Append the command (type, k, v) to the buffer that nonleaf `node` keeps
 * for child `childnum`.
 *
 * On success the node's local fingerprint and both byte counters (per child
 * and total) are updated, the node is marked dirty and 0 is returned.  If
 * the enqueue fails its status is returned and the node is left untouched. */
static int insert_to_buffer_in_nonleaf(BRTNODE node, int childnum, DBT *k, DBT *v, int type) {
    unsigned int added = BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + k->size + v->size;

    int status = toku_fifo_enq(node->u.n.buffers[childnum], k->data, k->size, v->data, v->size, type);
    if (status != 0) {
        return status;
    }

    node->local_fingerprint += node->rand4fingerprint * toku_calccrc32_cmd(type, k->data, k->size, v->data, v->size);
    node->u.n.n_bytes_in_buffer[childnum] += added;
    node->u.n.n_bytes_in_buffers += added;
    node->dirty = 1;
    return 0;
}
2007-12-04 10:02:59 +00:00
/* Split leaf `node` in two.
 *
 * A new leaf is created and toku_pma_split() distributes the contents of
 * node's buffer between node and the new leaf, maintaining each side's byte
 * count, local fingerprint and log LSN and producing the split key in
 * `splitk`.  The original node is reused as the left half.
 *
 * Outputs: *nodea = node, *nodeb = the new leaf.  Both halves are asserted
 * to serialize to less than their node size.  Always returns 0; failures
 * trip assertions instead. */
static int brtleaf_split(TOKUTXN txn, FILENUM filenum, BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) {
    BRTNODE sibling;

    assert(node->height == 0);
    assert(t->h->nodesize >= node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */

    create_new_brtnode(t, &sibling, 0, txn);

    assert(sibling->nodesize > 0);
    assert(node->nodesize > 0);
    assert(node->height > 0 || node->u.l.buffer != 0);

    int status = toku_pma_split(txn, filenum,
                                node->thisnodename, node->u.l.buffer, &node->u.l.n_bytes_in_buffer,
                                node->rand4fingerprint, &node->local_fingerprint, &node->log_lsn,
                                splitk,
                                sibling->thisnodename, sibling->u.l.buffer, &sibling->u.l.n_bytes_in_buffer,
                                sibling->rand4fingerprint, &sibling->local_fingerprint, &sibling->log_lsn);
    assert(status == 0);
    assert(node->height > 0 || node->u.l.buffer != 0);

    *nodea = node;
    *nodeb = sibling;

    assert(toku_serialize_brtnode_size(node) < node->nodesize);
    assert(toku_serialize_brtnode_size(sibling) < sibling->nodesize);
    return 0;
}
2008-01-11 14:03:33 +00:00
/* Account for moving the whole buffer `table_being_moved` from `oldnode` to
 * `newnode`: the checksums of all its entries are summed once, and that sum,
 * scaled by each node's own fingerprint seed, is subtracted from the old
 * node's local fingerprint and added to the new node's. */
static void brt_update_fingerprint_when_moving_hashtable(BRTNODE oldnode, BRTNODE newnode, FIFO table_being_moved) {
    u_int32_t crc_total = 0;

    FIFO_ITERATE(table_being_moved, key, keylen, data, datalen, type,
                 crc_total += toku_calccrc32_cmd(type, key, keylen, data, datalen));

    oldnode->local_fingerprint -= oldnode->rand4fingerprint * crc_total;
    newnode->local_fingerprint += newnode->rand4fingerprint * crc_total;
}
2007-07-24 01:32:03 +00:00
/* Side effect: sets splitk->data pointer to a malloc'd value */
2008-01-15 21:50:45 +00:00
static void brt_nonleaf_split ( BRT t , BRTNODE node , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk , TOKUTXN txn ) {
2007-07-13 19:37:47 +00:00
int n_children_in_a = node - > u . n . n_children / 2 ;
BRTNODE A , B ;
assert ( node - > height > 0 ) ;
assert ( node - > u . n . n_children > = 2 ) ; // Otherwise, how do we split? We need at least two children to split. */
assert ( t - > h - > nodesize > = node - > nodesize ) ; /* otherwise we might be in trouble because the nodesize shrank. */
2008-01-15 21:50:45 +00:00
create_new_brtnode ( t , & A , node - > height , txn ) ;
create_new_brtnode ( t , & B , node - > height , txn ) ;
2007-07-13 19:37:47 +00:00
A - > u . n . n_children = n_children_in_a ;
B - > u . n . n_children = node - > u . n . n_children - n_children_in_a ;
2007-10-03 19:34:31 +00:00
//printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B);
2007-07-13 19:37:47 +00:00
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
{
/* The first n_children_in_a go into node a.
* That means that the first n_children_in_a - 1 keys go into node a .
* The splitter key is key number n_children_in_a */
int i ;
for ( i = 0 ; i < n_children_in_a ; i + + ) {
2008-01-11 14:03:33 +00:00
FIFO htab = node - > u . n . buffers [ i ] ;
2007-12-06 20:58:45 +00:00
BRTNODE_CHILD_DISKOFF ( A , i ) = BRTNODE_CHILD_DISKOFF ( node , i ) ;
2008-01-11 14:03:33 +00:00
A - > u . n . buffers [ i ] = htab ;
A - > u . n . n_bytes_in_buffers + = ( A - > u . n . n_bytes_in_buffer [ i ] = node - > u . n . n_bytes_in_buffer [ i ] ) ;
2007-12-06 19:16:18 +00:00
BRTNODE_CHILD_SUBTREE_FINGERPRINTS ( A , i ) = BRTNODE_CHILD_SUBTREE_FINGERPRINTS ( node , i ) ;
2007-07-13 19:37:47 +00:00
2008-01-11 14:03:33 +00:00
node - > u . n . buffers [ i ] = 0 ;
node - > u . n . n_bytes_in_buffers - = node - > u . n . n_bytes_in_buffer [ i ] ;
node - > u . n . n_bytes_in_buffer [ i ] = 0 ;
2007-11-14 17:58:38 +00:00
brt_update_fingerprint_when_moving_hashtable ( node , A , htab ) ;
2007-07-13 19:37:47 +00:00
}
for ( i = n_children_in_a ; i < node - > u . n . n_children ; i + + ) {
int targchild = i - n_children_in_a ;
2008-01-11 14:03:33 +00:00
FIFO htab = node - > u . n . buffers [ i ] ;
2007-12-06 20:58:45 +00:00
BRTNODE_CHILD_DISKOFF ( B , targchild ) = BRTNODE_CHILD_DISKOFF ( node , i ) ;
2008-01-11 14:03:33 +00:00
B - > u . n . buffers [ targchild ] = htab ;
B - > u . n . n_bytes_in_buffers + = ( B - > u . n . n_bytes_in_buffer [ targchild ] = node - > u . n . n_bytes_in_buffer [ i ] ) ;
2007-12-06 19:16:18 +00:00
BRTNODE_CHILD_SUBTREE_FINGERPRINTS ( B , targchild ) = BRTNODE_CHILD_SUBTREE_FINGERPRINTS ( node , i ) ;
2007-07-13 19:37:47 +00:00
2008-01-11 14:03:33 +00:00
node - > u . n . buffers [ i ] = 0 ;
node - > u . n . n_bytes_in_buffers - = node - > u . n . n_bytes_in_buffer [ i ] ;
node - > u . n . n_bytes_in_buffer [ i ] = 0 ;
2007-11-14 17:58:38 +00:00
brt_update_fingerprint_when_moving_hashtable ( node , B , htab ) ;
2007-07-13 19:37:47 +00:00
}
for ( i = 0 ; i < n_children_in_a - 1 ; i + + ) {
A - > u . n . childkeys [ i ] = node - > u . n . childkeys [ i ] ;
2007-12-06 14:24:17 +00:00
A - > u . n . totalchildkeylens + = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ i ] ) ;
2007-12-06 14:20:47 +00:00
node - > u . n . totalchildkeylens - = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
node - > u . n . childkeys [ i ] = 0 ;
}
2007-07-24 01:32:03 +00:00
splitk - > data = ( void * ) ( node - > u . n . childkeys [ n_children_in_a - 1 ] ) ;
2007-12-06 14:20:47 +00:00
splitk - > size = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ n_children_in_a - 1 ] ) ;
node - > u . n . totalchildkeylens - = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ n_children_in_a - 1 ] ) ;
2007-07-13 19:37:47 +00:00
node - > u . n . childkeys [ n_children_in_a - 1 ] = 0 ;
for ( i = n_children_in_a ; i < node - > u . n . n_children - 1 ; i + + ) {
B - > u . n . childkeys [ i - n_children_in_a ] = node - > u . n . childkeys [ i ] ;
2007-12-06 14:20:47 +00:00
B - > u . n . totalchildkeylens + = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ i ] ) ;
node - > u . n . totalchildkeylens - = toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
node - > u . n . childkeys [ i ] = 0 ;
}
assert ( node - > u . n . totalchildkeylens = = 0 ) ;
2007-10-03 19:34:31 +00:00
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(A);
//verify_local_fingerprint_nonleaf(B);
2007-07-13 19:37:47 +00:00
}
{
int i ;
for ( i = 0 ; i < TREE_FANOUT + 1 ; i + + ) {
2008-01-11 14:03:33 +00:00
assert ( node - > u . n . buffers [ i ] = = 0 ) ;
assert ( node - > u . n . n_bytes_in_buffer [ i ] = = 0 ) ;
2007-07-13 19:37:47 +00:00
}
2008-01-11 14:03:33 +00:00
assert ( node - > u . n . n_bytes_in_buffers = = 0 ) ;
2007-07-13 19:37:47 +00:00
}
2008-01-11 14:03:33 +00:00
/* The buffer is all divied up between them, since just moved the buffers over. */
2007-07-13 19:37:47 +00:00
* nodea = A ;
* nodeb = B ;
/* Remove it from the cache table, and free its storage. */
//printf("%s:%d removing %lld\n", __FILE__, __LINE__, node->thisnodename);
delete_node ( t , node ) ;
2007-11-19 23:54:17 +00:00
assert ( toku_serialize_brtnode_size ( A ) < A - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( B ) < B - > nodesize ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-28 19:00:21 +00:00
/* Store in *childnum the index of the child of nonleaf `node` whose buffer
 * holds the most bytes.  Ties go to the lowest index.  The node must have at
 * least one child. */
static void find_heaviest_child(BRTNODE node, int *childnum) {
    int heaviest = 0;
    int heaviest_weight = node->u.n.n_bytes_in_buffer[0];

    if (0) printf(" %s:%d weights: %d ", __FILE__, __LINE__, heaviest_weight);
    assert(node->u.n.n_children > 0);

    for (int c = 1; c < node->u.n.n_children; c++) {
        int weight = node->u.n.n_bytes_in_buffer[c];
        if (0) printf(" %d ", weight);
        if (weight > heaviest_weight) {
            heaviest = c;
            heaviest_weight = weight;
        }
    }

    *childnum = heaviest;
    if (0) printf(" \n ");
}
2007-09-06 21:36:45 +00:00
/* Forward declaration: apply `cmd` to `node`.  Callers in this file read
 * *did_split afterwards and, when it is set, use *nodea / *nodeb / *split as
 * the two halves and the key separating them.  Returns 0 on success. */
static int brtnode_put_cmd(BRT t, BRTNODE node, BRT_CMD *cmd,
                           int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *split,
                           int debug, TOKUTXN txn);
2007-07-13 19:37:47 +00:00
2008-01-11 14:03:33 +00:00
/* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */
2007-09-06 21:36:45 +00:00
static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here ( BRT t , BRTNODE node , BRTNODE child ,
2007-09-28 17:11:22 +00:00
BRT_CMD * cmd ,
int childnum_of_node ,
TOKUTXN txn ) {
2007-07-13 19:37:47 +00:00
assert ( node - > height > 0 ) ; /* Not a leaf. */
2007-09-06 21:36:45 +00:00
DBT * k = cmd - > u . id . key ;
DBT * v = cmd - > u . id . val ;
2007-12-04 22:18:21 +00:00
int to_child = toku_serialize_brtnode_size ( child ) + k - > size + v - > size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD < = child - > nodesize ;
2007-11-29 14:44:03 +00:00
if ( toku_brt_debug_mode ) {
2007-07-24 01:32:03 +00:00
printf ( " %s:%d pushing %s to %s %d " , __FILE__ , __LINE__ , ( char * ) k - > data , to_child ? " child " : " hash " , childnum_of_node ) ;
2007-07-13 19:37:47 +00:00
if ( childnum_of_node + 1 < node - > u . n . n_children ) {
2007-07-24 01:32:03 +00:00
DBT k2 ;
2007-07-13 19:37:47 +00:00
printf ( " nextsplitkey=%s \n " , ( char * ) node - > u . n . childkeys [ childnum_of_node ] ) ;
2007-12-06 13:52:52 +00:00
assert ( t - > compare_fun ( t - > db , k , toku_fill_dbt ( & k2 , node - > u . n . childkeys [ childnum_of_node ] , toku_brt_pivot_key_len ( t , node - > u . n . childkeys [ childnum_of_node ] ) ) ) < = 0 ) ;
2007-07-13 19:37:47 +00:00
} else {
printf ( " \n " ) ;
}
}
2007-11-14 17:58:38 +00:00
int r ;
2007-07-13 19:37:47 +00:00
if ( to_child ) {
2007-07-24 01:32:03 +00:00
int again_split = - 1 ; BRTNODE againa , againb ;
DBT againk ;
2007-11-29 15:17:46 +00:00
toku_init_dbt ( & againk ) ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d hello!\n", __FILE__, __LINE__);
2007-11-14 17:58:38 +00:00
r = brtnode_put_cmd ( t , child , cmd ,
2007-09-28 17:11:22 +00:00
& again_split , & againa , & againb , & againk ,
0 ,
txn ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
assert ( again_split = = 0 ) ; /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */
} else {
2008-01-11 14:03:33 +00:00
r = insert_to_buffer_in_nonleaf ( node , childnum_of_node , k , v , cmd - > type ) ;
2007-07-13 19:37:47 +00:00
}
2008-01-23 18:06:23 +00:00
fixup_child_fingerprint ( node , childnum_of_node , child , t , txn ) ;
2007-11-14 17:58:38 +00:00
return r ;
2007-07-13 19:37:47 +00:00
}
2007-09-06 21:36:45 +00:00
static int push_a_brt_cmd_down ( BRT t , BRTNODE node , BRTNODE child , int childnum ,
2007-09-28 17:11:22 +00:00
BRT_CMD * cmd ,
int * child_did_split , BRTNODE * childa , BRTNODE * childb ,
DBT * childsplitk ,
TOKUTXN txn ) {
2007-07-13 19:37:47 +00:00
//if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, "");
//printf("%s:%d hello!\n", __FILE__, __LINE__);
assert ( node - > height > 0 ) ;
{
2007-09-06 21:36:45 +00:00
int r = brtnode_put_cmd ( t , child , cmd ,
2007-09-28 17:11:22 +00:00
child_did_split , childa , childb , childsplitk ,
0 ,
txn ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
}
2007-07-24 02:36:00 +00:00
2007-09-06 21:36:45 +00:00
DBT * k = cmd - > u . id . key ;
DBT * v = cmd - > u . id . val ;
2007-07-13 19:37:47 +00:00
//if (debug) printf("%s:%d %*sinserted down child_did_split=%d\n", __FILE__, __LINE__, debug, "", child_did_split);
2007-11-14 17:58:38 +00:00
node - > local_fingerprint - = node - > rand4fingerprint * toku_calccrc32_cmdstruct ( cmd ) ;
2007-07-13 19:37:47 +00:00
{
2008-01-11 14:03:33 +00:00
int r = toku_fifo_deq ( node - > u . n . buffers [ childnum ] ) ;
2007-07-24 02:36:00 +00:00
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
}
{
2007-09-06 21:36:45 +00:00
int n_bytes_removed = ( k - > size + v - > size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ) ;
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffers - = n_bytes_removed ;
node - > u . n . n_bytes_in_buffer [ childnum ] - = n_bytes_removed ;
2007-11-15 14:44:05 +00:00
node - > dirty = 1 ;
2007-07-13 19:37:47 +00:00
}
2007-11-14 17:58:38 +00:00
if ( * child_did_split ) {
2008-01-23 18:06:23 +00:00
fixup_child_fingerprint ( node , childnum , * childa , t , txn ) ;
fixup_child_fingerprint ( node , childnum + 1 , * childb , t , txn ) ;
2007-11-14 17:58:38 +00:00
} else {
2008-01-23 18:06:23 +00:00
fixup_child_fingerprint ( node , childnum , child , t , txn ) ;
2007-11-14 17:58:38 +00:00
}
2007-07-13 19:37:47 +00:00
return 0 ;
}
2007-11-30 17:40:04 +00:00
/* Forward declaration: handle_split_of_child() calls this before its definition further down. */
static int brtnode_maybe_push_down ( BRT t , BRTNODE node , int * did_split , BRTNODE * nodea , BRTNODE * nodeb , DBT * splitk , int debug , TOKUTXN txn ) ;
2007-11-28 19:00:21 +00:00
/* Running count of node splits; incremented after a nonleaf split in
 * handle_split_of_child() and after a leaf split in brt_leaf_put_cmd(). */
static int split_count = 0 ;
2007-07-13 19:37:47 +00:00
/* NODE is a nonleaf node, and its child number CHILDNUM has just been split into
 * CHILDA and CHILDB.  This function rewires NODE to refer to the two new children:
 *   - slides the child slots and pivot keys right of CHILDNUM over by one,
 *   - installs CHILDA at CHILDNUM and CHILDB at CHILDNUM+1, each with a fresh, empty buffer,
 *   - installs CHILDSPLITK as the pivot between them (the key data is alloc'd by the
 *     caller and is consumed by this call),
 *   - replays every command from the old buffer, pushing it into the proper new child
 *     when that will not cause further pushdowns/splits, and re-buffering it otherwise,
 *   - frees the old buffer (the old child itself is already gone),
 *   - unpins both new children.
 * If NODE ends up with more than TREE_FANOUT children it is split: *DID_SPLIT is set
 * to 1 and NODEA/NODEB/SPLITK describe the result.  Otherwise *DID_SPLIT is set to 0,
 * unless an oversized NODE is lightened by brtnode_maybe_push_down(), which sets it.
 * Returns 0 on success, or a nonzero code from the replay step.
 * NOTE(review): the early returns inside the replay loop skip toku_fifo_free() and
 * both unpins, so on that error path the old buffer is leaked and the children stay pinned.
 */
static int handle_split_of_child ( BRT t , BRTNODE node , int childnum ,
2007-07-24 01:32:03 +00:00
BRTNODE childa , BRTNODE childb ,
DBT * childsplitk , /* the data in the childsplitk is alloc'd and is consumed by this call. */
int * did_split , BRTNODE * nodea , BRTNODE * nodeb ,
DBT * splitk ,
2007-09-28 17:11:22 +00:00
TOKUTXN txn ) {
2007-07-13 19:37:47 +00:00
assert ( node - > height > 0 ) ;
2007-08-23 18:07:18 +00:00
assert ( 0 < = childnum & & childnum < node - > u . n . n_children ) ;
2008-01-11 14:03:33 +00:00
// Remember the split child's old buffer and its byte count; both slots are overwritten below.
FIFO old_h = node - > u . n . buffers [ childnum ] ;
int old_count = node - > u . n . n_bytes_in_buffer [ childnum ] ;
2007-07-13 19:37:47 +00:00
int cnum ;
int r ;
// The slide below writes slot n_children, i.e. one past the current last child.
assert ( node - > u . n . n_children < = TREE_FANOUT ) ;
2007-11-29 14:44:03 +00:00
if ( toku_brt_debug_mode ) {
2007-07-13 19:37:47 +00:00
int i ;
2007-07-24 01:32:03 +00:00
printf ( " %s:%d Child %d did split on %s \n " , __FILE__ , __LINE__ , childnum , ( char * ) childsplitk - > data ) ;
2007-07-13 19:37:47 +00:00
printf ( " %s:%d oldsplitkeys: " , __FILE__ , __LINE__ ) ;
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) printf ( " %s " , ( char * ) node - > u . n . childkeys [ i ] ) ;
printf ( " \n " ) ;
}
2007-11-15 14:44:05 +00:00
node - > dirty = 1 ;
2007-09-18 16:09:55 +00:00
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(node);
2007-07-13 19:37:47 +00:00
// Slide the children over.
// Every per-child array (disk offset, buffer, subtree fingerprint, byte count, cursor count)
// is shifted right by one for the slots after childnum.
for ( cnum = node - > u . n . n_children ; cnum > childnum + 1 ; cnum - - ) {
2007-12-06 20:58:45 +00:00
BRTNODE_CHILD_DISKOFF ( node , cnum ) = BRTNODE_CHILD_DISKOFF ( node , cnum - 1 ) ;
2008-01-11 14:03:33 +00:00
node - > u . n . buffers [ cnum ] = node - > u . n . buffers [ cnum - 1 ] ;
2007-12-06 19:16:18 +00:00
BRTNODE_CHILD_SUBTREE_FINGERPRINTS ( node , cnum ) = BRTNODE_CHILD_SUBTREE_FINGERPRINTS ( node , cnum - 1 ) ;
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffer [ cnum ] = node - > u . n . n_bytes_in_buffer [ cnum - 1 ] ;
2007-08-23 18:07:18 +00:00
node - > u . n . n_cursors [ cnum ] = node - > u . n . n_cursors [ cnum - 1 ] ;
2007-07-13 19:37:47 +00:00
}
2007-12-06 20:58:45 +00:00
// Install the two halves in slots childnum and childnum+1.
BRTNODE_CHILD_DISKOFF ( node , childnum ) = childa - > thisnodename ;
BRTNODE_CHILD_DISKOFF ( node , childnum + 1 ) = childb - > thisnodename ;
2007-12-05 01:07:48 +00:00
// The cursor count of slot childnum is left as it was; only the new right-hand slot is zeroed.
node - > u . n . n_cursors [ childnum + 1 ] = 0 ;
2008-01-23 18:06:23 +00:00
fixup_child_fingerprint ( node , childnum , childa , t , txn ) ;
fixup_child_fingerprint ( node , childnum + 1 , childb , t , txn ) ;
2008-01-18 16:01:25 +00:00
// Both new children start with empty buffers; the old buffer lives on in old_h until it is replayed and freed.
r = toku_fifo_create ( & node - > u . n . buffers [ childnum ] ) ; assert ( r = = 0 ) ; // ??? Should handle this error case
r = toku_fifo_create ( & node - > u . n . buffers [ childnum + 1 ] ) ; assert ( r = = 0 ) ;
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffer [ childnum ] = 0 ;
node - > u . n . n_bytes_in_buffer [ childnum + 1 ] = 0 ;
2007-11-14 17:58:38 +00:00
// Remove all the cmds from the local fingerprint. Some may get added in again when we try to push to the child.
2008-01-11 14:03:33 +00:00
FIFO_ITERATE ( old_h , skey , skeylen , sval , svallen , type ,
node - > local_fingerprint - = node - > rand4fingerprint * toku_calccrc32_cmd ( type , skey , skeylen , sval , svallen ) ) ;
2007-11-14 17:58:38 +00:00
2007-07-13 19:37:47 +00:00
// Slide the keys over
for ( cnum = node - > u . n . n_children - 1 ; cnum > childnum ; cnum - - ) {
node - > u . n . childkeys [ cnum ] = node - > u . n . childkeys [ cnum - 1 ] ;
}
2007-11-27 18:16:45 +00:00
// The new pivot takes over the caller's key data pointer (no copy is made here).
node - > u . n . childkeys [ childnum ] = ( void * ) childsplitk - > data ;
2007-07-24 01:32:03 +00:00
node - > u . n . totalchildkeylens + = childsplitk - > size ;
2007-07-13 19:37:47 +00:00
node - > u . n . n_children + + ;
2007-11-29 14:44:03 +00:00
if ( toku_brt_debug_mode ) {
2007-07-13 19:37:47 +00:00
int i ;
printf ( " %s:%d splitkeys: " , __FILE__ , __LINE__ ) ;
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) printf ( " %s " , ( char * ) node - > u . n . childkeys [ i ] ) ;
printf ( " \n " ) ;
}
2008-01-11 14:03:33 +00:00
node - > u . n . n_bytes_in_buffers - = old_count ; /* By default, they are all removed. We might add them back in. */
2007-07-13 19:37:47 +00:00
/* Keep pushing to the children, but not if the children would require a pushdown */
/* Replay of the old buffer: BRT_INSERT and BRT_DELETE_BOTH go to childa unless they compare
 * greater than the new pivot, in which case they go to childb.  Every other type goes to
 * childa, and BRT_DELETE is additionally sent to childb.
 * NOTE(review): for BRT_DELETE the result of the first push (r) is only checked after the
 * second push has already been attempted. */
2008-01-11 14:03:33 +00:00
FIFO_ITERATE ( old_h , skey , skeylen , sval , svallen , type , ( {
2007-07-24 01:32:03 +00:00
DBT skd , svd ;
2007-11-29 15:17:46 +00:00
toku_fill_dbt ( & skd , skey , skeylen ) ;
toku_fill_dbt ( & svd , sval , svallen ) ;
2007-09-06 21:36:45 +00:00
BRT_CMD brtcmd ;
2007-11-26 21:51:36 +00:00
brtcmd . type = type ; brtcmd . u . id . key = & skd ; brtcmd . u . id . val = & svd ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
2007-11-19 00:46:09 +00:00
int tochildnum = childnum ;
BRTNODE tochild = childa ;
2008-01-02 20:33:51 +00:00
if ( type = = BRT_INSERT | | type = = BRT_DELETE_BOTH ) {
2007-12-31 17:30:19 +00:00
int cmp = brt_compare_pivot ( t , & skd , & svd , childsplitk - > data ) ;
2008-01-08 21:03:17 +00:00
if ( cmp > 0 ) {
2007-11-19 00:46:09 +00:00
tochildnum = childnum + 1 ; tochild = childb ;
}
}
r = push_brt_cmd_down_only_if_it_wont_push_more_else_put_here ( t , node , tochild , & brtcmd , tochildnum , txn ) ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
2007-12-31 17:30:19 +00:00
if ( type = = BRT_DELETE ) {
int r2 = push_brt_cmd_down_only_if_it_wont_push_more_else_put_here ( t , node , childb , & brtcmd , childnum + 1 , txn ) ;
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
if ( r2 ! = 0 ) return r2 ;
}
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) return r ;
} ) ) ;
2007-09-06 21:36:45 +00:00
2008-01-11 14:03:33 +00:00
// Every old command has been replayed (pushed down or re-buffered), so the old buffer can go.
toku_fifo_free ( & old_h ) ;
2007-07-13 19:37:47 +00:00
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(childa);
//verify_local_fingerprint_nonleaf(childb);
//verify_local_fingerprint_nonleaf(node);
2007-07-13 19:37:47 +00:00
2007-11-19 23:54:17 +00:00
toku_verify_counts ( node ) ;
toku_verify_counts ( childa ) ;
toku_verify_counts ( childb ) ;
2007-07-13 19:37:47 +00:00
2008-01-23 19:44:13 +00:00
// Done with the children: unpin both, as promised in the header comment.
r = unpin_brtnode ( t , childa ) ;
2007-11-14 17:58:38 +00:00
assert ( r = = 0 ) ;
2008-01-23 19:44:13 +00:00
r = unpin_brtnode ( t , childb ) ;
2007-11-14 17:58:38 +00:00
assert ( r = = 0 ) ;
2007-07-13 19:37:47 +00:00
// Adding a child may have pushed NODE over the fanout limit, in which case NODE itself splits.
if ( node - > u . n . n_children > TREE_FANOUT ) {
//printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs);
2008-01-15 21:50:45 +00:00
// NOTE(review): any result of brt_nonleaf_split() is not examined here.
brt_nonleaf_split ( t , node , nodea , nodeb , splitk , txn ) ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d did split\n", __FILE__, __LINE__);
split_count + + ;
* did_split = 1 ;
assert ( ( * nodea ) - > height > 0 ) ;
assert ( ( * nodeb ) - > height > 0 ) ;
assert ( ( * nodea ) - > u . n . n_children > 0 ) ;
assert ( ( * nodeb ) - > u . n . n_children > 0 ) ;
2007-12-06 20:58:45 +00:00
assert ( BRTNODE_CHILD_DISKOFF ( * nodea , ( * nodea ) - > u . n . n_children - 1 ) ! = 0 ) ;
assert ( BRTNODE_CHILD_DISKOFF ( * nodeb , ( * nodeb ) - > u . n . n_children - 1 ) ! = 0 ) ;
2007-11-19 23:54:17 +00:00
assert ( toku_serialize_brtnode_size ( * nodea ) < = ( * nodea ) - > nodesize ) ;
assert ( toku_serialize_brtnode_size ( * nodeb ) < = ( * nodeb ) - > nodesize ) ;
2007-11-14 17:58:38 +00:00
//verify_local_fingerprint_nonleaf(*nodea);
//verify_local_fingerprint_nonleaf(*nodeb);
2007-07-13 19:37:47 +00:00
} else {
* did_split = 0 ;
2007-11-30 17:40:04 +00:00
if ( toku_serialize_brtnode_size ( node ) > node - > nodesize ) {
2007-12-01 13:12:56 +00:00
/* lighten the node by pushing down its buffers. this may cause
the current node to split and go away */
2007-11-30 17:40:04 +00:00
r = brtnode_maybe_push_down ( t , node , did_split , nodea , nodeb , splitk , 0 , txn ) ;
assert ( r = = 0 ) ;
}
2007-12-01 13:12:56 +00:00
if ( * did_split = = 0 ) assert ( toku_serialize_brtnode_size ( node ) < = node - > nodesize ) ;
2007-07-13 19:37:47 +00:00
}
return 0 ;
}
2007-09-06 21:36:45 +00:00
/* Drain NODE's buffer for child CHILDNUM into that child.
 *
 * The child is pinned, and then the command at the head of the buffer is repeatedly
 * peeked and handed to push_a_brt_cmd_down() until toku_fifo_peek() stops returning 0.
 * If a push makes the child split, handle_split_of_child() takes over (it unpins the
 * two new children) and no further commands are pushed; *DID_SPLIT, NODEA, NODEB and
 * SPLITK then carry whatever handle_split_of_child() reports about NODE.
 * Otherwise the child is unpinned at the end and *DID_SPLIT is set to 0.
 *
 *   debug - nonzero enables tracing; its value is also used as the indentation width.
 *
 * Returns 0 on success or the first nonzero code from a callee.
 */
static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
                                    int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
                                    DBT *splitk,
                                    int debug,
                                    TOKUTXN txn) {
    void *childnode_v;
    BRTNODE child;
    int r;
    assert(node->height>0);
    DISKOFF targetchild = BRTNODE_CHILD_DISKOFF(node, childnum);
    assert(targetchild>=0 && targetchild<t->h->unused_memory); // This assertion could fail in a concurrent setting since another process might have bumped unused memory.
    r = toku_cachetable_get_and_pin(t->cf, targetchild, &childnode_v, NULL,
                                    toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t);
    if (r!=0) return r;
    child = childnode_v;
    toku_verify_counts(child);
    if (child->height>0 && child->u.n.n_children>0) assert(BRTNODE_CHILD_DISKOFF(child, child->u.n.n_children-1)!=0);
    if (debug) printf(" %s:%d %*spush_some_brt_cmds_down to %lld \n ", __FILE__, __LINE__, debug, " ", child->thisnodename);

    /* The buffer is consumed from its head: each round peeks at the first entry, and
     * push_a_brt_cmd_down() dequeues it.  FIFO_ITERATE is not used for the draining
     * loop because entries are removed as we go. */

    if (0) {
        static int count=0;
        count++;
        printf(" %s:%d pushing %d count=%d \n ", __FILE__, __LINE__, childnum, count);
    }
    {
        bytevec key, val;
        ITEMLEN keylen, vallen;
        assert(toku_fifo_n_entries(node->u.n.buffers[childnum])>0);
        int type;
        while (0==toku_fifo_peek(node->u.n.buffers[childnum], &key, &keylen, &val, &vallen, &type)) {
            int child_did_split=0; BRTNODE childa, childb;
            DBT hk, hv;
            DBT childsplitk;
            BRT_CMD brtcmd;

            /* Wrap the peeked key/value in DBTs and build the command around them. */
            toku_fill_dbt(&hk, key, keylen);
            toku_fill_dbt(&hv, val, vallen);
            brtcmd.type = type;
            brtcmd.u.id.key = &hk;
            brtcmd.u.id.val = &hv;

            toku_init_dbt(&childsplitk);

            if (debug) printf(" %s:%d %*spush down %s \n ", __FILE__, __LINE__, debug, " ", (char*)key);
            r = push_a_brt_cmd_down(t, node, child, childnum,
                                    &brtcmd,
                                    &child_did_split, &childa, &childb,
                                    &childsplitk,
                                    txn);

            if (0) {
                unsigned int sum=0;
                FIFO_ITERATE(node->u.n.buffers[childnum], subhk __attribute__((__unused__)), hkl, hd __attribute__((__unused__)), hdl, subtype __attribute__((__unused__)),
                             sum+=hkl+hdl+KEY_VALUE_OVERHEAD+BRT_CMD_OVERHEAD);
                printf(" %s:%d sum=%d \n ", __FILE__, __LINE__, sum);
                assert(sum==node->u.n.n_bytes_in_buffer[childnum]);
            }
            if (node->u.n.n_bytes_in_buffer[childnum]>0) assert(toku_fifo_n_entries(node->u.n.buffers[childnum])>0);

            if (r!=0) {
                /* Do not leave the child pinned on the error path (this mirrors what
                 * brt_nonleaf_put_cmd_child_node() does when its put fails).  If the
                 * child already split, CHILD is no longer ours to unpin, so leave it alone. */
                if (!child_did_split) {
                    int rr = unpin_brtnode(t, child);
                    assert(rr==0);
                }
                return r;
            }
            if (child_did_split) {
                // If the child splits, we don't push down any further.
                if (debug) printf(" %s:%d %*shandle split splitkey=%s \n ", __FILE__, __LINE__, debug, " ", (char*)childsplitk.data);
                r = handle_split_of_child(t, node, childnum,
                                          childa, childb, &childsplitk,
                                          did_split, nodea, nodeb, splitk,
                                          txn);
                return r; /* Don't do any more pushing if the child splits. */
            }
        }
        if (0) printf(" %s:%d done random picking \n ", __FILE__, __LINE__);
    }
    if (debug) printf(" %s:%d %*sdone push_some_brt_cmds_down, unpinning %lld \n ", __FILE__, __LINE__, debug, " ", targetchild);
    assert(toku_serialize_brtnode_size(node)<=node->nodesize);
    r = unpin_brtnode(t, child);
    if (r!=0) return r;
    *did_split = 0;
    return 0;
}
2007-11-28 19:00:21 +00:00
/* Compute the debug level for the next nesting depth: zero (tracing off) stays zero,
 * any other level is increased by one. */
static int debugp1 (int debug) {
    if (debug == 0) {
        return 0;
    }
    return debug + 1;
}
2007-11-26 21:51:36 +00:00
/* If NODE's serialized size exceeds its nodesize, push buffered commands down into its
 * heaviest child.  The child may split as a result, and that may in turn make NODE split.
 * When nothing needs to be pushed, *DID_SPLIT is set to 0; otherwise it is whatever
 * push_some_brt_cmds_down() reports.  Returns 0 on success or the nonzero code from
 * push_some_brt_cmds_down(). */
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKUTXN txn)
{
    assert(node->height>0);
    if (debug) printf(" %s:%d %*sIn maybe_push_down in_buffer=%d childkeylens=%d size=%d \n ", __FILE__, __LINE__, debug, " ", node->u.n.n_bytes_in_buffers, node->u.n.totalchildkeylens, toku_serialize_brtnode_size(node));

    /* The node still fits: nothing to push. */
    if (!(toku_serialize_brtnode_size(node) > node->nodesize)) {
        *did_split = 0;
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        return 0;
    }

    if (debug) printf(" %s:%d %*stoo full, height=%d \n ", __FILE__, __LINE__, debug, " ", node->height);

    /* Find the heaviest child, and push stuff to it. */
    int heaviest;
    if (0) printf(" %s:%d %*sfind_heaviest_data \n ", __FILE__, __LINE__, debug, " ");
    find_heaviest_child(node, &heaviest);
    if (0) printf(" %s:%d %*spush some down from %lld into %lld (child %d) \n ", __FILE__, __LINE__, debug, " ", node->thisnodename, BRTNODE_CHILD_DISKOFF(node, heaviest), heaviest);
    assert(BRTNODE_CHILD_DISKOFF(node, heaviest)!=0);

    int rc = push_some_brt_cmds_down(t, node, heaviest, did_split, nodea, nodeb, splitk, debugp1(debug), txn);
    if (rc!=0) return rc;

    assert(*did_split==0 || *did_split==1);
    if (debug) printf(" %s:%d %*sdid push_some_brt_cmds_down did_split=%d \n ", __FILE__, __LINE__, debug, " ", *did_split);

    if (!*did_split) {
        /* No split: the push must have brought the node back under its size limit. */
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        return 0;
    }

    /* The node split: both halves must fit and must have a valid last child. */
    assert(toku_serialize_brtnode_size(*nodea) <= (*nodea)->nodesize);
    assert(toku_serialize_brtnode_size(*nodeb) <= (*nodeb)->nodesize);
    assert((*nodea)->u.n.n_children>0);
    assert((*nodeb)->u.n.n_children>0);
    assert(BRTNODE_CHILD_DISKOFF(*nodea, (*nodea)->u.n.n_children-1)!=0);
    assert(BRTNODE_CHILD_DISKOFF(*nodeb, (*nodeb)->u.n.n_children-1)!=0);
    return 0;
}
2007-09-07 13:51:47 +00:00
/* Apply CMD to the leaf NODE (height 0).
 *   BRT_INSERT       - insert or replace the pair in the leaf's PMA; if the leaf no longer
 *                      fits in its nodesize it is split, *DID_SPLIT is set to 1 and
 *                      NODEA/NODEB/SPLITK describe the halves.  Otherwise *DID_SPLIT is 0.
 *   BRT_DELETE       - delete by key (a null value is passed to toku_pma_delete).
 *   BRT_DELETE_BOTH  - delete by key and value.
 * Both delete forms set *DID_SPLIT to 0 and return BRT_OK whatever toku_pma_delete()
 * reports; the byte count and dirty flag change only when it returns BRT_OK.
 * Any other command type returns EINVAL (and leaves *DID_SPLIT untouched). */
static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
                             int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                             int debug,
                             TOKUTXN txn) {
    assert(node->height==0);
    FILENUM filenum = toku_cachefile_filenum(t->cf);

    if (cmd->type==BRT_DELETE || cmd->type==BRT_DELETE_BOTH) {
        DBT *match_val = (cmd->type==BRT_DELETE_BOTH) ? cmd->u.id.val : 0;
        u_int32_t bytes_freed;
        int del_status = toku_pma_delete(node->u.l.buffer, cmd->u.id.key, match_val, node->rand4fingerprint, &node->local_fingerprint, &bytes_freed);
        if (del_status==BRT_OK) {
            node->u.l.n_bytes_in_buffer -= bytes_freed;
            node->dirty = 1;
        }
        *did_split = 0;
        return BRT_OK;
    }

    if (cmd->type!=BRT_INSERT) {
        return EINVAL;
    }

    DBT *k = cmd->u.id.key;
    DBT *v = cmd->u.id.val;
    int replaced_v_size;
    enum pma_errors pma_status = toku_pma_insert_or_replace(node->u.l.buffer,
                                                            k, v, &replaced_v_size,
                                                            txn, filenum, node->thisnodename,
                                                            node->rand4fingerprint, &node->local_fingerprint,
                                                            &node->log_lsn);
    assert(pma_status==BRT_OK);

    /* A non-negative replaced_v_size means an existing value was overwritten, so only
     * the difference in value size is charged; otherwise a whole new item is charged. */
    if (replaced_v_size<0) {
        node->u.l.n_bytes_in_buffer += k->size + v->size + KEY_VALUE_OVERHEAD + PMA_ITEM_OVERHEAD;
    } else {
        node->u.l.n_bytes_in_buffer += v->size - replaced_v_size;
    }
    node->dirty = 1;

    /* Still fits: done. */
    if (!(toku_serialize_brtnode_size(node) > node->nodesize)) {
        *did_split = 0;
        return 0;
    }

    /* It doesn't fit, so split the leaf. */
    int split_r = brtleaf_split(txn, filenum, t, node, nodea, nodeb, splitk);
    if (split_r!=0) return split_r;
    split_count++;
    *did_split = 1;
    toku_verify_counts(*nodea);
    toku_verify_counts(*nodeb);
    if (debug) printf(" %s:%d %*snodeb->thisnodename=%lld nodeb->size=%d \n ", __FILE__, __LINE__, debug, " ", (*nodeb)->thisnodename, (*nodeb)->nodesize);
    assert(toku_serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
    assert(toku_serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
    return 0;
}
2007-11-19 00:46:09 +00:00
/* find the leftmost child that may contain the key */
2007-11-26 21:51:36 +00:00
static unsigned int brtnode_left_child ( BRTNODE node , DBT * k , DBT * d , BRT t ) {
2007-07-24 01:32:03 +00:00
int i ;
assert ( node - > height > 0 ) ;
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) {
2007-12-06 13:52:52 +00:00
int cmp = brt_compare_pivot ( t , k , d , node - > u . n . childkeys [ i ] ) ;
2007-11-19 00:46:09 +00:00
if ( cmp > 0 ) continue ;
if ( cmp < 0 ) return i ;
return i ;
2007-07-24 01:32:03 +00:00
}
return node - > u . n . n_children - 1 ;
}
2008-01-08 21:03:17 +00:00
/* Child of NODE to use for the pair (K, DATA).  At present this simply forwards to
 * brtnode_left_child(). */
static unsigned int brtnode_right_child (BRTNODE node, DBT *k, DBT *data, BRT t) {
    unsigned int which = brtnode_left_child(node, k, data, t);
    return which;
}
2008-01-08 21:03:17 +00:00
/* Put CMD directly into child CHILDNUM of NODE, bypassing NODE's buffer.
 *
 *   maybe - nonzero: pin the child with toku_cachetable_maybe_get_and_pin();
 *           zero:    pin it with toku_cachetable_get_and_pin().
 *           NOTE(review): the "maybe" variant presumably fails when the child is not
 *           already in memory - confirm against the cachetable.
 *
 * *DID_SPLIT is set to 0 up front.  If the child splits, handle_split_of_child() rewires
 * NODE (and unpins the new children) and may set *DID_SPLIT/NODEA/NODEB/SPLITK.
 * Otherwise the child's fingerprint in NODE is fixed up and the child is unpinned.
 * Returns 0 on success; a nonzero code if the pin or the put failed (on a failed put
 * the child is unpinned before returning). */
static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD *cmd,
                                           int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                                           int debug, TOKUTXN txn, int childnum, int maybe) {
    int r;
    void *child_v;
    BRTNODE child;
    int child_did_split;
    BRTNODE childa, childb;
    DBT childsplitk;

    *did_split = 0;

    if (maybe)
        r = toku_cachetable_maybe_get_and_pin(t->cf, BRTNODE_CHILD_DISKOFF(node, childnum), &child_v);
    else
        r = toku_cachetable_get_and_pin(t->cf, BRTNODE_CHILD_DISKOFF(node, childnum), &child_v, NULL,
                                        toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t);
    if (r != 0)
        return r;

    child = child_v;

    child_did_split = 0;
    /* Start from a fully initialized split key, as the other callers of brtnode_put_cmd()
     * in this file do, so no field is left indeterminate when the key is filled in. */
    toku_init_dbt(&childsplitk);
    r = brtnode_put_cmd(t, child, cmd,
                        &child_did_split, &childa, &childb, &childsplitk, debug, txn);
    if (r != 0) {
        /* putting to the child failed for some reason, so unpin the child and return the error code */
        int rr = unpin_brtnode(t, child);
        assert(rr == 0);
        return r;
    }

    if (child_did_split) {
        if (0) printf(" brt_nonleaf_insert child_split %p \n ", child);
        assert(cmd->type <= BRT_DELETE_BOTH);
        r = handle_split_of_child(t, node, childnum,
                                  childa, childb, &childsplitk,
                                  did_split, nodea, nodeb, splitk,
                                  txn);
        assert(r == 0);
    } else {
        fixup_child_fingerprint(node, childnum, child, t, txn);
        int rr = unpin_brtnode(t, child);
        assert(rr == 0);
    }
    return r;
}
2007-07-13 19:37:47 +00:00
2007-11-28 19:00:21 +00:00
/* When nonzero, brt_nonleaf_put_cmd_child() may try to push a command straight into a
 * child whose buffer is empty instead of always buffering it.  Not static, so it is
 * visible to other translation units. */
int toku_brt_do_push_cmd = 1 ;
2007-10-03 14:51:23 +00:00
2008-01-08 21:03:17 +00:00
/* Deliver CMD to child CHILDNUM of the nonleaf NODE.
 *   1. If cursors are open on that child, the node is in non-buffering mode for it:
 *      the command goes straight into the child and that result is returned.
 *   2. Else, if the child's buffer is empty, CAN_PUSH is set and pushes are enabled
 *      globally, try to put the command straight into the child (using the "maybe"
 *      pin); if that returns 0 we are done.
 *   3. Otherwise append the command to NODE's buffer for the child, account for it in
 *      the fingerprint and byte counts, mark NODE dirty, and set *DO_PUSH_DOWN to 1 so
 *      the caller knows NODE may need to push down.
 * *DO_PUSH_DOWN is written only in case 3. */
static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD *cmd,
                                      int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                                      int debug, TOKUTXN txn, unsigned int childnum, int can_push, int *do_push_down) {
    /* non-buffering mode when cursors are open on this child */
    if (node->u.n.n_cursors[childnum] > 0) {
        assert(node->u.n.n_bytes_in_buffer[childnum] == 0);
        return brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 0);
    }

    /* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */
    if (node->u.n.n_bytes_in_buffer[childnum] == 0 && can_push && toku_brt_do_push_cmd) {
        int pushed = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 1);
        if (pushed == 0)
            return pushed;
    }

    /* append the cmd to the child buffer */
    int cmdtype = cmd->type;
    DBT *key = cmd->u.id.key;
    DBT *val = cmd->u.id.val;
    int nbytes = key->size + val->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;

    int enq = toku_fifo_enq(node->u.n.buffers[childnum], key->data, key->size, val->data, val->size, cmdtype);
    assert(enq == 0);

    node->local_fingerprint += node->rand4fingerprint * toku_calccrc32_cmd(cmdtype, key->data, key->size, val->data, val->size);
    node->u.n.n_bytes_in_buffers += nbytes;
    node->u.n.n_bytes_in_buffer[childnum] += nbytes;
    node->dirty = 1;

    *do_push_down = 1;
    return 0;
}
2008-01-08 21:03:17 +00:00
/* Route a single-target command (one that belongs to exactly one subtree) through the
 * nonleaf NODE: pick the child for the command's key/value, hand the command to
 * brt_nonleaf_put_cmd_child(), and, if the command ended up buffered in NODE, give
 * NODE a chance to push its buffers down.  Returns 0 on success or the first nonzero
 * code from a callee. */
static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
                                   int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                                   int debug, TOKUTXN txn) {
    int buffered = 0;   /* set by brt_nonleaf_put_cmd_child when the cmd was appended to NODE's buffer */
    int rc;

    /* find the right subtree, and put the cmd in it */
    unsigned int target = brtnode_right_child(node, cmd->u.id.key, cmd->u.id.val, t);
    rc = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, target, 1, &buffered);
    if (rc != 0) return rc;

    if (!buffered) return 0;

    /* maybe push down */
    if (debug) printf(" %s:%d %*sDoing maybe_push_down \n ", __FILE__, __LINE__, debug, " ");
    rc = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), txn);
    if (rc != 0) return rc;
    if (debug) printf(" %s:%d %*sDid maybe_push_down \n ", __FILE__, __LINE__, debug, " ");

    if (!*did_split) {
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        toku_verify_counts(node);
        return 0;
    }

    /* NODE split while pushing down: sanity-check both halves. */
    assert(toku_serialize_brtnode_size(*nodea) <= (*nodea)->nodesize);
    assert(toku_serialize_brtnode_size(*nodeb) <= (*nodeb)->nodesize);
    assert((*nodea)->u.n.n_children > 0);
    assert((*nodeb)->u.n.n_children > 0);
    assert(BRTNODE_CHILD_DISKOFF(*nodea, (*nodea)->u.n.n_children-1) != 0);
    assert(BRTNODE_CHILD_DISKOFF(*nodeb, (*nodeb)->u.n.n_children-1) != 0);
    toku_verify_counts(*nodea);
    toku_verify_counts(*nodeb);
    return 0;
}
2007-11-19 00:46:09 +00:00
/* Deliver a delete-by-key command to every child of NODE that may hold the key,
 * starting from the leftmost such child.
 *
 * The pivots are scanned left to right, comparing on the key only (null value):
 *   - key greater than the pivot: keep scanning;
 *   - key less than the pivot:    child i is the only remaining candidate, stop;
 *   - key equal to the pivot:     with TOKU_DB_DUPSORT both child i and child i+1 are
 *                                 candidates and the scan continues; without it, child i
 *                                 is the candidate and the scan stops.
 * If no pivot selected a child, the last child is used.  Direct pushes into a child
 * are allowed only when exactly one child was selected.  If any command ended up
 * buffered in NODE, NODE then gets a chance to push its buffers down.
 * NOTE(review): when several children are selected, a later iteration still uses the
 * indices computed up front; if an earlier delivery restructured NODE they may be
 * stale - worth confirming this cannot happen. */
static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
                                   int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                                   int debug,
                                   TOKUTXN txn) {
    int r;
    int targets[TREE_FANOUT];   /* indices of the children that need the delete, ascending, no repeats */
    int ntargets = 0;
    int i;

    /* find all children that need a delete cmd */
    for (i = 0; i < node->u.n.n_children-1; i++) {
        int cmp = brt_compare_pivot(t, cmd->u.id.key, 0, node->u.n.childkeys[i]);
        if (cmp > 0)
            continue;
        /* child i is a candidate; skip it if the previous pivot already added it */
        if (ntargets == 0 || targets[ntargets-1] != i)
            targets[ntargets++] = i;
        if (cmp < 0 || !(t->flags & TOKU_DB_DUPSORT))
            break;
        /* equal to the pivot and duplicates allowed: the right neighbour is a candidate too
         * (it cannot repeat the last entry, which is i) */
        targets[ntargets++] = i+1;
    }
    if (ntargets == 0)
        targets[ntargets++] = node->u.n.n_children-1;

    /* issue the delete cmd to all of the children found previously */
    int do_push_down = 0;
    for (i = 0; i < ntargets; i++) {
        r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, targets[i], ntargets == 1, &do_push_down);
        assert(r == 0);
    }

    if (!do_push_down)
        return 0;

    /* maybe push down */
    if (debug) printf(" %s:%d %*sDoing maybe_push_down \n ", __FILE__, __LINE__, debug, " ");
    r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), txn);
    if (r != 0) return r;
    if (debug) printf(" %s:%d %*sDid maybe_push_down \n ", __FILE__, __LINE__, debug, " ");

    if (!*did_split) {
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        toku_verify_counts(node);
        return 0;
    }

    /* NODE split while pushing down: sanity-check both halves. */
    assert(toku_serialize_brtnode_size(*nodea) <= (*nodea)->nodesize);
    assert(toku_serialize_brtnode_size(*nodeb) <= (*nodeb)->nodesize);
    assert((*nodea)->u.n.n_children > 0);
    assert((*nodeb)->u.n.n_children > 0);
    assert(BRTNODE_CHILD_DISKOFF(*nodea, (*nodea)->u.n.n_children-1) != 0);
    assert(BRTNODE_CHILD_DISKOFF(*nodeb, (*nodeb)->u.n.n_children-1) != 0);
    toku_verify_counts(*nodea);
    toku_verify_counts(*nodeb);
    return 0;
}
/* Dispatch a command aimed at a nonleaf node according to its type:
 * BRT_INSERT and BRT_DELETE_BOTH go to brt_nonleaf_insert_cmd(), BRT_DELETE
 * goes to brt_nonleaf_delete_cmd().  Any other type yields EINVAL.
 * All remaining arguments are forwarded untouched. */
static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
                                int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
                                DBT *splitk,
                                int debug,
                                TOKUTXN txn) {
    switch (cmd->type) {
    case BRT_INSERT:
    case BRT_DELETE_BOTH:
        return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn);
    case BRT_DELETE:
        return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn);
    default:
        return EINVAL;
    }
}
2007-07-13 19:37:47 +00:00
2007-11-14 17:58:38 +00:00
//static void verify_local_fingerprint_nonleaf (BRTNODE node) {
// u_int32_t fp=0;
// int i;
// if (node->height==0) return;
// for (i=0; i<node->u.n.n_children; i++)
2008-01-11 14:03:33 +00:00
// FIFO_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type,
2007-11-14 17:58:38 +00:00
// ({
// fp += node->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, data, datalen);
// }));
// assert(fp==node->local_fingerprint);
//}
2007-09-06 21:36:45 +00:00
/* Apply CMD to NODE: height-0 nodes are handled by brt_leaf_put_cmd(), all
 * others by brt_nonleaf_put_cmd().  Afterwards the serialized size of the
 * surviving node(s) is asserted to fit within nodesize.
 * Returns whatever the leaf/nonleaf handler returned. */
static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
                            int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                            int debug,
                            TOKUTXN txn) {
    int status = (node->height == 0)
        ? brt_leaf_put_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn)
        : brt_nonleaf_put_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn);

    if (!*did_split) {
        assert(toku_serialize_brtnode_size(node) <= node->nodesize);
        return status;
    }

    // Watch out.  If did_split then the original node is no longer allocated,
    // so only the two halves may be inspected here.
    assert(toku_serialize_brtnode_size(*nodea) <= (*nodea)->nodesize);
    assert(toku_serialize_brtnode_size(*nodeb) <= (*nodeb)->nodesize);
    return status;
}
2007-11-29 14:44:03 +00:00
/* Create a cachetable through toku_create_cachetable().  A CACHESIZE of 0
 * selects a default of 128*1024*1024; any other value is passed through. */
int toku_brt_create_cachetable (CACHETABLE *ct, long cachesize, LSN initial_lsn, TOKULOGGER logger) {
    const long default_cachesize = 128*1024*1024;
    long effective_size = (cachesize == 0) ? default_cachesize : cachesize;
    return toku_create_cachetable(ct, effective_size, initial_lsn, logger);
}
2007-11-21 19:06:32 +00:00
/* Create an empty height-0 node at disk offset OFFSET, enter it into the
 * cachetable of T, log its creation, stamp its LSN and unpin it.
 *
 * Returns 0 on success, ENOMEM if the node cannot be allocated, or the nonzero
 * result of toku_cachetable_put(), toku_log_newbrtnode() or unpin_brtnode().
 *
 * Ownership: the node is freed here only while it is still private, i.e. when
 * toku_cachetable_put() itself fails.  After a successful put the cachetable
 * holds the pointer, so later failures must not free it.
 */
static int setup_brt_root_node (BRT t, DISKOFF offset, TOKUTXN txn) {
    int r;
    TAGMALLOC(BRTNODE, node);
    if (node == 0)
        return ENOMEM;
    initialize_brtnode(t, node,
                       offset, /* the location is one nodesize offset from 0. */
                       0);
    r = toku_cachetable_put(t->cf, offset, node, brtnode_size(node),
                            toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t);
    if (r != 0) {
        /* NOTE(review): only the node struct is released; whatever
         * initialize_brtnode() allocated inside it is not -- confirm whether a
         * dedicated node destructor should be used instead. */
        toku_free(node);
        return r;
    }
    toku_verify_counts(node);
    r = toku_log_newbrtnode(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(t->cf), offset, 0, t->h->nodesize, (t->flags & TOKU_DB_DUPSORT) != 0, node->rand4fingerprint);
    if (r != 0) {
        /* Drop our pin before reporting the logging failure; the log error is
         * the one worth returning, so the unpin result is ignored. */
        (void)unpin_brtnode(t, node);
        return r;
    }
    toku_update_brtnode_lsn(node, txn);
    /* The node now belongs to the cachetable: do not free it if unpin fails. */
    return unpin_brtnode(t, node);
}
2007-07-20 14:20:58 +00:00
//#define BRT_TRACE
2007-07-13 19:37:47 +00:00
/* WHEN_BRTTRACE(x) expands to x when BRT_TRACE is defined and to the no-op
 * expression ((void)0) otherwise, so trace statements vanish from normal builds. */
# ifdef BRT_TRACE
# define WHEN_BRTTRACE(x) x
# else
# define WHEN_BRTTRACE(x) ((void)0)
# endif
2007-11-29 14:44:03 +00:00
/* Allocate a zero-filled BRT handle and give it its defaults: an empty cursor
 * list, flags 0, BRT_DEFAULT_NODE_SIZE, and toku_default_compare_fun for both
 * the key and the duplicate comparison.
 * Stores the handle in *brt_ptr and returns 0, or returns ENOMEM (leaving
 * *brt_ptr untouched) if the allocation fails. */
int toku_brt_create (BRT *brt_ptr) {
    BRT fresh = toku_malloc(sizeof *fresh);
    if (fresh != 0) {
        memset(fresh, 0, sizeof *fresh);
        list_init(&fresh->cursors);
        fresh->flags = 0;
        fresh->nodesize = BRT_DEFAULT_NODE_SIZE;
        fresh->compare_fun = toku_default_compare_fun;
        fresh->dup_compare = toku_default_compare_fun;
        *brt_ptr = fresh;
        return 0;
    }
    return ENOMEM;
}
2007-11-29 19:32:53 +00:00
/* Replace the flag word of BRT with FLAGS.  Always returns 0. */
int toku_brt_set_flags (BRT brt, unsigned int flags) {
    int status = 0;
    brt->flags = flags;
    return status;
}
2007-11-29 19:32:53 +00:00
/* Copy the flag word of BRT into *flags.  Always returns 0. */
int toku_brt_get_flags (BRT brt, unsigned int *flags) {
    int status = 0;
    *flags = brt->flags;
    return status;
}
2007-11-29 19:32:53 +00:00
/* Record NODESIZE as the node size of BRT.  Always returns 0.
 * The value is not validated here; toku_brt_open() asserts that it is > 0. */
int toku_brt_set_nodesize (BRT brt, unsigned int nodesize) {
    int status = 0;
    brt->nodesize = nodesize;
    return status;
}
2007-11-29 19:32:53 +00:00
/* Copy the node size of BRT into *nodesize.  Always returns 0. */
int toku_brt_get_nodesize (BRT brt, unsigned int *nodesize) {
    int status = 0;
    *nodesize = brt->nodesize;
    return status;
}
2007-11-29 15:09:14 +00:00
/* Install BT_COMPARE as the key comparison function of BRT.  Always returns 0. */
int toku_brt_set_bt_compare (BRT brt, int (*bt_compare)(DB *, const DBT *, const DBT *)) {
    int status = 0;
    brt->compare_fun = bt_compare;
    return status;
}
2007-11-29 15:09:14 +00:00
/* Install DUP_COMPARE as the duplicate comparison function of BRT.  Always returns 0. */
int toku_brt_set_dup_compare (BRT brt, int (*dup_compare)(DB *, const DBT *, const DBT *)) {
    int status = 0;
    brt->dup_compare = dup_compare;
    return status;
}
2008-01-11 14:38:49 +00:00
/* Store in *fdp the descriptor that toku_cachefile_fd() reports for the
 * cachefile of BRT.  Always returns 0. */
int toku_brt_get_fd (BRT brt, int *fdp) {
    int fd = toku_cachefile_fd(brt->cf);
    *fdp = fd;
    return 0;
}
2008-01-24 13:51:34 +00:00
/* Open (and optionally create) the tree DBNAME inside file FNAME and attach it
 * to the handle T.
 *
 * If dbname is NULL then we setup to hold a single tree.  Otherwise we setup an
 * array of named roots in the header.
 *
 *   fname         path handed to open().
 *   fname_in_env  name written to the log for the fcreate/fopen records.
 *   is_create     the file, header and/or named tree may be created.
 *   only_create   with is_create: fail with EEXIST if the named tree exists.
 *   load_flags    adopt the flags stored in the header instead of failing with
 *                 EINVAL when they differ from t->flags.
 *
 * Returns 0 on success (the header is unpinned and t->h is 0 again), otherwise
 * a nonzero error: an errno from open(), ENOMEM, EINVAL (named/unnamed
 * mismatch, or flag mismatch), EEXIST, ENOENT (no such tree), or whatever a
 * cachetable/logger helper returned.  On every failure t->database_name is
 * left 0 and the private copy of DBNAME is freed.
 *
 * Cleanup is a conventional goto ladder at the end of the function; each label
 * releases one resource and falls through to the labels below it.
 */
int toku_brt_open (BRT t, const char *fname, const char *fname_in_env, const char *dbname, int is_create, int only_create, int load_flags, CACHETABLE cachetable, TOKUTXN txn, DB *db) {
    int r;
    char *malloced_name = 0;
    WHEN_BRTTRACE(fprintf(stderr, " BRTTRACE: %s:%d toku_brt_open(%s, \" %s \" , %d, %p, %d, %p) \n ",
                          __FILE__, __LINE__, fname, dbname, is_create, t, t->nodesize, cachetable));
    assert(is_create || !only_create);
    assert(!load_flags || !only_create);
    if (dbname) {
        malloced_name = toku_strdup(dbname);
        if (malloced_name == 0) {
            r = ENOMEM;
            goto died0;
        }
    }
    t->database_name = malloced_name;
    t->db = db;
    {
        int fd = open(fname, O_RDWR, 0777);
        int open_errno = errno; /* capture before anything else can clobber it */
        if (fd == -1 && open_errno == ENOENT && is_create) {
            fd = open(fname, O_RDWR | O_CREAT, 0777);
            open_errno = errno;
            if (fd != -1)
                toku_logger_log_fcreate(txn, fname_in_env, 0777);
        }
        if (fd == -1) {
            /* Covers a missing file without is_create as well as any other
             * open() failure (permissions, ...); never hand -1 to the cachetable. */
            r = open_errno;
            goto died0a;
        }
        r = toku_cachetable_openfd(&t->cf, cachetable, fd);
        if (r != 0) {
            /* t->cf is not valid, so no fopen record can be logged.
             * NOTE(review): fd is not closed here because it is not visible
             * whether toku_cachetable_openfd() keeps ownership on failure. */
            goto died0a;
        }
        toku_logger_log_fopen(txn, fname_in_env, toku_cachefile_filenum(t->cf));
    }
    assert(t->nodesize > 0);
    if (is_create) {
        r = toku_read_and_pin_brt_header(t->cf, &t->h);
        if (r == -1) {
            /* construct a new header. */
            if ((MALLOC(t->h)) == 0) {
                assert(errno == ENOMEM);
                r = ENOMEM;
                t->h = 0;
                goto died_after_read_and_pin;
            }
            t->h->dirty = 1;
            t->h->flags = t->flags;
            t->h->nodesize = t->nodesize;
            t->h->freelist = -1;
            t->h->unused_memory = 2 * t->nodesize;
            if (dbname) {
                t->h->unnamed_root = -1;
                t->h->n_named_roots = 1;
                if ((MALLOC_N(1, t->h->names)) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died2; }
                if ((MALLOC_N(1, t->h->roots)) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died3; }
                if ((t->h->names[0] = toku_strdup(dbname)) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died4; }
                t->h->roots[0] = t->nodesize;
            } else {
                t->h->unnamed_root = t->nodesize;
                t->h->n_named_roots = -1;
                t->h->names = 0;
                t->h->roots = 0;
            }
            if ((r = toku_logger_log_header(txn, toku_cachefile_filenum(t->cf), t->h))) { goto died6; }
            if ((r = setup_brt_root_node(t, t->nodesize, txn)) != 0) { goto died6; }
            if ((r = toku_cachetable_put(t->cf, 0, t->h, 0, toku_brtheader_flush_callback, toku_brtheader_fetch_callback, 0))) { goto died6; }
        }
        else if (r != 0) {
            goto died_after_read_and_pin;
        }
        else {
            /* The header already exists: add DBNAME as a new named root unless
             * it is already present. */
            int i;
            void *grown;
            assert(r == 0);
            /* NOTE(review): an existing file opened with is_create and no dbname
             * aborts here rather than returning an error -- confirm intended. */
            assert(dbname);
            if (t->h->unnamed_root != -1) { r = EINVAL; goto died_after_read_and_pin; } // Cannot create a subdb in a file that is not enabled for subdbs
            assert(t->h->n_named_roots >= 0);
            for (i = 0; i < t->h->n_named_roots; i++) {
                if (strcmp(t->h->names[i], dbname) == 0) {
                    if (only_create) {
                        r = EEXIST;
                        goto died_after_read_and_pin;
                    }
                    else goto found_it;
                }
            }
            /* Grow through a temporary so that a failed realloc neither leaks
             * nor clobbers the arrays still referenced by the cached header. */
            grown = toku_realloc(t->h->names, (1 + t->h->n_named_roots) * sizeof(*t->h->names));
            if (grown == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died_after_read_and_pin; }
            t->h->names = grown;
            grown = toku_realloc(t->h->roots, (1 + t->h->n_named_roots) * sizeof(*t->h->roots));
            if (grown == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died_after_read_and_pin; }
            t->h->roots = grown;
            t->h->n_named_roots++;
            /* NOTE(review): if any step below fails, the header keeps the
             * incremented n_named_roots with a partially filled last entry. */
            if ((t->h->names[t->h->n_named_roots-1] = toku_strdup(dbname)) == 0) { assert(errno == ENOMEM); r = ENOMEM; goto died_after_read_and_pin; }
            r = malloc_diskblock_header_is_in_memory(&t->h->roots[t->h->n_named_roots-1], t, t->h->nodesize, txn);
            if (r != 0) goto died_after_read_and_pin;
            t->h->dirty = 1;
            if ((r = setup_brt_root_node(t, t->h->roots[t->h->n_named_roots-1], txn)) != 0) goto died_after_read_and_pin;
        }
    } else {
        if ((r = toku_read_and_pin_brt_header(t->cf, &t->h)) != 0) goto died1;
        if (!dbname) {
            if (t->h->n_named_roots != -1) { r = EINVAL; goto died_after_read_and_pin; } // requires a subdb
        } else {
            int i;
            if (t->h->n_named_roots == -1) { r = EINVAL; goto died_after_read_and_pin; } // no subdbs in the db
            for (i = 0; i < t->h->n_named_roots; i++) {
                if (strcmp(t->h->names[i], dbname) == 0) {
                    goto found_it;
                }
            }
            r = ENOENT; /* the database doesn't exist */
            goto died_after_read_and_pin;
        }
    found_it:
        t->nodesize = t->h->nodesize; /* inherit the pagesize from the file */
        if (t->flags != t->h->flags) { /* flags must match */
            if (load_flags) t->flags = t->h->flags;
            else { r = EINVAL; goto died_after_read_and_pin; }
        }
    }
    assert(t->h);
    if ((r = toku_unpin_brt_header(t)) != 0) goto died1; // it's unpinned
    assert(t->h == 0);
    WHEN_BRTTRACE(fprintf(stderr, " BRTTRACE -> %p \n ", t));
    return 0;

    /* ---- error unwinding: every label falls through to the ones below ---- */
 died6: /* a freshly built header could not be logged or installed */
    if (!dbname) goto died2; /* the unnamed header owns no names/roots arrays */
    toku_free(t->h->names[0]);
 died4:
    toku_free(t->h->roots);
 died3:
    toku_free(t->h->names);
 died2:
    toku_free(t->h);
    t->h = 0;
 died_after_read_and_pin:
    toku_cachetable_unpin(t->cf, 0, 0, 0); // unpin the header
 died1:
    toku_cachefile_close(&t->cf);
 died0a:
    t->database_name = 0;
    if (malloced_name) toku_free(malloced_name);
 died0:
    assert(r);
    return r;
}
2007-11-29 15:09:14 +00:00
/* Remove the named tree DBNAME from the header of the file behind BRT.
 *
 * The header is pinned, the matching name/root pair is dropped by shifting the
 * later entries down, the header is marked dirty, and the header is unpinned.
 * FLAGS must be 0 (asserted).  The header must be one with named roots
 * (asserted).
 *
 * Returns 0 on success, ENOENT if DBNAME is not present, or the nonzero result
 * of toku_read_and_pin_brt_header().
 */
int toku_brt_remove_subdb (BRT brt, const char *dbname, u_int32_t flags) {
    int r;
    int r2 = 0;
    int i;
    int found = -1;
    assert(flags == 0);
    r = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    /* Report the failure instead of aborting; returning without an unpin
     * mirrors what brt_root_put_cmd() does for the same failure. */
    if (r != 0) return r;
    assert(brt->h->unnamed_root == -1);
    assert(brt->h->n_named_roots >= 0);
    for (i = 0; i < brt->h->n_named_roots; i++) {
        if (strcmp(brt->h->names[i], dbname) == 0) {
            found = i;
            break;
        }
    }
    if (found == -1) {
        //Should not be possible.
        r = ENOENT;
        goto error;
    }
    //Free old db name
    toku_free(brt->h->names[found]);
    //TODO: Free Diskblocks including root
    for (i = found + 1; i < brt->h->n_named_roots; i++) {
        brt->h->names[i-1] = brt->h->names[i];
        brt->h->roots[i-1] = brt->h->roots[i];
    }
    brt->h->n_named_roots--;
    brt->h->dirty = 1;
    //TODO: What if n_named_roots becomes 0?  Should we handle it specially?  Should we delete the file?
    /* Shrinking the arrays is only an optimization.  It is skipped when the
     * count reaches 0 (a zero-sized realloc may legitimately return NULL), and
     * a failed shrink simply keeps the old, larger arrays: the entry is already
     * gone, so the removal itself has succeeded. */
    if (brt->h->n_named_roots > 0) {
        void *shrunk;
        shrunk = toku_realloc(brt->h->names, (brt->h->n_named_roots) * sizeof(*brt->h->names));
        if (shrunk != 0) brt->h->names = shrunk;
        shrunk = toku_realloc(brt->h->roots, (brt->h->n_named_roots) * sizeof(*brt->h->roots));
        if (shrunk != 0) brt->h->roots = shrunk;
    }
 error:
    r2 = toku_unpin_brt_header(brt);
    assert(r2 == 0); //TODO: Can r2 be non 0?
    assert(brt->h == 0);
    return r ? r : r2;
}
2007-11-20 21:20:05 +00:00
// This one has no env
2007-11-29 15:09:14 +00:00
int toku_open_brt ( const char * fname , const char * dbname , int is_create , BRT * newbrt , int nodesize , CACHETABLE cachetable , TOKUTXN txn ,
2007-11-26 21:51:36 +00:00
int ( * compare_fun ) ( DB * , const DBT * , const DBT * ) , DB * db ) {
2007-11-14 17:58:38 +00:00
BRT brt ;
int r ;
2007-12-11 19:34:21 +00:00
const int only_create = 0 ;
const int load_flags = 0 ;
2007-11-14 17:58:38 +00:00
2007-11-29 14:44:03 +00:00
r = toku_brt_create ( & brt ) ;
2007-11-14 17:58:38 +00:00
if ( r ! = 0 )
return r ;
2007-11-29 15:09:14 +00:00
toku_brt_set_nodesize ( brt , nodesize ) ;
toku_brt_set_bt_compare ( brt , compare_fun ) ;
2007-11-14 17:58:38 +00:00
2008-01-24 13:51:34 +00:00
r = toku_brt_open ( brt , fname , fname , dbname , is_create , only_create , load_flags , cachetable , txn , db ) ;
2007-11-14 17:58:38 +00:00
if ( r ! = 0 ) {
return r ;
}
* newbrt = brt ;
return r ;
}
2007-11-29 15:09:14 +00:00
/* Close BRT: close every cursor still on its cursor list, close its cachefile
 * (asserting first that toku_cachefile_count_pinned() reports zero), then free
 * the database name, the skey/sval scratch buffers and the handle itself.
 * Returns 0, or the first nonzero result from closing a cursor or the
 * cachefile, in which case the handle is not freed. */
int toku_close_brt (BRT brt) {
    int status;

    for (;;) {
        if (list_empty(&brt->cursors))
            break;
        BRT_CURSOR cursor = list_struct(list_pop(&brt->cursors), struct brt_cursor, cursors_link);
        status = toku_brt_cursor_close(cursor);
        if (status != 0)
            return status;
    }

    if (brt->cf) {
        assert(0 == toku_cachefile_count_pinned(brt->cf, 1)); // For the brt, the pinned count should be zero.
        status = toku_cachefile_close(&brt->cf);
        if (status != 0)
            return status;
    }

    if (brt->database_name) { toku_free(brt->database_name); }
    if (brt->skey)          { toku_free(brt->skey); }
    if (brt->sval)          { toku_free(brt->sval); }
    toku_free(brt);
    return 0;
}
2007-11-29 14:44:03 +00:00
/* Global debug switch: brt_root_put_cmd() copies it into its local `debug`
 * value, which enables trace printf output when nonzero. */
int toku_brt_debug_mode = 0 ; //strcmp(key,"hello387")==0;
2007-07-13 19:37:47 +00:00
2007-11-14 17:58:38 +00:00
/* Return the address of the header slot that holds this tree's root offset:
 * &h->unnamed_root when the handle has no database name, otherwise the roots[]
 * entry whose names[] entry equals the database name.  Reads brt->h, so the
 * header must be available.  Calls abort() if the name is not in the header. */
CACHEKEY *toku_calculate_root_offset_pointer (BRT brt) {
    if (brt->database_name != 0) {
        int idx = 0;
        while (idx < brt->h->n_named_roots) {
            if (strcmp(brt->database_name, brt->h->names[idx]) == 0)
                return &brt->h->roots[idx];
            idx++;
        }
        abort();
    }
    return &brt->h->unnamed_root;
}
2008-01-23 18:06:23 +00:00
/* Build a new root above NODEA and NODEB after the old root split.
 *
 * Allocates a disk block and an in-memory node one level taller than NODEA,
 * logs the root change (unnamed or named, depending on brt->database_name),
 * stores the new offset through ROOTP and marks the header dirty.  The new
 * node gets two children (NODEA, NODEB) separated by the pivot SPLITK, one
 * empty FIFO buffer per child, and matching log records (newbrtnode, two
 * addchild, two setchild, setpivot).  NODEA and NODEB are then unpinned and
 * the new root is entered into the cachetable.
 *
 * On success stores the new root in *newrootp and returns 0.  Any nonzero
 * result from a helper is returned immediately.
 * NOTE(review): the early error returns do not release newroot or undo the
 * update already made through ROOTP; the visible caller asserts r==0.
 */
static int brt_init_new_root (BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *rootp, TOKUTXN txn, BRTNODE *newrootp) {
    TAGMALLOC(BRTNODE, newroot);
    int r;
    int new_height = nodea->height + 1;
    int new_nodesize = brt->h->nodesize;
    DISKOFF newroot_diskoff;
    assert(newroot);
    r = malloc_diskblock(&newroot_diskoff, brt, new_nodesize, txn);
    assert(r == 0);
    if (brt->database_name == 0) {
        toku_log_changeunnamedroot(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), *rootp, newroot_diskoff);
    } else {
        BYTESTRING bs;
        bs.len = 1 + strlen(brt->database_name); /* length includes the terminating NUL */
        bs.data = brt->database_name;
        toku_log_changenamedroot(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), bs, *rootp, newroot_diskoff);
    }
    *rootp = newroot_diskoff;
    brt->h->dirty = 1;
    initialize_brtnode(brt, newroot, newroot_diskoff, new_height);
    newroot->u.n.n_children = 2;
    newroot->u.n.childkeys[0] = splitk.data;
    newroot->u.n.totalchildkeylens = splitk.size;
    newroot->u.n.children[0] = nodea->thisnodename;
    newroot->u.n.children[1] = nodeb->thisnodename;
    r = toku_fifo_create(&newroot->u.n.buffers[0]); if (r != 0) return r;
    r = toku_fifo_create(&newroot->u.n.buffers[1]); if (r != 0) return r;
    toku_verify_counts(newroot);
    r = toku_log_newbrtnode(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), newroot_diskoff, new_height, new_nodesize, (brt->flags & TOKU_DB_DUPSORT) != 0, newroot->rand4fingerprint);
    if (r != 0) return r;
    r = toku_log_addchild(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), newroot_diskoff, 0);
    if (r != 0) return r;
    r = toku_log_addchild(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), newroot_diskoff, 1);
    if (r != 0) return r;
    r = toku_log_setchild(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), newroot_diskoff, 0, nodea->thisnodename);
    if (r != 0) return r;
    r = toku_log_setchild(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), newroot_diskoff, 1, nodeb->thisnodename);
    if (r != 0) return r;
    fixup_child_fingerprint(newroot, 0, nodea, brt, txn);
    fixup_child_fingerprint(newroot, 1, nodeb, brt, txn);
    {
        BYTESTRING bs = { .len  = kv_pair_keylen(newroot->u.n.childkeys[0]),
                          .data = kv_pair_key(newroot->u.n.childkeys[0]) };
        r = toku_log_setpivot(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), newroot_diskoff, 0, bs);
        if (r != 0) return r;
        toku_update_brtnode_lsn(newroot, txn);
    }
    r = unpin_brtnode(brt, nodea);
    if (r != 0) return r;
    r = unpin_brtnode(brt, nodeb);
    if (r != 0) return r;
    /* The caller goes on to unpin *newrootp, so a failed put must be reported
     * rather than handing back a node the cachetable never accepted. */
    r = toku_cachetable_put(brt->cf, newroot_diskoff, newroot, brtnode_size(newroot),
                            toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
    if (r != 0) return r;
    *newrootp = newroot;
    return 0;
}
2007-11-14 17:58:38 +00:00
/* Apply CMD at the root of the tree.
 *
 * Pins the header, locates and pins the root node, and runs brtnode_put_cmd()
 * on it.  If the root split, brt_init_new_root() builds a new root above the
 * two halves and that new root becomes the node to unpin.  Finally the node
 * and the header are unpinned (both asserted to succeed).
 *
 * Returns the result of brtnode_put_cmd(), or the nonzero result of pinning
 * the header or the root when either of those fails.
 */
static int brt_root_put_cmd (BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
    void *node_v;
    BRTNODE node;
    BRTNODE nodea = 0, nodeb = 0;
    DBT splitk;
    int did_split = 0;
    int result;
    int debug = toku_brt_debug_mode; //strcmp(key,"hello387")==0;

    int r = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    if (r != 0)
        return r;

    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
    if (debug) printf(" %s:%d Getting %lld \n ", __FILE__, __LINE__, *rootp);
    r = toku_cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL,
                                    toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
    if (r != 0) {
        /* The header is pinned but the root is not: release the header only. */
        toku_unpin_brt_header(brt);
        return r;
    }
    node = node_v;

    if (debug) printf(" %s:%d node inserting \n ", __FILE__, __LINE__);
    result = brtnode_put_cmd(brt, node, cmd,
                             &did_split, &nodea, &nodeb, &splitk,
                             debug,
                             txn);
    if (debug) printf(" %s:%d did_insert \n ", __FILE__, __LINE__);

    if (!did_split) {
        if (node->height > 0)
            assert(node->u.n.n_children <= TREE_FANOUT);
    } else {
        // node is unpinned, so now we have to proceed to update the root with a new node.
        if (nodeb->height > 0)
            assert(nodeb->u.n.children[nodeb->u.n.n_children-1] != 0);
        assert(nodeb->nodesize > 0);
        r = brt_init_new_root(brt, nodea, nodeb, splitk, rootp, txn, &node);
        assert(r == 0);
    }

    r = unpin_brtnode(brt, node);
    assert(r == 0);
    r = toku_unpin_brt_header(brt);
    assert(r == 0);
    return result;
}
2007-11-29 15:09:14 +00:00
/* Insert the pair (KEY, VAL): wraps it in a BRT_INSERT command and submits it
 * at the root under TXN.  Returns the result of brt_root_put_cmd(). */
int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
    BRT_CMD insert_cmd;
    insert_cmd.type = BRT_INSERT;
    insert_cmd.u.id.key = key;
    insert_cmd.u.id.val = val;
    return brt_root_put_cmd(brt, &insert_cmd, txn);
}
2007-11-29 15:09:14 +00:00
/* Look up K using a temporary cursor.  The cursor operation is DB_GET_BOTH
 * when the tree has TOKU_DB_DUPSORT set and DB_SET otherwise.
 * Returns the nonzero result of toku_brt_cursor() if the cursor cannot be
 * created, otherwise the result of toku_brt_cursor_get().  Closing the cursor
 * is asserted to succeed. */
int toku_brt_lookup (BRT brt, DBT *k, DBT *v) {
    BRT_CURSOR cursor;
    int open_status = toku_brt_cursor(brt, &cursor);
    if (open_status != 0)
        return open_status;

    int op;
    if (brt->flags & TOKU_DB_DUPSORT)
        op = DB_GET_BOTH;
    else
        op = DB_SET;

    int get_status = toku_brt_cursor_get(cursor, k, v, op, 0);
    int close_status = toku_brt_cursor_close(cursor);
    assert(close_status == 0);
    return get_status;
}
2007-11-29 15:09:14 +00:00
/* Delete KEY: submits a BRT_DELETE command at the root, carrying an
 * initialized DBT of size 0 as its value and a transaction argument of 0.
 * Returns the result of brt_root_put_cmd(). */
int toku_brt_delete (BRT brt, DBT *key) {
    DBT empty_val;
    BRT_CMD delete_cmd;

    toku_init_dbt(&empty_val);
    empty_val.size = 0;

    delete_cmd.type = BRT_DELETE;
    delete_cmd.u.id.key = key;
    delete_cmd.u.id.val = &empty_val;
    return brt_root_put_cmd(brt, &delete_cmd, 0);
}
2008-01-02 20:33:51 +00:00
/* Delete the specific pair (KEY, VAL): submits a BRT_DELETE_BOTH command at
 * the root with a transaction argument of 0.
 * Returns the result of brt_root_put_cmd(). */
int toku_brt_delete_both (BRT brt, DBT *key, DBT *val) {
    BRT_CMD delete_cmd;
    delete_cmd.type = BRT_DELETE_BOTH;
    delete_cmd.u.id.key = key;
    delete_cmd.u.id.val = val;
    return brt_root_put_cmd(brt, &delete_cmd, 0);
}
2007-11-20 00:35:31 +00:00
/* Forward declaration: toku_dump_brtnode() below calls this (with recurse=0)
 * on every node it prints and returns its result. */
int toku_verify_brtnode ( BRT brt , DISKOFF off , bytevec lorange , ITEMLEN lolen , bytevec hirange , ITEMLEN hilen , int recurse , BRTNODE parent_brtnode ) ;
2007-07-13 19:37:47 +00:00
2007-11-29 14:44:03 +00:00
/* Debugging aid: print the node stored at disk offset OFF and, for nonleaf
 * nodes, recursively print every child.
 *
 * The node is pinned for the duration of the dump and checked with
 * toku_verify_brtnode() (non-recursive); that verification result is what this
 * function returns.  Results of the recursive dumps of children are not folded
 * into it.  DEPTH is the indentation width.  lorange/hirange (with lengths
 * lolen/hilen) are the key bounds handed to the verifier; either may be NULL.
 *
 * Keys and values are printed with %s and asserted to be NUL-terminated
 * strings, so this is only usable on trees that hold C strings.
 */
int toku_dump_brtnode (BRT brt, DISKOFF off, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, BRTNODE parent_brtnode) {
    int result = 0;
    BRTNODE node;
    void *node_v;
    /* Passing NULL to %s is undefined, so substitute a printable marker.
     * NOTE(review): for non-root nodes the bounds are the parent's childkeys
     * entries (kv_pairs), so printing them as plain strings is probably not
     * meaningful -- confirm what callers pass at the top level. */
    const char *lo_str = lorange ? (const char *)lorange : "(null)";
    const char *hi_str = hirange ? (const char *)hirange : "(null)";
    int r = toku_cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
                                        toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
    assert(r == 0);
    printf(" %s:%d pin %p \n ", __FILE__, __LINE__, node_v);
    node = node_v;
    result = toku_verify_brtnode(brt, off, lorange, lolen, hirange, hilen, 0, parent_brtnode);
    printf(" %*sNode=%p \n ", depth, " ", node);
    if (node->height > 0) {
        int i;
        printf(" %*sNode %lld nodesize=%d height=%d n_children=%d n_bytes_in_buffers=%d keyrange=%s %s \n ",
               depth, " ", off, node->nodesize, node->height, node->u.n.n_children, node->u.n.n_bytes_in_buffers, lo_str, hi_str);
        /* There is one buffer per child, so visit all n_children of them
         * (the loop used to stop one short and never showed the last buffer). */
        for (i = 0; i < node->u.n.n_children; i++) {
            printf(" %*schild %d buffered (%d entries): \n ", depth+1, " ", i, toku_fifo_n_entries(node->u.n.buffers[i]));
            FIFO_ITERATE(node->u.n.buffers[i], key, keylen, data, datalen, type,
                         ({
                             printf(" %*s %s %s %d \n ", depth+2, " ", (char*)key, (char*)data, type);
                             assert(strlen((char*)key)+1 == keylen);
                             assert(strlen((char*)data)+1 == datalen);
                         }));
        }
        for (i = 0; i < node->u.n.n_children; i++) {
            printf(" %*schild %d \n ", depth, " ", i);
            if (i > 0) {
                /* childkeys[] entries are kv_pairs; print the key bytes rather
                 * than the start of the pair structure. */
                printf(" %*spivot %d=%s \n ", depth+1, " ", i-1, (char*)kv_pair_key(node->u.n.childkeys[i-1]));
            }
            /* Child i is bounded below by pivot i-1 and above by pivot i; the
             * outermost children inherit this node's own bounds. */
            toku_dump_brtnode(brt, BRTNODE_CHILD_DISKOFF(node, i), depth+4,
                              (i == 0) ? lorange : node->u.n.childkeys[i-1],
                              (i == 0) ? lolen   : toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]),
                              (i == node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i],
                              (i == node->u.n.n_children-1) ? hilen   : toku_brt_pivot_key_len(brt, node->u.n.childkeys[i]),
                              node
                              );
        }
    } else {
        printf(" %*sNode %lld nodesize=%d height=%d n_bytes_in_buffer=%d keyrange=%s %s \n ",
               depth, " ", off, node->nodesize, node->height, node->u.l.n_bytes_in_buffer, lo_str, hi_str);
        /* The self-assignments only silence unused-variable warnings for the
         * lengths that PMA_ITERATE declares. */
        PMA_ITERATE(node->u.l.buffer, key, keylen, val, vallen,
                    (keylen = keylen, vallen = vallen, printf(" %s:%s ", (char*)key, (char*)val)));
        printf(" \n ");
    }
    r = toku_cachetable_unpin(brt->cf, off, 0, 0);
    assert(r == 0);
    return result;
}
2007-11-29 15:09:14 +00:00
/* Debugging aid: pin the header of BRT, print split_count, and dump the whole
 * tree starting at its root via toku_dump_brtnode.
 * Returns the first nonzero result from pinning the header, dumping the tree,
 * or unpinning the header; 0 if all three succeed. */
int toku_dump_brt(BRT brt) {
    int r = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    if (r != 0)
        return r;

    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
    printf(" split_count=%d \n ", split_count);

    r = toku_dump_brtnode(brt, *rootp, 0, 0, 0, 0, 0, null_brtnode);
    if (r != 0) {
        /* release the header before reporting the dump failure */
        toku_unpin_brt_header(brt);
        return r;
    }
    return toku_unpin_brt_header(brt);
}
2008-01-07 19:53:50 +00:00
/* Print the block number (OFF divided by the header's nodesize) of the node at
 * OFF, then recurse into each child of a nonleaf node.
 * OFF is asserted to be a multiple of the nodesize.  The node stays pinned
 * while its children are visited.  Returns the first nonzero result from the
 * pin or from a child, otherwise the result of the final unpin. */
static int show_brtnode_blocknumbers(BRT brt, DISKOFF off) {
    void *pinned;

    assert(off % brt->h->nodesize == 0);
    int r = toku_cachetable_get_and_pin(brt->cf, off, &pinned, NULL,
                                        toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
    if (r != 0)
        return r;

    printf(" %s:%d pin %p \n ", __FILE__, __LINE__, pinned);
    BRTNODE node = pinned;
    printf(" %lld ", off / brt->h->nodesize);

    if (node->height > 0) {
        for (int child = 0; child < node->u.n.n_children; child++) {
            r = show_brtnode_blocknumbers(brt, BRTNODE_CHILD_DISKOFF(node, child));
            if (r != 0) {
                /* give the pin back before propagating the child's error */
                toku_cachetable_unpin(brt->cf, off, 0, 0);
                return r;
            }
        }
    }
    return toku_cachetable_unpin(brt->cf, off, 0, 0);
}
2007-11-28 19:00:21 +00:00
#if 0
/* Disabled debugging aid: print the block numbers of every node in BRT.
 * Kept compiled-out, but updated so that it matches the current two-argument
 * signature of show_brtnode_blocknumbers (the old call passed a stale third
 * argument) and can be re-enabled without further changes.
 * Returns the first nonzero result from pinning the header, walking the tree,
 * or unpinning the header; 0 otherwise. */
int show_brt_blocknumbers(BRT brt) {
    int r = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    if (r != 0)
        return r;

    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
    /* %p requires a void pointer */
    printf(" BRT %p has blocks: ", (void *) brt);
    r = show_brtnode_blocknumbers(brt, *rootp);
    if (r != 0) {
        toku_unpin_brt_header(brt);
        return r;
    }
    printf(" \n ");
    return toku_unpin_brt_header(brt);
}
#endif
2007-07-13 19:37:47 +00:00
2007-08-24 12:10:49 +00:00
2008-01-25 15:43:37 +00:00
/* Copy VALLEN bytes at VAL into YBT via toku_dbt_set_value, using the tree's
 * key scratch area (brt->skey).  Returns toku_dbt_set_value's result. */
int toku_brt_dbt_set_key(BRT brt, DBT *ybt, bytevec val, ITEMLEN vallen) {
    return toku_dbt_set_value(ybt, val, vallen, &brt->skey);
}
2007-08-24 12:10:49 +00:00
2008-01-25 15:43:37 +00:00
/* Copy VALLEN bytes at VAL into YBT via toku_dbt_set_value, using the tree's
 * value scratch area (brt->sval).  Returns toku_dbt_set_value's result. */
int toku_brt_dbt_set_value(BRT brt, DBT *ybt, bytevec val, ITEMLEN vallen) {
    return toku_dbt_set_value(ybt, val, vallen, &brt->sval);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Describes a node split reported back up the tree during a search.
 * The search routines pass the addresses of these fields as out-parameters to
 * push_some_brt_cmds_down and handle_split_of_child; toku_brt_search hands
 * nodea, nodeb and splitk to brt_init_new_root when did_split is set. */
typedef struct brt_split {
/* nonzero when a split happened and the remaining fields are meaningful */
int did_split ;
/* first of the two nodes produced by the split */
BRTNODE nodea ;
/* second of the two nodes produced by the split */
BRTNODE nodeb ;
/* key associated with the split, passed along with nodea and nodeb */
DBT splitk ;
} BRT_SPLIT ;
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Reset SPLIT to the "no split happened" state: did_split cleared, both node
 * pointers zeroed, and splitk initialised with toku_init_dbt. */
static inline void brt_split_init(BRT_SPLIT *split) {
    split->did_split = 0;
    split->nodeb = 0;
    split->nodea = 0;
    toku_init_dbt(&split->splitk);
}
2008-01-25 15:43:37 +00:00
/* Forward declaration: brt_search_child calls brt_search_node, which (through
 * brt_search_nonleaf_node) calls brt_search_child again. */
static int brt_search_node ( BRT brt , BRTNODE node , brt_search_t * search , DBT * newkey , DBT * newval , BRT_SPLIT * split ) ;
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Search child number CHILDNUM of nonleaf NODE.
 *
 * If the buffer for that child still holds bytes, some commands are pushed
 * down first and EAGAIN is returned: the push may split the child or NODE
 * itself (reported through SPLIT), so CHILDNUM may no longer be right and the
 * caller must retry.
 *
 * Otherwise the child is pinned and searched, repeating while the search
 * answers EAGAIN.  If the child reports a split, handle_split_of_child is
 * called (any resulting split of NODE is reported through SPLIT) and the
 * search result is returned; if not, the child is unpinned here before
 * returning.  All helper calls are asserted to succeed. */
static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split) {
    void *child_v;
    int status;

    if (node->u.n.n_bytes_in_buffer[childnum] > 0) {
        status = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, 0);
        assert(status == 0);
        return EAGAIN;
    }

    status = toku_cachetable_get_and_pin(brt->cf, node->u.n.children[childnum], &child_v, NULL,
                                         toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
    assert(status == 0);
    BRTNODE childnode = child_v;

    for (;;) {
        BRT_SPLIT childsplit;
        brt_split_init(&childsplit);

        int r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit);

        if (childsplit.did_split) {
            /* the child was split underneath us: fix up NODE, no unpin here */
            status = handle_split_of_child(brt, node, childnum, childsplit.nodea, childsplit.nodeb, &childsplit.splitk,
                                           &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0);
            assert(status == 0);
            return r;
        }
        if (r != EAGAIN) {
            status = toku_cachetable_unpin(brt->cf, childnode->thisnodename, childnode->dirty, brtnode_size(childnode));
            assert(status == 0);
            return r;
        }
        /* EAGAIN without a split: search the same child again */
    }
}
2008-01-25 15:43:37 +00:00
/* Search a nonleaf NODE.
 * Children are visited left-to-right when the search direction has
 * BRT_SEARCH_LEFT set, otherwise right-to-left.  For each child except the
 * last one in visiting order, the pivot between it and the next child is
 * offered to search->compare (the pivot's value is supplied only for
 * TOKU_DB_DUPSORT trees); a child is searched only when compare accepts its
 * pivot.  The scan stops as soon as a child search returns 0 or EAGAIN.  If
 * the scan runs to completion with the result still DB_NOTFOUND, the final
 * child in visiting order is searched. */
static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split) {
    const int leftward = search->direction & BRT_SEARCH_LEFT;
    int result = DB_NOTFOUND;
    int i;

    /* a linear scan is used; binary search is overkill for a small array */
    int order[node->u.n.n_children];
    for (i = 0; i < node->u.n.n_children; i++) {
        if (leftward)
            order[i] = i;
        else
            order[i] = node->u.n.n_children - 1 - i;
    }

    for (i = 0; i < node->u.n.n_children - 1; i++) {
        /* pivot separating this child from the next one in visiting order */
        int pivotnum = leftward ? order[i] : order[i] - 1;
        struct kv_pair *pivot = node->u.n.childkeys[pivotnum];
        DBT pivotkey, pivotval;
        DBT *keyp = toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot));
        DBT *valp = 0;
        if (brt->flags & TOKU_DB_DUPSORT)
            valp = toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot));

        if (!search->compare(search, keyp, valp))
            continue;

        result = brt_search_child(brt, node, order[i], search, newkey, newval, split);
        if (result == 0 || result == EAGAIN)
            break;
    }

    /* check the first (left) or last (right) node if nothing has been found */
    if (result == DB_NOTFOUND && i == node->u.n.n_children - 1)
        result = brt_search_child(brt, node, order[i], search, newkey, newval, split);
    return result;
}
2008-01-25 15:43:37 +00:00
/* Search a leaf NODE by delegating to toku_pma_search on its PMA buffer.
 * BRT and SPLIT are accepted only so the signature matches the nonleaf
 * variant; they are not used. */
static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split) {
    (void) brt;
    (void) split;
    return toku_pma_search(node->u.l.buffer, search, newkey, newval);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Dispatch a search on NODE: height > 0 means a nonleaf node, anything else
 * is treated as a leaf. */
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split) {
    if (node->height <= 0)
        return brt_search_leaf_node(brt, node, search, newkey, newval, split);
    return brt_search_nonleaf_node(brt, node, search, newkey, newval, split);
}
2008-01-25 15:43:37 +00:00
/* Run SEARCH against BRT starting from the root, storing the matching pair in
 * NEWKEY / NEWVAL.
 * The header is pinned for the whole operation.  Each attempt pins the current
 * root, searches it, installs a new root through brt_init_new_root if the root
 * reported a split, and unpins; attempts repeat for as long as the search
 * answers EAGAIN.  Pin/unpin/new-root failures are asserted against; the
 * return value is the result of the last search attempt. */
int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval) {
    int status = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    assert(status == 0);

    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
    int r;

    do {
        void *root_v;
        status = toku_cachetable_get_and_pin(brt->cf, *rootp, &root_v, NULL,
                                             toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
        assert(status == 0);
        BRTNODE node = root_v;

        BRT_SPLIT split;
        brt_split_init(&split);
        r = brt_search_node(brt, node, search, newkey, newval, &split);

        if (split.did_split) {
            /* brt_init_new_root may replace `node`; that is what gets unpinned */
            status = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node);
            assert(status == 0);
        }

        status = unpin_brtnode(brt, node);
        assert(status == 0);
    } while (r == EAGAIN);

    status = toku_unpin_brt_header(brt);
    assert(status == 0);
    return r;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Release the buffer held by DBT when it has one and is flagged
 * DB_DBT_MALLOC; the data pointer is cleared afterwards.  Does nothing
 * otherwise. */
static inline void dbt_cleanup(DBT *dbt) {
    if (dbt->data == 0)
        return;
    if ((dbt->flags & DB_DBT_MALLOC) == 0)
        return;
    toku_free_n(dbt->data, dbt->size);
    dbt->data = 0;
}
2008-01-25 15:43:37 +00:00
/* Drop the key and value currently held by CURSOR (see dbt_cleanup). */
static inline void brt_cursor_cleanup(BRT_CURSOR cursor) {
    DBT *held_key = &cursor->key;
    DBT *held_val = &cursor->val;

    dbt_cleanup(held_key);
    dbt_cleanup(held_val);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Return 1 when CURSOR lacks a key or a value (either data pointer is zero),
 * 0 when it holds both. */
static inline int brt_cursor_not_set(BRT_CURSOR cursor) {
    int has_key = cursor->key.data != 0;
    int has_val = cursor->val.data != 0;
    return !(has_key && has_val);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Make CURSOR hold the pair in NEWKEY / NEWVAL.
 * The cursor's previous pair is released, then each DBT is copied by value
 * into the cursor and the source DBT is zeroed, so the source no longer
 * refers to the buffer the cursor now holds. */
static inline void brt_cursor_set_key_val(BRT_CURSOR cursor, DBT *newkey, DBT *newval) {
    brt_cursor_cleanup(cursor);

    cursor->key = *newkey;
    memset(newkey, 0, sizeof(DBT));

    cursor->val = *newval;
    memset(newval, 0, sizeof(DBT));
}
2008-01-25 15:43:37 +00:00
/* Allocate a cursor on BRT and store it in *CURSORPTR.
 * The new cursor starts with empty key and value DBTs and is pushed onto the
 * tree's cursor list.  Returns 0 on success, ENOMEM if the allocation fails
 * (in which case *CURSORPTR is left untouched). */
int toku_brt_cursor(BRT brt, BRT_CURSOR *cursorptr) {
    BRT_CURSOR fresh = toku_malloc(sizeof *fresh);
    if (!fresh)
        return ENOMEM;

    fresh->brt = brt;
    toku_init_dbt(&fresh->key);
    toku_init_dbt(&fresh->val);
    list_push(&brt->cursors, &fresh->cursors_link);

    *cursorptr = fresh;
    return 0;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Destroy CURSOR: release the pair it holds, unlink it from the cursor list
 * it sits on, and free the cursor itself.  Always returns 0. */
int toku_brt_cursor_close(BRT_CURSOR cursor) {
    brt_cursor_cleanup(cursor);
    list_remove(&cursor->cursors_link);
    toku_free_n(cursor, sizeof(*cursor));
    return 0;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Compare keys K and X with the tree's key comparison function. */
static inline int compare_k_x(BRT brt, DBT *k, DBT *x) {
    int keycmp = brt->compare_fun(brt->db, k, x);
    return keycmp;
}
2007-07-13 19:37:47 +00:00
2008-01-25 15:43:37 +00:00
/* Compare values V and Y with the tree's duplicate comparison function. */
static inline int compare_v_y(BRT brt, DBT *v, DBT *y) {
    int valcmp = brt->dup_compare(brt->db, v, y);
    return valcmp;
}
2007-07-13 19:37:47 +00:00
2008-01-25 15:43:37 +00:00
/* Compare the pair (K, V) with the pair (X, Y).
 * Keys are compared first with the tree's key comparison function; values
 * break a tie, using the duplicate comparison function, only when both V and
 * Y are non-null. */
static inline int compare_kv_xy(BRT brt, DBT *k, DBT *v, DBT *x, DBT *y) {
    int keycmp = brt->compare_fun(brt->db, k, x);
    if (keycmp != 0)
        return keycmp;
    if (v == 0 || y == 0)
        return keycmp; /* keys equal and no value pair to compare */
    return brt->dup_compare(brt->db, v, y);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Copy the pair held by CURSOR into the caller's KEY and VAL through
 * toku_dbt_set_value.  Either output may be null, in which case it is
 * skipped.  The value is not copied if copying the key failed.  Returns 0 or
 * the first nonzero toku_dbt_set_value result. */
static inline int brt_cursor_copyout(BRT_CURSOR cursor, DBT *key, DBT *val) {
    if (key) {
        int keyr = toku_dbt_set_value(key, cursor->key.data, cursor->key.size, &cursor->brt->skey);
        if (keyr != 0)
            return keyr;
    }
    if (val)
        return toku_dbt_set_value(val, cursor->val.data, cursor->val.size, &cursor->brt->sval);
    return 0;
}
2008-01-25 15:43:37 +00:00
/* Search predicate: true when the search pair kv is <= the candidate xy, so
 * a left search returns the minimum xy with kv <= xy.  The tree is taken from
 * search->context. */
static int brt_cursor_compare_set(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int cmp = compare_kv_xy(tree, search->k, search->v, x, y);
    return cmp <= 0;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Return the pair CURSOR is positioned on through OUTKEY / OUTVAL.
 * Returns EINVAL when the cursor is not set.  Unless bit 256 is present in
 * GET_FLAGS, the tree is searched again for the cursor's pair first and
 * DB_KEYEMPTY is returned if that search fails or finds a different pair.
 * Otherwise the result of brt_cursor_copyout is returned. */
static int brt_cursor_current(BRT_CURSOR cursor, int get_flags, DBT *outkey, DBT *outval) {
    if (brt_cursor_not_set(cursor))
        return EINVAL;

    if (get_flags & 256)
        return brt_cursor_copyout(cursor, outkey, outval);

    DBT foundkey, foundval;
    toku_init_dbt(&foundkey);
    toku_init_dbt(&foundval);

    brt_search_t search;
    brt_search_init(&search, brt_cursor_compare_set, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);

    if (toku_brt_search(cursor->brt, &search, &foundkey, &foundval) != 0)
        return DB_KEYEMPTY;
    if (compare_kv_xy(cursor->brt, &cursor->key, &cursor->val, &foundkey, &foundval) != 0)
        return DB_KEYEMPTY;

    return brt_cursor_copyout(cursor, outkey, outval);
}
2008-01-25 15:43:37 +00:00
/* Position CURSOR on the first pair accepted by SEARCH and copy it out.
 * The search results are requested as DB_DBT_MALLOC buffers; on success they
 * are moved into the cursor (which zeroes the temporaries), so the trailing
 * cleanup only frees anything when the pair was not adopted.  Returns the
 * search result, or the copy-out result when the search succeeded. */
static int brt_cursor_search(BRT_CURSOR cursor, brt_search_t *search, DBT *outkey, DBT *outval) {
    DBT foundkey, foundval;

    toku_init_dbt(&foundkey);
    foundkey.flags = DB_DBT_MALLOC;
    toku_init_dbt(&foundval);
    foundval.flags = DB_DBT_MALLOC;

    int r = toku_brt_search(cursor->brt, search, &foundkey, &foundval);
    if (r == 0) {
        brt_cursor_set_key_val(cursor, &foundkey, &foundval);
        r = brt_cursor_copyout(cursor, outkey, outval);
    }

    dbt_cleanup(&foundkey);
    dbt_cleanup(&foundval);
    return r;
}
2008-01-25 15:43:37 +00:00
/* Like brt_cursor_search, but the pair that is found must also compare equal
 * (compare_kv_xy) to the search's own pair; otherwise the cursor is left
 * alone and DB_NOTFOUND is returned. */
static int brt_cursor_search_eq_kv_xy(BRT_CURSOR cursor, brt_search_t *search, DBT *outkey, DBT *outval) {
    DBT foundkey, foundval;

    toku_init_dbt(&foundkey);
    foundkey.flags = DB_DBT_MALLOC;
    toku_init_dbt(&foundval);
    foundval.flags = DB_DBT_MALLOC;

    int r = toku_brt_search(cursor->brt, search, &foundkey, &foundval);
    if (r == 0) {
        int differs = compare_kv_xy(cursor->brt, search->k, search->v, &foundkey, &foundval) != 0;
        if (differs) {
            r = DB_NOTFOUND;
        } else {
            brt_cursor_set_key_val(cursor, &foundkey, &foundval);
            r = brt_cursor_copyout(cursor, outkey, outval);
        }
    }

    /* frees the temporaries unless the cursor adopted them */
    dbt_cleanup(&foundkey);
    dbt_cleanup(&foundval);
    return r;
}
2008-01-25 15:43:37 +00:00
/* Like brt_cursor_search, but the key that is found must also compare equal
 * (compare_k_x) to the search's own key; otherwise the cursor is left alone
 * and DB_NOTFOUND is returned. */
static int brt_cursor_search_eq_k_x(BRT_CURSOR cursor, brt_search_t *search, DBT *outkey, DBT *outval) {
    DBT foundkey, foundval;

    toku_init_dbt(&foundkey);
    foundkey.flags = DB_DBT_MALLOC;
    toku_init_dbt(&foundval);
    foundval.flags = DB_DBT_MALLOC;

    int r = toku_brt_search(cursor->brt, search, &foundkey, &foundval);
    if (r == 0) {
        int differs = compare_k_x(cursor->brt, search->k, &foundkey) != 0;
        if (differs) {
            r = DB_NOTFOUND;
        } else {
            brt_cursor_set_key_val(cursor, &foundkey, &foundval);
            r = brt_cursor_copyout(cursor, outkey, outval);
        }
    }

    /* frees the temporaries unless the cursor adopted them */
    dbt_cleanup(&foundkey);
    dbt_cleanup(&foundval);
    return r;
}
2008-01-08 21:03:17 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate that accepts every candidate; none of its arguments are
 * examined. */
static int brt_cursor_compare_one(brt_search_t *search, DBT *x, DBT *y) {
    (void) search;
    (void) x;
    (void) y;
    return 1;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Move CURSOR to the pair found by a left search that accepts everything, and
 * copy it out. */
static int brt_cursor_first(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t accept_all;

    brt_search_init(&accept_all, brt_cursor_compare_one, BRT_SEARCH_LEFT, 0, 0, cursor->brt);
    return brt_cursor_search(cursor, &accept_all, outkey, outval);
}
2008-01-25 15:43:37 +00:00
/* Move CURSOR to the pair found by a right search that accepts everything,
 * and copy it out. */
static int brt_cursor_last(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t accept_all;

    brt_search_init(&accept_all, brt_cursor_compare_one, BRT_SEARCH_RIGHT, 0, 0, cursor->brt);
    return brt_cursor_search(cursor, &accept_all, outkey, outval);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate: true when the search pair kv is strictly less than the
 * candidate xy, so a left search returns the minimum xy with kv < xy. */
static int brt_cursor_compare_next(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int cmp = compare_kv_xy(tree, search->k, search->v, x, y);
    return cmp < 0;
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Advance CURSOR to the first pair strictly after its current pair (left
 * search with brt_cursor_compare_next) and copy it out. */
static int brt_cursor_next(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t after_current;

    brt_search_init(&after_current, brt_cursor_compare_next, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &after_current, outkey, outval);
}
2007-08-23 18:07:18 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate: true when the search key k is strictly less than the
 * candidate key x (minimum x with k < x).  The candidate value is ignored. */
static int brt_cursor_compare_next_nodup(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    (void) y;
    int keycmp = compare_k_x(tree, search->k, x);
    return keycmp < 0;
}
2008-01-25 15:43:37 +00:00
/* Advance CURSOR to the first pair whose key is strictly greater than the
 * cursor's current key (left search with brt_cursor_compare_next_nodup) and
 * copy it out. */
static int brt_cursor_next_nodup(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t next_key;

    brt_search_init(&next_key, brt_cursor_compare_next_nodup, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &next_key, outkey, outval);
}
2008-01-25 15:43:37 +00:00
/* Search predicate for "next duplicate": true when k < x, or when k == x and
 * v < y (minimum xy with k <= x && v < y).  Values are only compared when the
 * keys are equal. */
static int brt_cursor_compare_next_dup(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int keycmp = compare_k_x(tree, search->k, x);

    if (keycmp < 0)
        return 1;
    if (keycmp > 0)
        return 0;
    return compare_v_y(tree, search->v, y) < 0;
}
2007-09-07 20:25:54 +00:00
2008-01-25 15:43:37 +00:00
/* Advance CURSOR to the next pair after its current one, but only if that
 * pair has the same key; brt_cursor_search_eq_k_x yields DB_NOTFOUND when the
 * key differs. */
static int brt_cursor_next_dup(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t next_dup;

    brt_search_init(&next_dup, brt_cursor_compare_next_dup, BRT_SEARCH_LEFT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search_eq_k_x(cursor, &next_dup, outkey, outval);
}
2007-09-07 20:25:54 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate for "get both range": true when k < x, or when k == x and
 * v <= y (minimum xy with k <= x && v <= y).  Values are only compared when
 * the keys are equal. */
static int brt_cursor_compare_get_both_range(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int keycmp = compare_k_x(tree, search->k, x);

    if (keycmp < 0)
        return 1;
    if (keycmp > 0)
        return 0;
    return compare_v_y(tree, search->v, y) <= 0;
}
2007-09-07 20:25:54 +00:00
2008-01-25 15:43:37 +00:00
/* Position CURSOR on the first pair accepted by
 * brt_cursor_compare_get_both_range for (KEY, VAL), requiring the found key
 * to equal KEY (brt_cursor_search_eq_k_x), and copy it out. */
static int brt_cursor_get_both_range(BRT_CURSOR cursor, DBT *key, DBT *val, DBT *outkey, DBT *outval) {
    brt_search_t both_range;

    brt_search_init(&both_range, brt_cursor_compare_get_both_range, BRT_SEARCH_LEFT, key, val, cursor->brt);
    return brt_cursor_search_eq_k_x(cursor, &both_range, outkey, outval);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: true when the search pair kv is strictly greater than
 * the candidate xy, so a right search returns the maximum xy with kv > xy. */
static int brt_cursor_compare_prev(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int cmp = compare_kv_xy(tree, search->k, search->v, x, y);
    return cmp > 0;
}
2008-01-25 15:43:37 +00:00
/* Move CURSOR back to the last pair strictly before its current pair (right
 * search with brt_cursor_compare_prev) and copy it out. */
static int brt_cursor_prev(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t before_current;

    brt_search_init(&before_current, brt_cursor_compare_prev, BRT_SEARCH_RIGHT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &before_current, outkey, outval);
}
2008-01-25 15:43:37 +00:00
/* Search predicate: true when the search key k is strictly greater than the
 * candidate key x (maximum x with k > x).  The candidate value is ignored. */
static int brt_cursor_compare_prev_nodup(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    (void) y;
    int keycmp = compare_k_x(tree, search->k, x);
    return keycmp > 0;
}
2007-09-11 16:30:58 +00:00
2008-01-25 15:43:37 +00:00
/* Move CURSOR back to the last pair whose key is strictly less than the
 * cursor's current key (right search with brt_cursor_compare_prev_nodup) and
 * copy it out. */
static int brt_cursor_prev_nodup(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t prev_key;

    brt_search_init(&prev_key, brt_cursor_compare_prev_nodup, BRT_SEARCH_RIGHT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search(cursor, &prev_key, outkey, outval);
}
2007-09-11 16:30:58 +00:00
2008-01-25 15:43:37 +00:00
#ifdef DB_PREV_DUP
/* Search predicate for "previous duplicate": true when k > x, or when k == x
 * and v > y (maximum xy with k >= x && v > y).  Values are only compared when
 * the keys are equal. */
static int brt_cursor_compare_prev_dup(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int keycmp = compare_k_x(tree, search->k, x);

    if (keycmp > 0)
        return 1;
    if (keycmp < 0)
        return 0;
    return compare_v_y(tree, search->v, y) > 0;
}

/* Move CURSOR back to the pair before its current one, but only if that pair
 * has the same key; brt_cursor_search_eq_k_x yields DB_NOTFOUND when the key
 * differs. */
static int brt_cursor_prev_dup(BRT_CURSOR cursor, DBT *outkey, DBT *outval) {
    brt_search_t prev_dup;

    brt_search_init(&prev_dup, brt_cursor_compare_prev_dup, BRT_SEARCH_RIGHT, &cursor->key, &cursor->val, cursor->brt);
    return brt_cursor_search_eq_k_x(cursor, &prev_dup, outkey, outval);
}
#endif
2007-08-24 12:10:49 +00:00
2008-01-25 15:43:37 +00:00
/* Search predicate: true when the search pair kv is <= the candidate xy.
 * When the search carries no value, compare_kv_xy compares keys only. */
static int brt_cursor_compare_set_range(brt_search_t *search, DBT *x, DBT *y) {
    BRT tree = search->context;
    int cmp = compare_kv_xy(tree, search->k, search->v, x, y);
    return cmp <= 0;
}
2008-01-25 15:43:37 +00:00
/* Position CURSOR on the pair equal to (KEY, VAL) and copy it out.
 * VAL may be null, in which case only the key has to match.  Returns
 * DB_NOTFOUND (from brt_cursor_search_eq_kv_xy) when the first pair at or
 * after (KEY, VAL) is not equal to it. */
static int brt_cursor_set(BRT_CURSOR cursor, DBT *key, DBT *val, DBT *outkey, DBT *outval) {
    brt_search_t exact;

    brt_search_init(&exact, brt_cursor_compare_set_range, BRT_SEARCH_LEFT, key, val, cursor->brt);
    return brt_cursor_search_eq_kv_xy(cursor, &exact, outkey, outval);
}
/* Position CURSOR on the first pair whose key is >= KEY (left search with
 * brt_cursor_compare_set_range and no value) and copy it out. */
static int brt_cursor_set_range(BRT_CURSOR cursor, DBT *key, DBT *outkey, DBT *outval) {
    brt_search_t at_or_after;

    brt_search_init(&at_or_after, brt_cursor_compare_set_range, BRT_SEARCH_LEFT, key, 0, cursor->brt);
    return brt_cursor_search(cursor, &at_or_after, outkey, outval);
}
/* Cursor "get" entry point: dispatch on GET_FLAGS to the matching cursor
 * operation, using KEY / VAL as inputs and/or outputs depending on the
 * operation.
 *
 * TXN must be 0 (asserted).  GET_FLAGS may only contain bits from
 * DB_OPFLAGS_MASK + 256, otherwise EINVAL is returned; the 256 bit is only
 * recognised together with DB_CURRENT.  The relative moves DB_NEXT,
 * DB_NEXT_NODUP, DB_PREV and DB_PREV_NODUP fall back to first/last when the
 * cursor is not set, whereas DB_NEXT_DUP and DB_PREV_DUP return EINVAL in
 * that case.  Unknown operations return EINVAL. */
int toku_brt_cursor_get(BRT_CURSOR cursor, DBT *key, DBT *val, int get_flags, TOKUTXN txn) {
    assert(txn == 0);

    if ((get_flags & ~(DB_OPFLAGS_MASK + 256)) != 0)
        return EINVAL;

    switch (get_flags) {
    case DB_CURRENT:
    case DB_CURRENT + 256:
        return brt_cursor_current(cursor, get_flags, key, val);

    case DB_FIRST:
        return brt_cursor_first(cursor, key, val);

    case DB_LAST:
        return brt_cursor_last(cursor, key, val);

    case DB_NEXT:
        return brt_cursor_not_set(cursor) ? brt_cursor_first(cursor, key, val)
                                          : brt_cursor_next(cursor, key, val);

    case DB_NEXT_DUP:
        return brt_cursor_not_set(cursor) ? EINVAL
                                          : brt_cursor_next_dup(cursor, key, val);

    case DB_NEXT_NODUP:
        return brt_cursor_not_set(cursor) ? brt_cursor_first(cursor, key, val)
                                          : brt_cursor_next_nodup(cursor, key, val);

    case DB_PREV:
        return brt_cursor_not_set(cursor) ? brt_cursor_last(cursor, key, val)
                                          : brt_cursor_prev(cursor, key, val);

#ifdef DB_PREV_DUP
    case DB_PREV_DUP:
        return brt_cursor_not_set(cursor) ? EINVAL
                                          : brt_cursor_prev_dup(cursor, key, val);
#endif

    case DB_PREV_NODUP:
        return brt_cursor_not_set(cursor) ? brt_cursor_last(cursor, key, val)
                                          : brt_cursor_prev_nodup(cursor, key, val);

    case DB_SET:
        /* match on key only; only the value is copied out */
        return brt_cursor_set(cursor, key, 0, 0, val);

    case DB_SET_RANGE:
        /* the key that was found is written back into `key` */
        return brt_cursor_set_range(cursor, key, key, val);

    case DB_GET_BOTH:
        /* match on key and value; nothing is copied out */
        return brt_cursor_set(cursor, key, val, 0, 0);

    case DB_GET_BOTH_RANGE:
        /* only the value is copied out */
        return brt_cursor_get_both_range(cursor, key, val, 0, val);

    default:
        return EINVAL;
    }
}
2007-12-10 18:54:12 +00:00
2008-01-25 15:43:37 +00:00
/* Delete the pair CURSOR is positioned on.
 * FLAGS may only contain DB_DELETE_ANY; anything else, or an unset cursor,
 * yields EINVAL.  Without DB_DELETE_ANY the pair is first re-checked with
 * brt_cursor_current and its error (if any) is returned instead of deleting.
 * Otherwise returns the result of toku_brt_delete_both. */
int toku_brt_cursor_delete(BRT_CURSOR cursor, int flags) {
    if (flags & ~DB_DELETE_ANY)
        return EINVAL;
    if (brt_cursor_not_set(cursor))
        return EINVAL;

    if ((flags & DB_DELETE_ANY) == 0) {
        int check = brt_cursor_current(cursor, DB_CURRENT, 0, 0);
        if (check != 0)
            return check;
    }
    return toku_brt_delete_both(cursor->brt, &cursor->key, &cursor->val);
}