2007-11-29 14:18:54 +00:00
/* -*- mode: C; c-basic-offset: 4 -*- */
2008-01-24 15:10:32 +00:00
# ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
2007-11-29 14:18:54 +00:00
2007-07-13 19:37:47 +00:00
# define _XOPEN_SOURCE 500
//#include "pma.h"
2008-02-08 03:17:38 +00:00
# include "toku_assert.h"
2007-07-13 19:37:47 +00:00
# include "brt-internal.h"
2007-07-24 01:32:03 +00:00
# include "key.h"
2007-08-21 23:32:17 +00:00
# include "rbuf.h"
# include "wbuf.h"
2007-07-13 19:37:47 +00:00
# include <unistd.h>
# include <stdio.h>
# include <arpa/inet.h>
2007-11-14 17:58:38 +00:00
2007-11-19 23:54:17 +00:00
/* Number of bytes every serialized node carries regardless of its contents.
 * The terms are listed in the order toku_serialize_brtnode_to() emits them. */
static const int brtnode_header_overhead = (8 +  // magic: "tokunode" or "tokuleaf"
                                            4 +  // layout version
                                            8 +  // log sequence number
                                            4 +  // serialized size of the node
                                            4 +  // flags
                                            4 +  // height
                                            4 +  // random multiplier for the fingerprint
                                            4 +  // local fingerprint
                                            4);  // crc32 at the end
2007-11-14 17:58:38 +00:00
2007-11-19 23:54:17 +00:00
/*
 * Compute the serialized size of `node` the slow way: by walking every pivot
 * key and every buffered item instead of trusting the running totals kept in
 * the node.  Also asserts that those running totals match what the walk finds.
 *
 * Returns the number of bytes toku_serialize_brtnode_to() is expected to emit.
 */
static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
    unsigned int size = brtnode_header_overhead;
    if (node->height > 0) {
        unsigned int hsize = 0;   /* bytes of buffered commands, summed over all child buffers */
        unsigned int csize = 0;   /* bytes of pivot key payload */
        int i;
        size += 4; /* n_children */
        size += 4; /* subtree fingerprint */
        for (i = 0; i < node->u.n.n_children - 1; i++) {
            size += 4;                                    /* key length prefix */
            if (node->flags & TOKU_DB_DUPSORT) size += 4; /* data length prefix */
            csize += toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
        }
        for (i = 0; i < node->u.n.n_children; i++) {
            size += 8; // diskoff
            size += 4; // subsum
        }
        int n_buffers = node->u.n.n_children;
        assert(0 <= n_buffers && n_buffers < TREE_FANOUT + 1);
        for (i = 0; i < n_buffers; i++) {
            /* The serializer writes one entry count in front of EACH child
             * buffer, so it has to be counted per buffer, not once per node. */
            size += 4; /* n_entries */
            FIFO_ITERATE(BNC_BUFFER(node, i),
                         key __attribute__((__unused__)), keylen,
                         data __attribute__((__unused__)), datalen,
                         type __attribute__((__unused__)), xid __attribute__((__unused__)),
                         (hsize += BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + keylen + datalen));
        }
        assert(hsize == node->u.n.n_bytes_in_buffers);
        assert(csize == node->u.n.totalchildkeylens);
        return size + hsize + csize;
    } else {
        unsigned int hsize = 0;
        PMA_ITERATE(node->u.l.buffer,
                    key __attribute__((__unused__)), keylen,
                    data __attribute__((__unused__)), datalen,
                    (hsize += PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keylen + datalen));
        assert(hsize == node->u.l.n_bytes_in_buffer);
        hsize += 4; /* the PMA size */
        hsize += 4; /* add n entries in buffer table. */
        return size + hsize;
    }
}
2007-11-19 23:54:17 +00:00
/*
 * Return the number of bytes `node` occupies when serialized, using the
 * running totals stored in the node rather than walking its contents.
 *
 * For leaves, when toku_memory_check is set, the answer is cross-checked
 * against toku_serialize_brtnode_size_slow() and a mismatch aborts.
 */
unsigned int toku_serialize_brtnode_size (BRTNODE node) {
    unsigned int result = brtnode_header_overhead;
    assert(sizeof(off_t) == 8);

    if (!(node->height > 0)) {
        /* Leaf: entry count, PMA size, then the items themselves. */
        result += 4 + 4;
        result += node->u.l.n_bytes_in_buffer;
        if (toku_memory_check) {
            unsigned int slowresult = toku_serialize_brtnode_size_slow(node);
            if (result != slowresult) {
                printf("%s:%d result=%d slowresult=%d\n", __FILE__, __LINE__, result, slowresult);
            }
            assert(result == slowresult);
        }
        return result;
    }

    /* Interior node. */
    result += 4;                                   /* n_children */
    result += 4;                                   /* subtree fingerprint */
    result += 4 * (node->u.n.n_children - 1);      /* one key length per pivot */
    if (node->flags & TOKU_DB_DUPSORT) {
        result += 4 * (node->u.n.n_children - 1);  /* one data length per pivot */
    }
    result += node->u.n.totalchildkeylens;         /* pivot payload, excluding the length prefixes */
    result += (8 + 4 + 4) * (node->u.n.n_children); /* per child: disk offset, buffer entry count, subtree fingerprint */
    result += node->u.n.n_bytes_in_buffers;
    return result;
}
2007-11-21 13:07:49 +00:00
/*
 * Serialize `node` and write it to file descriptor `fd` at offset `off`.
 *
 *   fd    file to write to (written with pwrite, so the file position is not used)
 *   off   byte offset of the node within the file
 *   size  capacity reserved for the node; the serialized form must fit in it
 *   node  the node to write
 *
 * Layout: 8-byte magic ("tokuleaf"/"tokunode"), layout version, LSN,
 * serialized size, flags, height, fingerprint multiplier, local fingerprint,
 * then the interior-node or leaf payload, then the CRC.
 *
 * There is no error return: every failure (allocation, size mismatch, short
 * write) trips an assertion.
 */
void toku_serialize_brtnode_to (int fd, DISKOFF off, DISKOFF size, BRTNODE node) {
    struct wbuf w;
    int i;
    unsigned int calculated_size = toku_serialize_brtnode_size(node);
    assert(size > 0);
    assert(calculated_size <= size);
    char *MALLOC_N(size, buf);
    assert(buf != NULL);   /* never hand a null buffer to the wbuf writers */
    toku_verify_counts(node);
    wbuf_init(&w, buf, size);

    wbuf_literal_bytes(&w, "toku", 4);
    if (node->height == 0) wbuf_literal_bytes(&w, "leaf", 4);
    else                   wbuf_literal_bytes(&w, "node", 4);
    wbuf_int(&w, node->layout_version);
    wbuf_ulonglong(&w, node->log_lsn.lsn);
    wbuf_int(&w, calculated_size);
    wbuf_int(&w, node->flags);
    wbuf_int(&w, node->height);
    wbuf_int(&w, node->rand4fingerprint);
    wbuf_int(&w, node->local_fingerprint);

    if (node->height > 0) {
        assert(node->u.n.n_children > 0);
        /* The subtree fingerprint written to disk is the node's local
         * fingerprint plus the subtree fingerprints of all its children. */
        {
            u_int32_t subtree_fingerprint = node->local_fingerprint;
            for (i = 0; i < node->u.n.n_children; i++) {
                subtree_fingerprint += BNC_SUBTREE_FINGERPRINT(node, i);
            }
            wbuf_int(&w, subtree_fingerprint);
        }
        wbuf_int(&w, node->u.n.n_children);
        for (i = 0; i < node->u.n.n_children; i++) {
            wbuf_int(&w, BNC_SUBTREE_FINGERPRINT(node, i));
        }
        /* Pivot keys: n_children-1 of them; in DUPSORT mode each pivot is a key/value pair. */
        for (i = 0; i < node->u.n.n_children - 1; i++) {
            if (node->flags & TOKU_DB_DUPSORT) {
                wbuf_bytes(&w, kv_pair_key(node->u.n.childkeys[i]), kv_pair_keylen(node->u.n.childkeys[i]));
                wbuf_bytes(&w, kv_pair_val(node->u.n.childkeys[i]), kv_pair_vallen(node->u.n.childkeys[i]));
            } else {
                wbuf_bytes(&w, kv_pair_key(node->u.n.childkeys[i]), toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]));
            }
        }
        for (i = 0; i < node->u.n.n_children; i++) {
            wbuf_DISKOFF(&w, BNC_DISKOFF(node, i));
        }
        {
            int n_buffers = node->u.n.n_children;
            u_int32_t check_local_fingerprint = 0;
            for (i = 0; i < n_buffers; i++) {
                /* Each child buffer: entry count, then (type, xid, key, data) per entry. */
                wbuf_int(&w, toku_fifo_n_entries(BNC_BUFFER(node, i)));
                FIFO_ITERATE(BNC_BUFFER(node, i), key, keylen, data, datalen, type, xid,
                             ({
                                 wbuf_char(&w, type);
                                 wbuf_TXNID(&w, xid);
                                 wbuf_bytes(&w, key, keylen);
                                 wbuf_bytes(&w, data, datalen);
                                 check_local_fingerprint += node->rand4fingerprint * toku_calccrc32_cmd(type, xid, key, keylen, data, datalen);
                             }));
            }
            /* The fingerprint recomputed from the buffered commands must match the stored one. */
            assert(check_local_fingerprint == node->local_fingerprint);
        }
    } else {
        /* Leaf: entry count, PMA index limit, then (index, key, data) per entry. */
        wbuf_int(&w, toku_pma_n_entries(node->u.l.buffer));
        wbuf_int(&w, toku_pma_index_limit(node->u.l.buffer));
        PMA_ITERATE_IDX(node->u.l.buffer, idx,
                        key, keylen, data, datalen,
                        ({
                            wbuf_int(&w, idx);
                            wbuf_bytes(&w, key, keylen);
                            wbuf_bytes(&w, data, datalen);
                        }));
    }
    assert(w.ndone <= w.size);
#ifdef CRC_ATEND
    wbuf_int(&w, crc32(toku_null_crc, w.buf, w.ndone));
#endif
#ifdef CRC_INCR
    wbuf_int(&w, w.crc32);
#endif

    /* Verify the length BEFORE touching the file: a block whose length
     * disagrees with the size recorded in its own header must not be persisted. */
    assert(calculated_size == w.ndone);
    assert(w.ndone <= size);
    {
        ssize_t r = pwrite(fd, w.buf, w.ndone, off);
        if (r < 0) printf("r=%ld errno=%d\n", (long)r, errno);
        assert((size_t)r == w.ndone);
    }
    toku_free(buf);
}
2007-11-19 23:54:17 +00:00
int toku_deserialize_brtnode_from ( int fd , DISKOFF off , BRTNODE * brtnode , int flags , int nodesize ,
2007-11-26 21:51:36 +00:00
int ( * bt_compare ) ( DB * , const DBT * , const DBT * ) ,
int ( * dup_compare ) ( DB * , const DBT * , const DBT * ) ,
DB * db , FILENUM filenum ) {
2007-07-13 19:37:47 +00:00
TAGMALLOC ( BRTNODE , result ) ;
2007-08-21 23:32:17 +00:00
struct rbuf rc ;
2007-07-13 19:37:47 +00:00
int i ;
2007-11-14 17:58:38 +00:00
u_int32_t datasize ;
2007-07-13 19:37:47 +00:00
int r ;
if ( errno ! = 0 ) {
r = errno ;
2007-07-20 18:00:14 +00:00
if ( 0 ) { died0 : toku_free ( result ) ; }
2007-07-13 19:37:47 +00:00
return r ;
}
{
2007-11-14 17:58:38 +00:00
u_int32_t datasize_n ;
r = pread ( fd , & datasize_n , sizeof ( datasize_n ) , off + 8 + 4 + 8 ) ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d r=%d the datasize=%d\n", __FILE__, __LINE__, r, ntohl(datasize_n));
if ( r ! = sizeof ( datasize_n ) ) {
if ( r = = - 1 ) r = errno ;
else r = DB_BADFORMAT ;
goto died0 ;
}
datasize = ntohl ( datasize_n ) ;
if ( datasize < = 0 | | datasize > ( 1 < < 30 ) ) { r = DB_BADFORMAT ; goto died0 ; }
}
2007-07-20 18:00:14 +00:00
rc . buf = toku_malloc ( datasize ) ;
2007-11-14 17:58:38 +00:00
//printf("%s:%d errno=%d\n", __FILE__, __LINE__, errno);
2007-07-13 19:37:47 +00:00
if ( errno ! = 0 ) {
2007-07-20 18:00:14 +00:00
if ( 0 ) { died1 : toku_free ( rc . buf ) ; }
2007-07-13 19:37:47 +00:00
r = errno ;
goto died0 ;
}
rc . size = datasize ;
assert ( rc . size > 0 ) ;
rc . ndone = 0 ;
//printf("Deserializing %lld datasize=%d\n", off, datasize);
{
2007-11-14 17:58:38 +00:00
ssize_t rlen = pread ( fd , rc . buf , datasize , off ) ;
//printf("%s:%d pread->%d datasize=%d\n", __FILE__, __LINE__, r, datasize);
if ( ( size_t ) rlen ! = datasize ) {
//printf("%s:%d size messed up\n", __FILE__, __LINE__);
r = errno ;
goto died1 ;
}
2007-07-13 19:37:47 +00:00
//printf("Got %d %d %d %d\n", rc.buf[0], rc.buf[1], rc.buf[2], rc.buf[3]);
}
2007-11-14 17:58:38 +00:00
{
bytevec tmp ;
rbuf_literal_bytes ( & rc , & tmp , 8 ) ;
if ( memcmp ( tmp , " tokuleaf " , 8 ) ! = 0
& & memcmp ( tmp , " tokunode " , 8 ) ! = 0 ) {
r = DB_BADFORMAT ;
goto died1 ;
}
}
result - > layout_version = rbuf_int ( & rc ) ;
2008-02-05 18:25:23 +00:00
if ( result - > layout_version ! = 2 ) {
2007-11-14 17:58:38 +00:00
r = DB_BADFORMAT ;
goto died1 ;
}
2007-11-24 03:50:28 +00:00
result - > disk_lsn . lsn = rbuf_ulonglong ( & rc ) ;
result - > log_lsn = result - > disk_lsn ;
2007-07-13 19:37:47 +00:00
{
unsigned int stored_size = rbuf_int ( & rc ) ;
if ( stored_size ! = datasize ) { r = DB_BADFORMAT ; goto died1 ; }
}
result - > nodesize = nodesize ; // How to compute the nodesize?
result - > thisnodename = off ;
2007-11-27 15:22:56 +00:00
result - > flags = rbuf_int ( & rc ) ; assert ( result - > flags = = ( unsigned int ) flags ) ;
2007-07-13 19:37:47 +00:00
result - > height = rbuf_int ( & rc ) ;
2007-11-14 17:58:38 +00:00
result - > rand4fingerprint = rbuf_int ( & rc ) ;
result - > local_fingerprint = rbuf_int ( & rc ) ;
2007-09-18 16:09:55 +00:00
result - > dirty = 0 ;
2007-07-13 19:37:47 +00:00
//printf("height==%d\n", result->height);
if ( result - > height > 0 ) {
result - > u . n . totalchildkeylens = 0 ;
2007-11-14 17:58:38 +00:00
u_int32_t subtree_fingerprint = rbuf_int ( & rc ) ;
u_int32_t check_subtree_fingerprint = 0 ;
2007-07-13 19:37:47 +00:00
result - > u . n . n_children = rbuf_int ( & rc ) ;
2008-03-06 22:48:07 +00:00
MALLOC_N ( result - > u . n . n_children + 1 , result - > u . n . childinfos ) ;
MALLOC_N ( result - > u . n . n_children , result - > u . n . childkeys ) ;
2007-07-13 19:37:47 +00:00
//printf("n_children=%d\n", result->n_children);
assert ( result - > u . n . n_children > = 0 & & result - > u . n . n_children < = TREE_FANOUT ) ;
2007-11-14 17:58:38 +00:00
for ( i = 0 ; i < result - > u . n . n_children ; i + + ) {
u_int32_t childfp = rbuf_int ( & rc ) ;
2008-01-31 22:05:43 +00:00
BNC_SUBTREE_FINGERPRINT ( result , i ) = childfp ;
2007-11-14 17:58:38 +00:00
check_subtree_fingerprint + = childfp ;
}
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < result - > u . n . n_children - 1 ; i + + ) {
2007-11-27 15:22:56 +00:00
if ( result - > flags & TOKU_DB_DUPSORT ) {
bytevec keyptr , dataptr ;
unsigned int keylen , datalen ;
rbuf_bytes ( & rc , & keyptr , & keylen ) ;
rbuf_bytes ( & rc , & dataptr , & datalen ) ;
2007-11-27 18:16:45 +00:00
result - > u . n . childkeys [ i ] = kv_pair_malloc ( keyptr , keylen , dataptr , datalen ) ;
2007-11-27 15:22:56 +00:00
} else {
bytevec childkeyptr ;
2007-12-06 14:30:33 +00:00
unsigned int cklen ;
rbuf_bytes ( & rc , & childkeyptr , & cklen ) ; /* Returns a pointer into the rbuf. */
result - > u . n . childkeys [ i ] = kv_pair_malloc ( ( void * ) childkeyptr , cklen , 0 , 0 ) ;
2007-11-27 15:22:56 +00:00
}
//printf(" key %d length=%d data=%s\n", i, result->childkeylens[i], result->childkeys[i]);
2007-12-06 14:20:47 +00:00
result - > u . n . totalchildkeylens + = toku_brtnode_pivot_key_len ( result , result - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
}
for ( i = 0 ; i < result - > u . n . n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
BNC_DISKOFF ( result , i ) = rbuf_diskoff ( & rc ) ;
2008-03-06 21:46:57 +00:00
BNC_NBYTESINBUF ( result , i ) = 0 ;
2007-07-13 19:37:47 +00:00
//printf("Child %d at %lld\n", i, result->children[i]);
}
2008-01-11 14:03:33 +00:00
result - > u . n . n_bytes_in_buffers = 0 ;
2007-07-24 01:32:03 +00:00
for ( i = 0 ; i < result - > u . n . n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
r = toku_fifo_create ( & BNC_BUFFER ( result , i ) ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) {
int j ;
2008-01-11 14:03:33 +00:00
if ( 0 ) { died_12 : j = result - > u . n . n_bytes_in_buffers ; }
2008-01-31 22:05:43 +00:00
for ( j = 0 ; j < i ; j + + ) toku_fifo_free ( & BNC_BUFFER ( result , j ) ) ;
2007-07-13 19:37:47 +00:00
goto died1 ;
}
}
{
2007-07-24 01:32:03 +00:00
int cnum ;
2007-11-14 17:58:38 +00:00
u_int32_t check_local_fingerprint = 0 ;
2007-07-24 01:32:03 +00:00
for ( cnum = 0 ; cnum < result - > u . n . n_children ; cnum + + ) {
int n_in_this_hash = rbuf_int ( & rc ) ;
//printf("%d in hash\n", n_in_hash);
for ( i = 0 ; i < n_in_this_hash ; i + + ) {
int diff ;
bytevec key ; ITEMLEN keylen ;
bytevec val ; ITEMLEN vallen ;
2007-11-19 23:54:17 +00:00
toku_verify_counts ( result ) ;
2008-02-05 18:25:23 +00:00
int type = rbuf_char ( & rc ) ;
TXNID xid = rbuf_ulonglong ( & rc ) ;
2007-07-24 01:32:03 +00:00
rbuf_bytes ( & rc , & key , & keylen ) ; /* Returns a pointer into the rbuf. */
rbuf_bytes ( & rc , & val , & vallen ) ;
2008-02-05 18:25:23 +00:00
check_local_fingerprint + = result - > rand4fingerprint * toku_calccrc32_cmd ( type , xid , key , keylen , val , vallen ) ;
2007-09-06 21:36:45 +00:00
//printf("Found %s,%s\n", (char*)key, (char*)val);
2007-07-24 01:32:03 +00:00
{
2008-02-05 18:25:23 +00:00
r = toku_fifo_enq ( BNC_BUFFER ( result , cnum ) , key , keylen , val , vallen , type , xid ) ; /* Copies the data into the hash table. */
2007-07-24 01:32:03 +00:00
if ( r ! = 0 ) { goto died_12 ; }
}
2007-09-06 21:36:45 +00:00
diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ;
2008-01-11 14:03:33 +00:00
result - > u . n . n_bytes_in_buffers + = diff ;
2008-01-31 22:05:43 +00:00
BNC_NBYTESINBUF ( result , cnum ) + = diff ;
2007-07-24 01:32:03 +00:00
//printf("Inserted\n");
2007-07-13 19:37:47 +00:00
}
}
2007-11-14 17:58:38 +00:00
if ( check_local_fingerprint ! = result - > local_fingerprint ) {
fprintf ( stderr , " %s:%d local fingerprint is wrong (found %8x calcualted %8x \n " , __FILE__ , __LINE__ , result - > local_fingerprint , check_local_fingerprint ) ;
return DB_BADFORMAT ;
}
if ( check_subtree_fingerprint + check_local_fingerprint ! = subtree_fingerprint ) {
fprintf ( stderr , " %s:%d subtree fingerprint is wrong \n " , __FILE__ , __LINE__ ) ;
return DB_BADFORMAT ;
}
2007-07-13 19:37:47 +00:00
}
} else {
int n_in_buf = rbuf_int ( & rc ) ;
result - > u . l . n_bytes_in_buffer = 0 ;
2007-11-26 21:51:36 +00:00
r = toku_pma_create ( & result - > u . l . buffer , bt_compare , db , filenum , nodesize ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) {
2007-11-20 00:32:25 +00:00
if ( 0 ) { died_21 : toku_pma_free ( & result - > u . l . buffer ) ; }
2007-07-13 19:37:47 +00:00
goto died1 ;
}
2007-11-20 00:32:25 +00:00
toku_pma_set_dup_mode ( result - > u . l . buffer , flags ) ;
2007-12-17 01:03:35 +00:00
toku_pma_set_dup_compare ( result - > u . l . buffer , dup_compare ) ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d r PMA= %p\n", __FILE__, __LINE__, result->u.l.buffer);
2007-12-05 19:41:39 +00:00
toku_verify_counts ( result ) ;
2007-08-08 20:58:25 +00:00
# define BRT_USE_PMA_BULK_INSERT 1
# if BRT_USE_PMA_BULK_INSERT
2008-03-04 21:31:01 +00:00
int index_limit __attribute__ ( ( __unused__ ) ) = rbuf_int ( & rc ) ;
2007-08-08 20:58:25 +00:00
if ( n_in_buf > 0 ) {
2008-03-04 21:31:01 +00:00
# define BRT_BULK_INSERT_MALLOC 1
# if BRT_BULK_INSERT_MALLOC
/* some applications run with small stacks so we malloc the
keys and vals structs */
size_t n = 2 * n_in_buf * sizeof ( DBT ) ;
DBT * keys = toku_malloc ( n ) ;
if ( keys = = 0 ) goto died_21 ;
DBT * vals = & keys [ n_in_buf ] ;
# else
DBT keys [ n_in_buf ] , vals [ n_in_buf ] ;
# endif
for ( i = 0 ; i < n_in_buf ; i + + ) {
bytevec key ; ITEMLEN keylen ;
bytevec val ; ITEMLEN vallen ;
// The counts are wrong here
int idx __attribute__ ( ( __unused__ ) ) = rbuf_int ( & rc ) ;
rbuf_bytes ( & rc , & key , & keylen ) ; /* Returns a pointer into the rbuf. */
toku_fill_dbt ( & keys [ i ] , key , keylen ) ;
rbuf_bytes ( & rc , & val , & vallen ) ;
toku_fill_dbt ( & vals [ i ] , val , vallen ) ;
result - > u . l . n_bytes_in_buffer + = keylen + vallen + KEY_VALUE_OVERHEAD + PMA_ITEM_OVERHEAD ;
}
u_int32_t actual_sum = 0 ;
2008-02-08 19:54:00 +00:00
r = toku_pma_bulk_insert ( ( TOKULOGGER ) 0 , ( FILENUM ) { 0 } , ( DISKOFF ) 0 , result - > u . l . buffer , keys , vals , n_in_buf , result - > rand4fingerprint , & actual_sum , 0 ) ;
2008-03-04 21:31:01 +00:00
# if BRT_BULK_INSERT_MALLOC
toku_free_n ( keys , n ) ;
# endif
2007-08-08 20:58:25 +00:00
if ( r ! = 0 ) goto died_21 ;
2008-03-04 21:31:01 +00:00
if ( actual_sum ! = result - > local_fingerprint ) {
//fprintf(stderr, "%s:%d Corrupted checksum stored=%08x rand=%08x actual=%08x height=%d n_keys=%d\n", __FILE__, __LINE__, result->rand4fingerprint, result->local_fingerprint, actual_sum, result->height, n_in_buf);
return DB_BADFORMAT ;
goto died_21 ;
} else {
//fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height);
}
2007-08-08 20:58:25 +00:00
}
# else
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < n_in_buf ; i + + ) {
bytevec key ; ITEMLEN keylen ;
bytevec val ; ITEMLEN vallen ;
2007-11-19 23:54:17 +00:00
toku_verify_counts ( result ) ;
2007-07-13 19:37:47 +00:00
rbuf_bytes ( & rc , & key , & keylen ) ; /* Returns a pointer into the rbuf. */
rbuf_bytes ( & rc , & val , & vallen ) ;
{
2007-07-24 01:32:03 +00:00
DBT k , v ;
2007-11-29 15:17:46 +00:00
r = toku_pma_insert ( result - > u . l . buffer , toku_fill_dbt ( & k , key , keylen ) , toku_fill_dbt ( & v , val , vallen ) , 0 ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) goto died_21 ;
}
2007-12-04 22:18:21 +00:00
result - > u . l . n_bytes_in_buffer + = keylen + vallen + KEY_VALUE_OVERHEAD + PMA_ITEM_OVERHEAD ;
2007-07-13 19:37:47 +00:00
}
2007-08-08 20:58:25 +00:00
# endif
2007-12-05 19:41:39 +00:00
toku_verify_counts ( result ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-14 17:58:38 +00:00
{
unsigned int n_read_so_far = rc . ndone ;
if ( n_read_so_far + 4 ! = rc . size ) {
r = DB_BADFORMAT ; goto died_21 ;
}
uint32_t crc = toku_crc32 ( toku_null_crc , rc . buf , n_read_so_far ) ;
uint32_t storedcrc = rbuf_int ( & rc ) ;
if ( crc ! = storedcrc ) {
printf ( " Bad CRC \n " ) ;
assert ( 0 ) ; //this is wrong!!!
r = DB_BADFORMAT ;
goto died_21 ;
}
}
2007-07-13 19:37:47 +00:00
//printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children);
2007-07-20 18:00:14 +00:00
toku_free ( rc . buf ) ;
2007-07-13 19:37:47 +00:00
* brtnode = result ;
2007-11-19 23:54:17 +00:00
toku_verify_counts ( result ) ;
2007-07-13 19:37:47 +00:00
return 0 ;
}
2007-11-19 23:54:17 +00:00
/*
 * Consistency check: assert that the byte total recorded in `node` equals
 * the total obtained by adding up its parts.
 *
 * Interior node: the per-child buffer byte counts must add up to
 * n_bytes_in_buffers.  Leaf: the PMA must exist and the per-item sizes must
 * add up to n_bytes_in_buffer.
 */
void toku_verify_counts (BRTNODE node) {
    unsigned int total = 0;

    if (node->height != 0) {
        int child;
        for (child = 0; child < node->u.n.n_children; child++) {
            total += BNC_NBYTESINBUF(node, child);
        }
        // We don't really care if the later buffers have garbage in them. Valgrind would do a better job noticing if we leave them uninitialized.
        // But for now the code always initializes the later tables so they are 0.
        assert(total == node->u.n.n_bytes_in_buffers);
        return;
    }

    assert(node->u.l.buffer);
    PMA_ITERATE(node->u.l.buffer,
                key __attribute__((__unused__)), keylen,
                data __attribute__((__unused__)), datalen,
                total += (PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keylen + datalen));
    assert(total == node->u.l.n_bytes_in_buffer);
}
2007-11-21 13:07:49 +00:00
int toku_serialize_brt_header_size ( struct brt_header * h ) {
unsigned int size = 4 + 4 + 4 + 8 + 8 + 4 ; /* this size, flags, the tree's nodesize, freelist, unused_memory, named_roots. */
2007-07-13 19:37:47 +00:00
if ( h - > n_named_roots < 0 ) {
size + = 8 ;
} else {
2007-11-21 13:07:49 +00:00
int i ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
size + = 12 + 1 + strlen ( h - > names [ i ] ) ;
}
}
2007-11-21 13:07:49 +00:00
return size ;
}
int toku_serialize_brt_header_to_wbuf ( struct wbuf * wbuf , struct brt_header * h ) {
unsigned int size = toku_serialize_brt_header_size ( h ) ; // !!! seems silly to recompute the size when the caller knew it. Do we really need the size?
wbuf_int ( wbuf , size ) ;
wbuf_int ( wbuf , h - > flags ) ;
wbuf_int ( wbuf , h - > nodesize ) ;
2007-11-22 18:45:22 +00:00
wbuf_DISKOFF ( wbuf , h - > freelist ) ;
wbuf_DISKOFF ( wbuf , h - > unused_memory ) ;
2007-11-21 13:07:49 +00:00
wbuf_int ( wbuf , h - > n_named_roots ) ;
2007-07-13 19:37:47 +00:00
if ( h - > n_named_roots > 0 ) {
2007-11-21 13:07:49 +00:00
int i ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
char * s = h - > names [ i ] ;
unsigned int l = 1 + strlen ( s ) ;
2007-11-22 18:45:22 +00:00
wbuf_DISKOFF ( wbuf , h - > roots [ i ] ) ;
2007-11-21 13:07:49 +00:00
wbuf_bytes ( wbuf , s , l ) ;
2007-07-13 19:37:47 +00:00
assert ( l > 0 & & s [ l - 1 ] = = 0 ) ;
}
} else {
2007-11-22 18:45:22 +00:00
wbuf_DISKOFF ( wbuf , h - > unnamed_root ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-21 13:07:49 +00:00
assert ( wbuf - > ndone < = wbuf - > size ) ;
return 0 ;
}
/*
 * Serialize header `h` and write it at offset 0 of `fd`.
 *
 * Returns the result of toku_serialize_brt_header_to_wbuf() on success, or an
 * errno value (ENOMEM if errno is unset) when the staging buffer cannot be
 * allocated.  A short or failed write still trips an assertion.
 */
int toku_serialize_brt_header_to (int fd, struct brt_header *h) {
    struct wbuf w;
    unsigned int size = toku_serialize_brt_header_size(h);
    void *buf = toku_malloc(size);
    if (buf == NULL) {
        /* Do not hand a null buffer to the wbuf writers. */
        return errno ? errno : ENOMEM;
    }
    wbuf_init(&w, buf, size);
    int r = toku_serialize_brt_header_to_wbuf(&w, h);
    assert(w.ndone == size);
    {
        ssize_t nwrote = pwrite(fd, w.buf, w.ndone, 0);
        if (nwrote < 0) perror("pwrite");
        assert((size_t)nwrote == w.ndone);
    }
    toku_free(w.buf);
    return r;
}
2007-11-19 23:54:17 +00:00
/*
 * Read the tree header stored at offset `off` (which must be 0) of `fd`.
 *
 *   fd    file to read from (read with pread)
 *   off   byte offset of the header; asserted to be 0
 *   brth  out: receives the newly allocated header on success, untouched on failure
 *
 * Returns 0 on success.  Returns -1 when the file is empty (the first read
 * returns 0 bytes), EINVAL when the bytes on disk are not a well-formed
 * header, and ENOMEM when an allocation fails.  On every failure path all
 * memory allocated here is released.
 */
int toku_deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth) {
    struct brt_header *MALLOC(h);
    struct rbuf rc;
    int size;
    int sizeagain;
    int ret = -1;
    int n_names_read = 0;   /* h->names[0..n_names_read-1] hold live copies */
    int i;
    assert(off == 0);
    if (h == NULL) return ENOMEM;

    {
        uint32_t size_n;
        ssize_t r = pread(fd, &size_n, sizeof(size_n), off);
        if (r == 0) goto fail_free_header;   /* empty file: ret is still -1 */
        if (r != (ssize_t)sizeof(size_n)) { ret = EINVAL; goto fail_free_header; }
        size = ntohl(size_n);
    }
    /* Reject a nonsensical size before using it to allocate. */
    if (size <= 0) { ret = EINVAL; goto fail_free_header; }
    rc.buf = toku_malloc(size);
    if (rc.buf == NULL) { ret = ENOMEM; goto fail_free_buf_skip; }
    rc.size = size;
    rc.ndone = 0;
    {
        ssize_t r = pread(fd, rc.buf, size, off);
        if (r != size) { ret = EINVAL; goto fail_free_buf; }
    }

    h->dirty = 0;
    h->roots = 0;
    h->names = 0;
    sizeagain = rbuf_int(&rc);
    if (sizeagain != size) { ret = EINVAL; goto fail_free_buf; }
    h->flags = rbuf_int(&rc);
    h->nodesize = rbuf_int(&rc);
    h->freelist = rbuf_diskoff(&rc);
    h->unused_memory = rbuf_diskoff(&rc);
    h->n_named_roots = rbuf_int(&rc);
    if (h->n_named_roots >= 0) {
        MALLOC_N(h->n_named_roots, h->roots);
        if (h->n_named_roots > 0 && h->roots == NULL) { ret = ENOMEM; goto fail_free_arrays; }
        MALLOC_N(h->n_named_roots, h->names);
        if (h->n_named_roots > 0 && h->names == NULL) { ret = ENOMEM; goto fail_free_arrays; }
        for (i = 0; i < h->n_named_roots; i++) {
            bytevec nameptr;
            unsigned int len;
            h->roots[i] = rbuf_diskoff(&rc);
            rbuf_bytes(&rc, &nameptr, &len);
            /* The stored name must be exactly one NUL-terminated string of
             * `len` bytes.  Check the final byte first so strlen() can never
             * run past the end of the stored bytes. */
            if (len == 0
                || ((const char *)nameptr)[len - 1] != 0
                || strlen(nameptr) + 1 != len) {
                ret = EINVAL;
                goto fail_free_arrays;
            }
            h->names[i] = toku_memdup(nameptr, len);
            if (h->names[i] == NULL) { ret = ENOMEM; goto fail_free_arrays; }
            n_names_read = i + 1;
        }
        h->unnamed_root = -1;
    } else {
        h->unnamed_root = rbuf_diskoff(&rc);
    }
    if (rc.ndone != rc.size) { ret = EINVAL; goto fail_free_arrays; }
    toku_free(rc.buf);
    *brth = h;
    return 0;

 fail_free_arrays:
    /* Free the name copies before the array that holds them, and only the
     * slots that were actually filled in. */
    for (i = 0; i < n_names_read; i++) toku_free(h->names[i]);
    if (h->names) toku_free(h->names);
    if (h->roots) toku_free(h->roots);
 fail_free_buf:
    toku_free(rc.buf);
 fail_free_buf_skip:
 fail_free_header:
    toku_free(h);
    return ret;
}
2007-11-14 17:58:38 +00:00
2007-12-06 14:20:47 +00:00
/*
 * Length in bytes of pivot `pk` as counted for tree `brt`: the key alone, or
 * key plus value when the tree has TOKU_DB_DUPSORT set.
 */
unsigned int toku_brt_pivot_key_len (BRT brt, struct kv_pair *pk) {
    unsigned int len = kv_pair_keylen(pk);
    if (brt->flags & TOKU_DB_DUPSORT) {
        len += kv_pair_vallen(pk);
    }
    return len;
}
/*
 * Length in bytes of pivot `pk` as counted for `node`: the key alone, or key
 * plus value when the node has TOKU_DB_DUPSORT set.
 */
unsigned int toku_brtnode_pivot_key_len (BRTNODE node, struct kv_pair *pk) {
    unsigned int len = kv_pair_keylen(pk);
    if (node->flags & TOKU_DB_DUPSORT) {
        len += kv_pair_vallen(pk);
    }
    return len;
}