2007-11-29 14:18:54 +00:00
/* -*- mode: C; c-basic-offset: 4 -*- */
2008-01-24 15:10:32 +00:00
# ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
2007-11-29 14:18:54 +00:00
2007-07-13 19:37:47 +00:00
# define _XOPEN_SOURCE 500
//#include "pma.h"
2008-02-08 03:17:38 +00:00
# include "toku_assert.h"
2007-07-13 19:37:47 +00:00
# include "brt-internal.h"
2007-07-24 01:32:03 +00:00
# include "key.h"
2007-08-21 23:32:17 +00:00
# include "rbuf.h"
# include "wbuf.h"
2008-04-02 23:40:36 +00:00
# include "kv-pair.h"
# include "mempool.h"
2007-08-21 23:32:17 +00:00
2007-07-13 19:37:47 +00:00
# include <unistd.h>
# include <stdio.h>
# include <arpa/inet.h>
2007-11-14 17:58:38 +00:00
2007-11-19 23:54:17 +00:00
/* Number of bytes every serialized node occupies independent of its contents.
 * The terms are listed in the order toku_serialize_brtnode_to() emits them. */
static const int brtnode_header_overhead = (8 + // magic: "tokunode" or "tokuleaf"
                                            4 + // layout_version
                                            8 + // log LSN
                                            4 + // serialized size of the node
                                            4 + // flags
                                            4 + // height
                                            4 + // random for fingerprint
                                            4 + // localfingerprint
                                            4); // crc32 at the end
2007-11-14 17:58:38 +00:00
2007-11-19 23:54:17 +00:00
/* Compute the serialized size of NODE by walking its contents rather than
 * trusting the cached byte counters.
 *
 * Nonleaf nodes: header, subtree fingerprint, child count, one length word
 * per pivot key (two under TOKU_DB_DUPSORT), the pivot bytes, and for every
 * child its fingerprint, disk offset, buffer entry count and buffered
 * commands.  The walk is cross-checked against n_bytes_in_buffers and
 * totalchildkeylens with assertions.
 *
 * Leaf nodes: header, entry count, PMA size word, and the disk size of every
 * leaf entry plus PMA_ITEM_OVERHEAD.
 *
 * Returns the size in bytes. */
static unsigned int toku_serialize_brtnode_size_slow(BRTNODE node) {
    unsigned int size = brtnode_header_overhead;
    if (node->height > 0) {
        unsigned int hsize = 0; /* bytes of buffered commands */
        unsigned int csize = 0; /* bytes of pivot key payload */
        int i;
        int n_buffers = node->u.n.n_children;
        assert(0 <= n_buffers && n_buffers < TREE_FANOUT + 1);
        size += 4; /* subtree fingerprint. */
        size += 4; /* n_children */
        for (i = 0; i < node->u.n.n_children - 1; i++) {
            size += 4; /* length word of the pivot key */
            if (node->flags & TOKU_DB_DUPSORT) size += 4; /* length word of the pivot data */
            csize += toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
        }
        for (i = 0; i < n_buffers; i++) {
            size += 4; // subtree fingerprint of this child
            size += 8; // diskoff
            /* toku_serialize_brtnode_to() writes one entry count in front of
             * every child buffer, so it is counted per child here, matching
             * the (8+4+4) per child used by toku_serialize_brtnode_size(). */
            size += 4; // n_entries
            FIFO_ITERATE(BNC_BUFFER(node, i),
                         key __attribute__((__unused__)), keylen,
                         data __attribute__((__unused__)), datalen,
                         type __attribute__((__unused__)), xid __attribute__((__unused__)),
                         (hsize += BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + keylen + datalen));
        }
        assert(hsize == node->u.n.n_bytes_in_buffers);
        assert(csize == node->u.n.totalchildkeylens);
        return size + hsize + csize;
    } else {
        unsigned int hsize = 0;
        GPMA_ITERATE(node->u.l.buffer,
                     idx, vlen, vdata,
                     ({
                         LEAFENTRY le = vdata;
                         hsize += PMA_ITEM_OVERHEAD + leafentry_disksize(le);
                     }));
        /* The cached counter is only required to be an upper bound here. */
        assert(hsize <= node->u.l.n_bytes_in_buffer);
        hsize += 4; /* the PMA size */
        hsize += 4; /* add n entries in buffer table. */
        return size + hsize;
    }
}
2007-11-19 23:54:17 +00:00
/* Return the number of bytes NODE occupies when serialized.
 *
 * Nonleaf nodes are sized from the cached counters (totalchildkeylens and
 * n_bytes_in_buffers).  Leaf nodes are first sized from n_bytes_in_buffer and
 * then replaced by the recount from toku_serialize_brtnode_size_slow()
 * whenever the two disagree. */
unsigned int toku_serialize_brtnode_size(BRTNODE node) {
    unsigned int result = brtnode_header_overhead;
    assert(sizeof(off_t) == 8);

    if (node->height <= 0) {
        unsigned int recount;
        result += 4; /* n_entries in buffer table. */
        result += 4; /* the pma size */
        result += node->u.l.n_bytes_in_buffer;
        recount = toku_serialize_brtnode_size_slow(node);
        if (recount != result) {
            result = recount;
        }
        return result;
    }

    {
        int n_pivots = node->u.n.n_children - 1;
        result += 4;            /* n_children */
        result += 4;            /* subtree fingerprint */
        result += 4 * n_pivots; /* key lengths */
        if (node->flags & TOKU_DB_DUPSORT) {
            result += 4 * n_pivots; /* data lengths */
        }
        /* the pivot key bytes themselves, without their length words */
        result += node->u.n.totalchildkeylens;
        /* per child: disk offset, buffer entry count, subtree fingerprint */
        result += (8 + 4 + 4) * (node->u.n.n_children);
        result += node->u.n.n_bytes_in_buffers;
    }
    return result;
}
2008-04-07 01:30:25 +00:00
/* Serialize NODE into a zero-padded block of SIZE bytes and write the whole
 * block to FD at offset OFF.
 *
 * Layout written: "toku" + "leaf"/"node" magic, layout version, log LSN,
 * serialized size, flags, height, rand4fingerprint, local fingerprint, then
 * the node body, then a CRC when CRC_ATEND or CRC_INCR is defined.
 *   - nonleaf body: subtree fingerprint, n_children, each child's subtree
 *     fingerprint, the pivot keys, each child's disk offset, and for each
 *     child its entry count followed by the buffered commands.
 *   - leaf body: entry count, GPMA index limit, then (index, leafentry)
 *     for every entry.
 *
 * The function has no way to report an error; every failure (allocation,
 * size mismatch, fingerprint mismatch, failed or short write) trips an
 * assertion. */
void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) {
    struct wbuf w;
    int i;
    unsigned int calculated_size = toku_serialize_brtnode_size(node);
    assert(calculated_size <= size);
    assert(size > 0);
    char *MALLOC_N(size, buf);
    assert(buf != 0); /* buf becomes the wbuf's output buffer; do not serialize into a failed allocation */
    wbuf_init(&w, buf, size);
    wbuf_literal_bytes(&w, "toku", 4);
    if (node->height == 0) wbuf_literal_bytes(&w, "leaf", 4);
    else                   wbuf_literal_bytes(&w, "node", 4);
    wbuf_int(&w, node->layout_version);
    wbuf_ulonglong(&w, node->log_lsn.lsn);
    wbuf_uint(&w, calculated_size);
    wbuf_uint(&w, node->flags);
    wbuf_int(&w, node->height);
    wbuf_uint(&w, node->rand4fingerprint);
    wbuf_uint(&w, node->local_fingerprint);
    if (node->height > 0) {
        assert(node->u.n.n_children > 0);
        /* The subtree fingerprint is not kept in memory: it is the local
         * fingerprint plus the subtree fingerprints of all children. */
        {
            u_int32_t subtree_fingerprint = node->local_fingerprint;
            for (i = 0; i < node->u.n.n_children; i++) {
                subtree_fingerprint += BNC_SUBTREE_FINGERPRINT(node, i);
            }
            wbuf_uint(&w, subtree_fingerprint);
        }
        wbuf_int(&w, node->u.n.n_children);
        for (i = 0; i < node->u.n.n_children; i++) {
            wbuf_uint(&w, BNC_SUBTREE_FINGERPRINT(node, i));
        }
        for (i = 0; i < node->u.n.n_children - 1; i++) {
            if (node->flags & TOKU_DB_DUPSORT) {
                wbuf_bytes(&w, kv_pair_key(node->u.n.childkeys[i]), kv_pair_keylen(node->u.n.childkeys[i]));
                wbuf_bytes(&w, kv_pair_val(node->u.n.childkeys[i]), kv_pair_vallen(node->u.n.childkeys[i]));
            } else {
                wbuf_bytes(&w, kv_pair_key(node->u.n.childkeys[i]), toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]));
            }
        }
        for (i = 0; i < node->u.n.n_children; i++) {
            wbuf_DISKOFF(&w, BNC_DISKOFF(node, i));
        }
        {
            int n_buffers = node->u.n.n_children;
            /* Recomputed from the commands actually written, then compared
             * with the fingerprint stored in the node. */
            u_int32_t check_local_fingerprint = 0;
            for (i = 0; i < n_buffers; i++) {
                wbuf_int(&w, toku_fifo_n_entries(BNC_BUFFER(node, i)));
                FIFO_ITERATE(BNC_BUFFER(node, i), key, keylen, data, datalen, type, xid,
                             ({
                                 wbuf_char(&w, type);
                                 wbuf_TXNID(&w, xid);
                                 wbuf_bytes(&w, key, keylen);
                                 wbuf_bytes(&w, data, datalen);
                                 check_local_fingerprint += node->rand4fingerprint * toku_calccrc32_cmd(type, xid, key, keylen, data, datalen);
                             }));
            }
            if (check_local_fingerprint != node->local_fingerprint) printf("%s:%d node=%lld fingerprint expected=%08x actual=%08x\n", __FILE__, __LINE__, (long long)node->thisnodename, check_local_fingerprint, node->local_fingerprint);
            assert(check_local_fingerprint == node->local_fingerprint);
        }
    } else {
        wbuf_uint(&w, toku_gpma_n_entries(node->u.l.buffer));
        wbuf_uint(&w, toku_gpma_index_limit(node->u.l.buffer));
        GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata,
                     ({
                         wbuf_uint(&w, idx);
                         wbuf_LEAFENTRY(&w, vdata);
                     }));
    }
    assert(w.ndone <= w.size);
#ifdef CRC_ATEND
    wbuf_int(&w, crc32(toku_null_crc, w.buf, w.ndone));
#endif
#ifdef CRC_INCR
    wbuf_uint(&w, w.crc32);
#endif
    /* Verify the size recorded in the header before anything reaches the
     * disk, so a miscounted node is never written out. */
    if (calculated_size != w.ndone)
        printf("%s:%d w.done=%d calculated_size=%d\n", __FILE__, __LINE__, w.ndone, calculated_size);
    assert(calculated_size == w.ndone);
    assert(w.ndone <= size);

    memset(w.buf + w.ndone, 0, (size_t)(size - w.ndone)); // fill with zeros
    {
        ssize_t r = pwrite(fd, w.buf, (size_t)size, off); // write the whole buffer, including the zeros
        if (r < 0) printf("r=%ld errno=%d\n", (long)r, errno);
        assert(r == size);
    }
    toku_free(buf);
}
2008-04-02 23:40:36 +00:00
/* Read the node stored in FD at offset OFF, rebuild it in memory and store it
 * in *BRTNODE.
 *
 * FLAGS must equal the flags stored in the node (asserted).  NODESIZE is
 * recorded in the node and also sizes the leaf memory pool
 * (nodesize + nodesize/4 bytes).
 *
 * Returns 0 on success.  On failure *BRTNODE is left untouched and the
 * return value is:
 *   - DB_BADFORMAT for a wrong magic, layout version, stored size, child
 *     count, fingerprint, trailing length or CRC, or a short read;
 *   - errno for a failed pread or a failed initial allocation;
 *   - ENOMEM for a failed allocation while building the node;
 *   - whatever toku_fifo_create / toku_fifo_enq / toku_gpma_create returned.
 * Everything built so far is released on failure, with the one exception
 * noted at the `fail` label. */
int toku_deserialize_brtnode_from(int fd, DISKOFF off, BRTNODE *brtnode, unsigned int flags, int nodesize) {
    TAGMALLOC(BRTNODE, result);
    struct rbuf rc;
    int i;
    u_int32_t datasize;
    int r;
    /* What has been built so far; consulted only by the cleanup code. */
    int nonleaf_arrays_allocated = 0;
    int n_pivots_built = 0;
    int n_fifos_built = 0;
    int leaf_buffer_built = 0;

    if (result == 0) {
        return errno;
    }
    rc.buf = 0;

    /* The serialized size sits after the magic (8), layout version (4) and LSN (8). */
    {
        u_int32_t datasize_n;
        ssize_t n = pread(fd, &datasize_n, sizeof(datasize_n), off + 8 + 4 + 8);
        if (n != (ssize_t)sizeof(datasize_n)) {
            r = (n == -1) ? errno : DB_BADFORMAT;
            goto fail;
        }
        datasize = ntohl(datasize_n);
        /* A node can never be smaller than its fixed header. */
        if (datasize < (u_int32_t)brtnode_header_overhead || datasize > (1 << 30)) {
            r = DB_BADFORMAT;
            goto fail;
        }
    }

    rc.buf = toku_malloc(datasize);
    if (rc.buf == 0) {
        r = errno;
        goto fail;
    }
    rc.size = datasize;
    assert(rc.size > 0);
    rc.ndone = 0;
    {
        ssize_t rlen = pread(fd, rc.buf, datasize, off);
        if ((size_t)rlen != datasize) {
            /* errno is only meaningful when pread itself failed. */
            r = (rlen < 0) ? errno : DB_BADFORMAT;
            goto fail;
        }
    }
    {
        bytevec tmp;
        rbuf_literal_bytes(&rc, &tmp, 8);
        if (memcmp(tmp, "tokuleaf", 8) != 0
            && memcmp(tmp, "tokunode", 8) != 0) {
            r = DB_BADFORMAT;
            goto fail;
        }
    }
    result->layout_version = rbuf_int(&rc);
    if (result->layout_version != 3) {
        r = DB_BADFORMAT;
        goto fail;
    }
    result->disk_lsn.lsn = rbuf_ulonglong(&rc);
    result->log_lsn = result->disk_lsn;
    {
        unsigned int stored_size = rbuf_int(&rc);
        if (stored_size != datasize) { r = DB_BADFORMAT; goto fail; }
    }
    result->nodesize = nodesize; // How to compute the nodesize?
    result->thisnodename = off;
    result->flags = rbuf_int(&rc);
    assert(result->flags == (unsigned int)flags);
    result->height = rbuf_int(&rc);
    result->rand4fingerprint = rbuf_int(&rc);
    result->local_fingerprint = rbuf_int(&rc);
    result->dirty = 0;

    if (result->height > 0) {
        result->u.n.totalchildkeylens = 0;
        u_int32_t subtree_fingerprint = rbuf_int(&rc);
        u_int32_t check_subtree_fingerprint = 0;
        result->u.n.n_children = rbuf_int(&rc);
        /* Validate the count before it is used to size any allocation. */
        if (result->u.n.n_children < 0 || result->u.n.n_children > TREE_FANOUT) {
            r = DB_BADFORMAT;
            goto fail;
        }
        MALLOC_N(result->u.n.n_children + 1, result->u.n.childinfos);
        if (result->u.n.childinfos == 0) { r = ENOMEM; goto fail; }
        MALLOC_N(result->u.n.n_children, result->u.n.childkeys);
        if (result->u.n.n_children > 0 && result->u.n.childkeys == 0) {
            toku_free(result->u.n.childinfos);
            r = ENOMEM;
            goto fail;
        }
        nonleaf_arrays_allocated = 1;

        for (i = 0; i < result->u.n.n_children; i++) {
            u_int32_t childfp = rbuf_int(&rc);
            BNC_SUBTREE_FINGERPRINT(result, i) = childfp;
            check_subtree_fingerprint += childfp;
        }
        for (i = 0; i < result->u.n.n_children - 1; i++) {
            if (result->flags & TOKU_DB_DUPSORT) {
                bytevec keyptr, dataptr;
                unsigned int keylen, datalen;
                rbuf_bytes(&rc, &keyptr, &keylen);
                rbuf_bytes(&rc, &dataptr, &datalen);
                result->u.n.childkeys[i] = kv_pair_malloc(keyptr, keylen, dataptr, datalen);
            } else {
                bytevec childkeyptr;
                unsigned int cklen;
                rbuf_bytes(&rc, &childkeyptr, &cklen); /* Returns a pointer into the rbuf. */
                result->u.n.childkeys[i] = kv_pair_malloc((void *)childkeyptr, cklen, 0, 0);
            }
            if (result->u.n.childkeys[i] == 0) { r = ENOMEM; goto fail; }
            n_pivots_built = i + 1;
            result->u.n.totalchildkeylens += toku_brtnode_pivot_key_len(result, result->u.n.childkeys[i]);
        }
        for (i = 0; i < result->u.n.n_children; i++) {
            BNC_DISKOFF(result, i) = rbuf_diskoff(&rc);
            BNC_NBYTESINBUF(result, i) = 0;
        }
        result->u.n.n_bytes_in_buffers = 0;
        for (i = 0; i < result->u.n.n_children; i++) {
            r = toku_fifo_create(&BNC_BUFFER(result, i));
            if (r != 0) goto fail;
            n_fifos_built = i + 1;
        }
        {
            int cnum;
            u_int32_t check_local_fingerprint = 0;
            for (cnum = 0; cnum < result->u.n.n_children; cnum++) {
                int n_in_this_hash = rbuf_int(&rc);
                for (i = 0; i < n_in_this_hash; i++) {
                    int diff;
                    bytevec key; ITEMLEN keylen;
                    bytevec val; ITEMLEN vallen;
                    int type = rbuf_char(&rc);
                    TXNID xid = rbuf_ulonglong(&rc);
                    rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */
                    rbuf_bytes(&rc, &val, &vallen);
                    check_local_fingerprint += result->rand4fingerprint * toku_calccrc32_cmd(type, xid, key, keylen, val, vallen);
                    r = toku_fifo_enq(BNC_BUFFER(result, cnum), key, keylen, val, vallen, type, xid); /* Copies the data into the hash table. */
                    if (r != 0) goto fail;
                    diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
                    result->u.n.n_bytes_in_buffers += diff;
                    BNC_NBYTESINBUF(result, cnum) += diff;
                }
            }
            if (check_local_fingerprint != result->local_fingerprint) {
                fprintf(stderr, "%s:%d local fingerprint is wrong (found %8x calcualted %8x\n", __FILE__, __LINE__, result->local_fingerprint, check_local_fingerprint);
                r = DB_BADFORMAT;
                goto fail;
            }
            if (check_subtree_fingerprint + check_local_fingerprint != subtree_fingerprint) {
                fprintf(stderr, "%s:%d subtree fingerprint is wrong\n", __FILE__, __LINE__);
                r = DB_BADFORMAT;
                goto fail;
            }
        }
    } else {
        int n_in_buf = rbuf_int(&rc);
        int index_limit = rbuf_int(&rc);
        result->u.l.n_bytes_in_buffer = 0;
        r = toku_gpma_create(&result->u.l.buffer, index_limit);
        if (r != 0) goto fail;
        leaf_buffer_built = 1;
        {
            int mpsize = nodesize + nodesize / 4;
            void *mp = toku_malloc(mpsize);
            if (mp == 0) { r = ENOMEM; goto fail; }
            toku_mempool_init(&result->u.l.buffer_mempool, mp, mpsize);
        }
        u_int32_t actual_sum = 0;
        for (i = 0; i < n_in_buf; i++) {
            LEAFENTRY tmp_le;
            int idx = rbuf_int(&rc);
            u_int32_t memsize, disksize;
            rbuf_LEAFENTRY(&rc, &memsize, &disksize, &tmp_le);
            /* Move the entry out of the temporary copy into the node's pool. */
            LEAFENTRY le = mempool_malloc_from_gpma(result->u.l.buffer, &result->u.l.buffer_mempool, memsize);
            assert(le);
            memcpy(le, tmp_le, memsize);
            toku_free(tmp_le);
            assert(disksize == leafentry_disksize(le));
            result->u.l.n_bytes_in_buffer += disksize + PMA_ITEM_OVERHEAD;
            toku_gpma_set_at_index(result->u.l.buffer, idx, memsize, le);
            actual_sum += result->rand4fingerprint * toku_le_crc(le);
        }
        if (actual_sum != result->local_fingerprint) {
            r = DB_BADFORMAT;
            goto fail;
        }
    }

    /* Exactly the 4-byte CRC must remain, and it must match the bytes read. */
    {
        unsigned int n_read_so_far = rc.ndone;
        if (n_read_so_far + 4 != rc.size) {
            r = DB_BADFORMAT;
            goto fail;
        }
        uint32_t crc = toku_crc32(toku_null_crc, rc.buf, n_read_so_far);
        uint32_t storedcrc = rbuf_int(&rc);
        if (crc != storedcrc) {
            printf("Bad CRC\n");
            assert(0); //this is wrong!!! (kept from the original; without assertions the error return below is used)
            r = DB_BADFORMAT;
            goto fail;
        }
    }
    toku_free(rc.buf);
    *brtnode = result;
    return 0;

fail:
    /* r holds the error to report; nothing below may overwrite it. */
    for (i = 0; i < n_fifos_built; i++) toku_fifo_free(&BNC_BUFFER(result, i));
    /* NOTE(review): pivot keys come from kv_pair_malloc(); releasing them with
     * toku_free() is assumed to be the matching call -- confirm. */
    for (i = 0; i < n_pivots_built; i++) toku_free(result->u.n.childkeys[i]);
    if (nonleaf_arrays_allocated) {
        if (result->u.n.childkeys) toku_free(result->u.n.childkeys);
        toku_free(result->u.n.childinfos);
    }
    /* NOTE(review): the leaf memory pool handed to toku_mempool_init() is not
     * released here, because nothing visible in this file says how to obtain
     * the pool's current base safely -- add the proper mempool teardown. */
    if (leaf_buffer_built) toku_gpma_free(&result->u.l.buffer, 0, 0);
    if (rc.buf) toku_free(rc.buf);
    toku_free(result);
    return r;
}
2007-11-19 23:54:17 +00:00
/* Assert that NODE's cached counters agree with its contents.
 * Nonleaf: the per-child buffer byte counts must add up to n_bytes_in_buffers.
 * Leaf: the entry count, the total disk size of the entries, and the local
 * fingerprint must all match what a walk of the buffer finds. */
void toku_verify_counts(BRTNODE node) {
    if (node->height != 0) {
        unsigned int total = 0;
        int child;
        // We don't really care if the later buffers have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized.
        // But for now the code always initializes the later tables so they are 0.
        for (child = 0; child < node->u.n.n_children; child++) {
            total += BNC_NBYTESINBUF(node, child);
        }
        assert(total == node->u.n.n_bytes_in_buffers);
        return;
    }

    assert(node->u.l.buffer);
    unsigned int n_bytes = 0;
    unsigned int n_entries = 0;
    u_int32_t crc_sum = 0;
    GPMA_ITERATE(node->u.l.buffer, idx, dlen, ddata,
                 ({
                     n_entries++;
                     n_bytes += PMA_ITEM_OVERHEAD + leafentry_disksize(ddata); // use the disk size, not the memory size.
                     crc_sum += toku_le_crc(ddata);
                 }));
    assert(n_entries == toku_gpma_n_entries(node->u.l.buffer));
    assert(n_bytes == node->u.l.n_bytes_in_buffer);
    /* The stored fingerprint is the random multiplier times the summed CRCs. */
    u_int32_t expected_fp = node->rand4fingerprint * crc_sum;
    assert(expected_fp == node->local_fingerprint);
}
2007-11-21 13:07:49 +00:00
int toku_serialize_brt_header_size ( struct brt_header * h ) {
unsigned int size = 4 + 4 + 4 + 8 + 8 + 4 ; /* this size, flags, the tree's nodesize, freelist, unused_memory, named_roots. */
2007-07-13 19:37:47 +00:00
if ( h - > n_named_roots < 0 ) {
size + = 8 ;
} else {
2007-11-21 13:07:49 +00:00
int i ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
size + = 12 + 1 + strlen ( h - > names [ i ] ) ;
}
}
2007-11-21 13:07:49 +00:00
return size ;
}
int toku_serialize_brt_header_to_wbuf ( struct wbuf * wbuf , struct brt_header * h ) {
unsigned int size = toku_serialize_brt_header_size ( h ) ; // !!! seems silly to recompute the size when the caller knew it. Do we really need the size?
wbuf_int ( wbuf , size ) ;
wbuf_int ( wbuf , h - > flags ) ;
wbuf_int ( wbuf , h - > nodesize ) ;
2007-11-22 18:45:22 +00:00
wbuf_DISKOFF ( wbuf , h - > freelist ) ;
wbuf_DISKOFF ( wbuf , h - > unused_memory ) ;
2007-11-21 13:07:49 +00:00
wbuf_int ( wbuf , h - > n_named_roots ) ;
2007-07-13 19:37:47 +00:00
if ( h - > n_named_roots > 0 ) {
2007-11-21 13:07:49 +00:00
int i ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
char * s = h - > names [ i ] ;
unsigned int l = 1 + strlen ( s ) ;
2007-11-22 18:45:22 +00:00
wbuf_DISKOFF ( wbuf , h - > roots [ i ] ) ;
2007-11-21 13:07:49 +00:00
wbuf_bytes ( wbuf , s , l ) ;
2007-07-13 19:37:47 +00:00
assert ( l > 0 & & s [ l - 1 ] = = 0 ) ;
}
} else {
2007-11-22 18:45:22 +00:00
wbuf_DISKOFF ( wbuf , h - > unnamed_root ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-21 13:07:49 +00:00
assert ( wbuf - > ndone < = wbuf - > size ) ;
return 0 ;
}
/* Serialize header H and write it at offset 0 of FD.
 *
 * Returns the result of toku_serialize_brt_header_to_wbuf() on success,
 * ENOMEM if the scratch buffer cannot be allocated, or the errno reported by
 * a failed pwrite.  A short (but non-failing) write still trips an
 * assertion. */
int toku_serialize_brt_header_to(int fd, struct brt_header *h) {
    struct wbuf w;
    unsigned int size = toku_serialize_brt_header_size(h);
    char *buf = toku_malloc(size);
    if (buf == 0) return ENOMEM;
    wbuf_init(&w, buf, size);
    int r = toku_serialize_brt_header_to_wbuf(&w, h);
    assert(w.ndone == size);
    {
        ssize_t nwrote = pwrite(fd, w.buf, w.ndone, 0);
        if (nwrote < 0) {
            int write_errno = errno; /* saved before perror/toku_free can disturb it */
            perror("pwrite");
            toku_free(w.buf);
            return write_errno;
        }
        assert((size_t)nwrote == w.ndone);
    }
    toku_free(w.buf);
    return r;
}
2007-11-19 23:54:17 +00:00
/* Read the tree header stored at offset OFF (which must be 0) of FD and
 * return a newly allocated header in *BRTH.
 *
 * Returns 0 on success.  On failure *BRTH is untouched, everything allocated
 * here is released, and the return value is:
 *   -1      if the file is empty (the first pread returns 0 bytes);
 *   EINVAL  for a short or failed read, an implausible size, a size field
 *           that disagrees with itself, a name that is not a NUL-terminated
 *           string of the stated length, or unconsumed trailing bytes;
 *   ENOMEM  if an allocation fails. */
int toku_deserialize_brtheader_from(int fd, DISKOFF off, struct brt_header **brth) {
    struct brt_header *MALLOC(h);
    struct rbuf rc;
    int size;
    int sizeagain;
    int ret = -1;
    int n_names_built = 0; /* how many h->names[] entries must be freed on failure */

    assert(off == 0);
    if (h == 0) return ENOMEM;
    h->roots = 0;
    h->names = 0;
    rc.buf = 0;

    {
        uint32_t size_n;
        ssize_t r = pread(fd, &size_n, sizeof(size_n), off);
        if (r == 0) goto fail; /* empty file: ret is still -1 */
        if (r != (ssize_t)sizeof(size_n)) { ret = EINVAL; goto fail; }
        size = ntohl(size_n);
    }
    /* Reject sizes that are negative or smaller than the fixed fields
     * (size, flags, nodesize, freelist, unused_memory, n_named_roots)
     * before allocating or parsing anything. */
    if (size < 4 + 4 + 4 + 8 + 8 + 4) { ret = EINVAL; goto fail; }

    rc.buf = toku_malloc(size);
    if (rc.buf == NULL) { ret = ENOMEM; goto fail; }
    rc.size = size;
    rc.ndone = 0;
    {
        ssize_t r = pread(fd, rc.buf, size, off);
        if (r != size) { ret = EINVAL; goto fail; }
    }
    h->dirty = 0;
    sizeagain = rbuf_int(&rc);
    if (sizeagain != size) { ret = EINVAL; goto fail; }
    h->flags = rbuf_int(&rc);
    h->nodesize = rbuf_int(&rc);
    h->freelist = rbuf_diskoff(&rc);
    h->unused_memory = rbuf_diskoff(&rc);
    h->n_named_roots = rbuf_int(&rc);
    if (h->n_named_roots >= 0) {
        int i;
        MALLOC_N(h->n_named_roots, h->roots);
        if (h->n_named_roots > 0 && h->roots == NULL) { ret = ENOMEM; goto fail; }
        MALLOC_N(h->n_named_roots, h->names);
        if (h->n_named_roots > 0 && h->names == NULL) { ret = ENOMEM; goto fail; }
        for (i = 0; i < h->n_named_roots; i++) {
            bytevec nameptr;
            unsigned int len;
            h->roots[i] = rbuf_diskoff(&rc);
            rbuf_bytes(&rc, &nameptr, &len);
            /* Check the last byte first so strlen() cannot run past the name. */
            if (len == 0
                || ((const char *)nameptr)[len - 1] != 0
                || strlen(nameptr) + 1 != len) {
                ret = EINVAL;
                goto fail;
            }
            h->names[i] = toku_memdup(nameptr, len);
            if (h->names[i] == NULL) { ret = ENOMEM; goto fail; }
            n_names_built = i + 1;
        }
        h->unnamed_root = -1;
    } else {
        h->unnamed_root = rbuf_diskoff(&rc);
    }
    if (rc.ndone != rc.size) { ret = EINVAL; goto fail; }
    toku_free(rc.buf);
    *brth = h;
    return 0;

fail:
    {
        int j;
        /* Free the duplicated names before the array that holds them. */
        for (j = 0; j < n_names_built; j++) toku_free(h->names[j]);
    }
    if (h->names) toku_free(h->names);
    if (h->roots) toku_free(h->roots);
    if (rc.buf) toku_free(rc.buf);
    toku_free(h);
    return ret;
}
2007-11-14 17:58:38 +00:00
2007-12-06 14:20:47 +00:00
/* Length of pivot PK as counted for tree BRT: the key length, plus the value
 * length when the tree has TOKU_DB_DUPSORT set. */
unsigned int toku_brt_pivot_key_len(BRT brt, struct kv_pair *pk) {
    unsigned int len = kv_pair_keylen(pk);
    if (brt->flags & TOKU_DB_DUPSORT) {
        len += kv_pair_vallen(pk);
    }
    return len;
}
/* Length of pivot PK as counted for NODE: the key length, plus the value
 * length when the node has TOKU_DB_DUPSORT set. */
unsigned int toku_brtnode_pivot_key_len(BRTNODE node, struct kv_pair *pk) {
    unsigned int len = kv_pair_keylen(pk);
    if (node->flags & TOKU_DB_DUPSORT) {
        len += kv_pair_vallen(pk);
    }
    return len;
}
2008-04-09 02:45:27 +00:00
// To serialize the fifo, we just write it all at the end of the file.
// For now, just do all the writes as separate system calls.  This function is hardly ever called, and
// we might not be able to allocate a large enough buffer to hold everything,
// and it would be more complex to batch up several writes.
//
// Writes the entry count (4 bytes) at FREEOFF, followed by one record per
// entry: type (1), xid (8), key length (4) + key, value length (4) + value.
// Returns 0 on success, the errno of a failed pwrite, or EIO if the count
// word is only partially written.  A failed allocation or a partially
// written record trips an assertion.
int toku_serialize_fifo_at(int fd, off_t freeoff, FIFO fifo) {
    {
        int size = 4;
        char buf[4];
        struct wbuf w;
        wbuf_init(&w, buf, size);
        wbuf_int(&w, toku_fifo_n_entries(fifo));
        ssize_t r = pwrite(fd, w.buf, size, freeoff);
        if (r < 0) return errno;
        if (r != size) return EIO; /* short write: errno is not set, so do not return it */
        freeoff += size;
    }
    FIFO_ITERATE(fifo, key, keylen, val, vallen, type, xid,
                 ({
                     size_t size = keylen + vallen + 1 + 8 + 4 + 4;
                     char *MALLOC_N(size, buf);
                     assert(buf != 0);
                     struct wbuf w;
                     wbuf_init(&w, buf, size);
                     wbuf_char(&w, type);
                     wbuf_TXNID(&w, xid);
                     wbuf_bytes(&w, key, keylen);
                     wbuf_bytes(&w, val, vallen);
                     assert(w.ndone == size);
                     ssize_t r = pwrite(fd, w.buf, (size_t)size, freeoff);
                     if (r < 0) {
                         int write_errno = errno; /* saved before toku_free can disturb it */
                         toku_free(buf);
                         return write_errno;
                     }
                     assert(r == (ssize_t)size);
                     freeoff += size;
                     toku_free(buf);
                 }));
    return 0;
}
/* Read a 4-byte network-order integer from FD at offset *AT into *RESULT
 * (converted with ntohl) and advance *AT by 4.
 * Returns 0 on success or errno if pread fails; a short read trips an
 * assertion. */
int read_int(int fd, off_t *at, u_int32_t *result) {
    int raw;
    ssize_t nread = pread(fd, &raw, 4, *at);
    if (nread < 0) {
        return errno;
    }
    assert(nread == 4);
    *at += 4;
    *result = ntohl(raw);
    return 0;
}
/* Read one byte from FD at offset *AT into *RESULT and advance *AT by 1.
 * Returns 0 on success or errno if pread fails; reading zero bytes trips an
 * assertion. */
int read_char(int fd, off_t *at, char *result) {
    ssize_t nread = pread(fd, result, 1, *at);
    if (nread < 0) {
        return errno;
    }
    assert(nread == 1);
    *at += 1;
    return 0;
}
/* Read an 8-byte integer from FD at offset *AT as two read_int() words, most
 * significant word first, store it in *RESULT and advance *AT by 8.
 * Returns 0 on success or the first nonzero read_int() result. */
int read_uint64_t(int fd, off_t *at, u_int64_t *result) {
    u_int32_t hi, lo;
    int r = read_int(fd, at, &hi);
    if (r != 0) return r;
    r = read_int(fd, at, &lo);
    if (r != 0) return r;
    *result = (((u_int64_t)hi) << 32) | lo;
    return 0;
}
/* Read LEN bytes from FD at offset *AT into a newly allocated buffer, store
 * the buffer in *DATA and advance *AT by LEN.  The caller releases the buffer
 * with toku_free().
 * Returns 0 on success, or errno if the allocation or the pread fails; in
 * that case *DATA and *AT are untouched and nothing is left allocated.
 * A short read trips an assertion. */
int read_nbytes(int fd, off_t *at, char **data, u_int32_t len) {
    char *result = toku_malloc(len);
    if (result == 0) return errno;
    ssize_t r = pread(fd, result, len, *at);
    if (r < 0) {
        int read_errno = errno; /* saved before toku_free can disturb it */
        toku_free(result);
        return read_errno;
    }
    assert(r == (ssize_t)len);
    (*at) += len;
    *data = result;
    return 0;
}
/* Rebuild a FIFO from the records stored in FD starting at offset AT (entry
 * count, then type / xid / key / value per entry) and return it in *FIFO.
 * Returns 0 on success.  On failure returns the first nonzero result of
 * toku_fifo_create, the read_* helpers or toku_fifo_enq; *FIFO is untouched
 * and the partially built FIFO and any key/value buffers are released. */
int toku_deserialize_fifo_at(int fd, off_t at, FIFO *fifo) {
    FIFO result;
    int r = toku_fifo_create(&result);
    if (r) return r;
    u_int32_t count;
    if ((r = read_int(fd, &at, &count))) goto fail;
    u_int32_t i;
    for (i = 0; i < count; i++) {
        char type;
        TXNID xid;
        u_int32_t keylen, vallen;
        /* read_nbytes() only stores a pointer on success, so these stay 0
         * for any buffer that was never read. */
        char *key = 0, *val = 0;
        r = read_char(fd, &at, &type);
        if (r == 0) r = read_uint64_t(fd, &at, &xid);
        if (r == 0) r = read_int(fd, &at, &keylen);
        if (r == 0) r = read_nbytes(fd, &at, &key, keylen);
        if (r == 0) r = read_int(fd, &at, &vallen);
        if (r == 0) r = read_nbytes(fd, &at, &val, vallen);
        if (r == 0) r = toku_fifo_enq(result, key, keylen, val, vallen, type, xid);
        /* The temporary copies are released whether or not the entry was queued. */
        if (key) toku_free(key);
        if (val) toku_free(val);
        if (r != 0) goto fail;
    }
    *fifo = result;
    return 0;

fail:
    toku_fifo_free(&result);
    return r;
}