2007-11-29 14:18:54 +00:00
/* -*- mode: C; c-basic-offset: 4 -*- */
2008-01-24 15:10:32 +00:00
# ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
2007-11-29 14:18:54 +00:00
2013-04-16 23:57:20 -04:00
# include "includes.h"
2007-11-14 17:58:38 +00:00
2013-04-16 23:57:18 -04:00
#if 0
2013-04-16 23:57:18 -04:00
/* Convert a 64-bit value from network (big-endian) byte order to host order.
 *
 * v is viewed as two 32-bit words in memory order: the first word carries the
 * most significant half of the big-endian value, the second the least
 * significant half.  Each word is converted with ntohl() before the halves are
 * recombined, so the result is correct on both little- and big-endian hosts
 * (on a big-endian host this is the identity).
 */
static u_int64_t ntohll(u_int64_t v) {
    union {
        u_int32_t half[2];
        u_int64_t whole;
    } uv;
    uv.whole = v;
    // half[0] is the first four bytes in memory, i.e. the high-order word on the wire.
    return (((u_int64_t)ntohl(uv.half[0])) << 32) + ntohl(uv.half[1]);
}
2013-04-16 23:57:18 -04:00
# endif
2013-04-16 23:57:18 -04:00
2013-04-16 23:57:18 -04:00
/* Return the smaller of two unsigned 64-bit values. */
static u_int64_t umin64(u_int64_t a, u_int64_t b) {
    return (b <= a) ? b : a;
}
/* Round a up to the next multiple of b (a itself if it is already a multiple).
 * b must be nonzero; the intermediate sum a+b-1 uses ordinary unsigned
 * wrap-around arithmetic.
 */
static inline u_int64_t alignup(u_int64_t a, u_int64_t b) {
    u_int64_t padded = a + b - 1;
    u_int64_t whole_units = padded / b;
    return whole_units * b;
}
static void maybe_preallocate_in_file ( int fd , u_int64_t size ) {
struct stat sbuf ;
{
int r = fstat ( fd , & sbuf ) ;
assert ( r = = 0 ) ;
}
assert ( sbuf . st_size > = 0 ) ;
if ( ( size_t ) sbuf . st_size < size ) {
const int N = umin64 ( size , 16 < < 20 ) ; // Double the size of the file, or add 16MB, whichever is less.
char * MALLOC_N ( N , wbuf ) ;
memset ( wbuf , 0 , N ) ;
off_t start_write = alignup ( sbuf . st_size , 4096 ) ;
assert ( start_write > = sbuf . st_size ) ;
ssize_t r = pwrite ( fd , wbuf , N , start_write ) ;
assert ( r = = N ) ;
2013-04-16 23:57:19 -04:00
toku_free ( wbuf ) ;
2013-04-16 23:57:18 -04:00
}
}
// This mutex protects pwrite from running in parallel, and also protects modifications to the block allocator.
static pthread_mutex_t pwrite_mutex = PTHREAD_MUTEX_INITIALIZER ;
// Debugging flag: set to 1 by lock_for_pwrite() right after it acquires pwrite_mutex and
// reset to 0 by unlock_for_pwrite() just before it releases it.  toku_pwrite() asserts that
// it is nonzero.
static int pwrite_is_locked = 0 ;
// Acquire pwrite_mutex (asserting that the lock call succeeds) and then
// record in pwrite_is_locked that the mutex is held.
static inline void
lock_for_pwrite(void) {
    int lock_status = pthread_mutex_lock(&pwrite_mutex);
    assert(lock_status == 0);
    pwrite_is_locked = 1;
}
// Clear pwrite_is_locked while the mutex is still held, then release
// pwrite_mutex (asserting that the unlock call succeeds).
static inline void
unlock_for_pwrite(void) {
    pwrite_is_locked = 0;
    int unlock_status = pthread_mutex_unlock(&pwrite_mutex);
    assert(unlock_status == 0);
}
2013-04-16 23:57:19 -04:00
static ssize_t
2013-04-16 23:57:18 -04:00
toku_pwrite ( int fd , const void * buf , size_t count , off_t offset )
// requires that the pwrite has been locked
{
assert ( pwrite_is_locked ) ;
maybe_preallocate_in_file ( fd , offset + count ) ;
return pwrite ( fd , buf , count , offset ) ;
}
2013-04-16 23:57:16 -04:00
// Fixed number of bytes counted for every serialized node, before any
// per-child or per-entry data is added.  Note that this total INCLUDES the two
// 4-byte length words (compressed and uncompressed data size);
// toku_serialize_brtnode_to() subtracts those 8 bytes again when it sizes the
// in-memory image it builds.
static const int brtnode_header_overhead =
      8    // magic "tokunode" or "tokuleaf"
    + 4    // layout version
    + 8    // lsn
    + 4    // nodesize
    + 4    // compressed data size
    + 4    // uncompressed data size
    + 4    // flags
    + 4    // height
    + 4    // random for fingerprint
    + 4    // localfingerprint
    + 4;   // checksum at the end
2007-11-14 17:58:38 +00:00
2013-04-16 23:57:18 -04:00
// Forward declaration; the definition is not in this part of the file.
// NOTE(review): judging by the signature this presumably reads a FIFO stored at file offset
// `at` of `fd` into *fifo and returns 0 on success -- confirm against the definition.
static int deserialize_fifo_at ( int fd , off_t at , FIFO * fifo ) ;
2013-04-16 23:57:19 -04:00
// toku_omt_iterate() callback: adds the on-disk footprint of one leaf entry
// (OMT_ITEM_OVERHEAD plus leafentry_disksize) to the unsigned int that vp
// points to.  Always returns 0.
static int
addupsize(OMTVALUE lev, u_int32_t UU(idx), void *vp) {
    unsigned int *running_total = vp;
    LEAFENTRY entry = lev;
    *running_total += OMT_ITEM_OVERHEAD + leafentry_disksize(entry);
    return 0;
}
2008-04-09 19:11:15 +00:00
// Compute a node's serialized size by walking its contents instead of
// trusting the cached byte counters, and assert that the walk agrees with
// those counters.  Used as a cross-check for toku_serialize_brtnode_size().
static unsigned int toku_serialize_brtnode_size_slow(BRTNODE node) {
    unsigned int size = brtnode_header_overhead;

    if (!(node->height > 0)) {
        // Leaf: sum the footprint of every entry in the OMT.
        unsigned int entry_bytes = 0;
        toku_omt_iterate(node->u.l.buffer,
                         addupsize,
                         &entry_bytes);
        assert(entry_bytes <= node->u.l.n_bytes_in_buffer);
        entry_bytes += 4; /* the count of entries in the buffer */
        return size + entry_bytes;
    }

    // Nonleaf: fixed fields, pivot keys, per-child records, and buffered messages.
    unsigned int msg_bytes = 0;
    unsigned int pivot_bytes = 0;
    int child;

    size += 4; /* n_children */
    size += 4; /* subtree fingerprint. */
    size += 4 * (node->u.n.n_children - 1); /* key lengths */
    if (node->flags & TOKU_DB_DUPSORT) {
        size += 4 * (node->u.n.n_children - 1); /* one more length word per pivot */
    }
    for (child = 0; child < node->u.n.n_children - 1; child++) {
        pivot_bytes += toku_brtnode_pivot_key_len(node, node->u.n.childkeys[child]);
    }
    /* Per child: a child offset (8), a count of buffered entries (4),
       the subtree fingerprint (4), and the leafentry estimate (8). */
    size += (8 + 4 + 4 + 8) * (node->u.n.n_children);

    int n_buffers = node->u.n.n_children;
    assert(0 <= n_buffers && n_buffers < TREE_FANOUT + 1);
    for (child = 0; child < n_buffers; child++) {
        FIFO_ITERATE(BNC_BUFFER(node, child),
                     key __attribute__((__unused__)), keylen,
                     data __attribute__((__unused__)), datalen,
                     type __attribute__((__unused__)), xid __attribute__((__unused__)),
                     (msg_bytes += BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + keylen + datalen));
    }
    assert(msg_bytes == node->u.n.n_bytes_in_buffers);
    assert(pivot_bytes == node->u.n.totalchildkeylens);
    return size + msg_bytes + pivot_bytes;
}
2013-04-16 23:57:16 -04:00
// This is the size of the uncompressed data, including the uncompressed header, and including the 4 bytes for the information about how big is the compressed version, and how big is the uncompressed version.
2007-11-19 23:54:17 +00:00
unsigned int toku_serialize_brtnode_size ( BRTNODE node ) {
2007-11-14 17:58:38 +00:00
unsigned int result = brtnode_header_overhead ;
2007-07-13 19:37:47 +00:00
assert ( sizeof ( off_t ) = = 8 ) ;
if ( node - > height > 0 ) {
2007-11-14 17:58:38 +00:00
result + = 4 ; /* subtree fingerpirnt */
2013-04-16 23:57:24 -04:00
result + = 4 ; /* n_children */
2008-01-08 21:43:11 +00:00
result + = 4 * ( node - > u . n . n_children - 1 ) ; /* key lengths*/
2007-11-27 18:16:45 +00:00
if ( node - > flags & TOKU_DB_DUPSORT ) result + = 4 * ( node - > u . n . n_children - 1 ) ; /* data lengths */
2013-04-16 23:57:24 -04:00
assert ( node - > u . n . totalchildkeylens < ( 1 < < 30 ) ) ;
2007-07-13 19:37:47 +00:00
result + = node - > u . n . totalchildkeylens ; /* the lengths of the pivot keys, without their key lengths. */
2008-04-30 13:23:04 +00:00
result + = ( 8 + 4 + 4 + 8 ) * ( node - > u . n . n_children ) ; /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and the leafentry_estimate. */
2008-01-11 14:03:33 +00:00
result + = node - > u . n . n_bytes_in_buffers ;
2007-07-13 19:37:47 +00:00
} else {
2008-04-22 20:39:50 +00:00
result + = 4 ; /* n_entries in buffer table. */
2007-07-13 19:37:47 +00:00
result + = node - > u . l . n_bytes_in_buffer ;
2007-11-29 15:34:49 +00:00
if ( toku_memory_check ) {
2007-11-19 23:54:17 +00:00
unsigned int slowresult = toku_serialize_brtnode_size_slow ( node ) ;
2013-04-16 23:57:19 -04:00
if ( result ! = slowresult ) printf ( " %s:%d result=%u slowresult=%u \n " , __FILE__ , __LINE__ , result , slowresult ) ;
2007-07-13 19:37:47 +00:00
assert ( result = = slowresult ) ;
}
}
return result ;
}
2013-04-16 23:57:19 -04:00
// toku_omt_iterate() callback: appends one leaf entry to the struct wbuf that
// v points to.  Always returns 0.
static int
wbufwriteleafentry(OMTVALUE lev, u_int32_t UU(idx), void *v) {
    struct wbuf *out = v;
    LEAFENTRY entry = lev;
    wbuf_LEAFENTRY(out, entry);
    return 0;
}
2013-04-16 23:57:20 -04:00
// Number of leading bytes of a serialized node that are stored uncompressed,
// ahead of the two length words.
enum {
    uncompressed_magic_len = 8    // "tokuleaf" or "tokunode"
                           + 4    // version
                           + 8    // lsn
};
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:20 -04:00
// Size of the two length words stored between the uncompressed prefix and the
// compressed payload.
enum {
    compression_header_len = 4    // compressed_len
                           + 4    // uncompressed_len
};
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:18 -04:00
/* Serialize node, compress it, and write it to fd as block blocknum.
 *
 * The uncompressed image is built in a wbuf:
 *   magic ("tokuleaf" or "tokunode"), layout version, lsn, nodesize, flags,
 *   height, rand4fingerprint, local_fingerprint, then either
 *     nonleaf: subtree fingerprint, n_children, per-child (subtree fingerprint,
 *              leafentry estimate), the pivot keys, the child block numbers,
 *              and for every child buffer an entry count followed by its
 *              messages (type, xid, key, data);
 *     leaf:    the entry count followed by every leaf entry;
 *   and finally a checksum (when CRC_ATEND or CRC_INCR is defined).
 *
 * What is written to disk is:
 *   the first uncompressed_magic_len bytes of that image, verbatim,
 *   compressed_len (4 bytes, network order), uncompressed_len (4 bytes, network order),
 *   the zlib-compressed remainder of the image.
 *
 * Under the pwrite lock, any block previously recorded for blocknum in
 * h->block_translation is freed, a new block is allocated from
 * h->block_allocator, the translation entry is updated, h is marked dirty,
 * and the data is written.  Allocation, compression and I/O failures trip
 * assertions; the function has no error return.
 */
void toku_serialize_brtnode_to(int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_header *h) {
    struct wbuf w;
    int i;
    // The two length words are emitted separately below, so they are not part of the wbuf image.
    unsigned int calculated_size = toku_serialize_brtnode_size(node) - 8;
    char *MALLOC_N(calculated_size, buf);
    assert(buf);

    wbuf_init(&w, buf, calculated_size);
    wbuf_literal_bytes(&w, "toku", 4);
    if (node->height == 0) wbuf_literal_bytes(&w, "leaf", 4);
    else                   wbuf_literal_bytes(&w, "node", 4);
    wbuf_int(&w, BRT_LAYOUT_VERSION);
    wbuf_ulonglong(&w, node->log_lsn.lsn);
    wbuf_uint(&w, node->nodesize);
    wbuf_uint(&w, node->flags);
    wbuf_int(&w, node->height);
    wbuf_uint(&w, node->rand4fingerprint);
    wbuf_uint(&w, node->local_fingerprint);

    if (node->height > 0) {
        assert(node->u.n.n_children > 0);
        // The subtree fingerprint is not taken from the node; it is the local
        // fingerprint plus the subtree fingerprints of all children.
        {
            u_int32_t subtree_fingerprint = node->local_fingerprint;
            for (i = 0; i < node->u.n.n_children; i++) {
                subtree_fingerprint += BNC_SUBTREE_FINGERPRINT(node, i);
            }
            wbuf_uint(&w, subtree_fingerprint);
        }
        wbuf_int(&w, node->u.n.n_children);
        for (i = 0; i < node->u.n.n_children; i++) {
            wbuf_uint(&w, BNC_SUBTREE_FINGERPRINT(node, i));
            wbuf_ulonglong(&w, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i));
        }
        // Pivot keys: key and value for DUPSORT trees, key only otherwise.
        for (i = 0; i < node->u.n.n_children - 1; i++) {
            if (node->flags & TOKU_DB_DUPSORT) {
                wbuf_bytes(&w, kv_pair_key(node->u.n.childkeys[i]), kv_pair_keylen(node->u.n.childkeys[i]));
                wbuf_bytes(&w, kv_pair_val(node->u.n.childkeys[i]), kv_pair_vallen(node->u.n.childkeys[i]));
            } else {
                wbuf_bytes(&w, kv_pair_key(node->u.n.childkeys[i]), toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]));
            }
        }
        for (i = 0; i < node->u.n.n_children; i++) {
            wbuf_BLOCKNUM(&w, BNC_BLOCKNUM(node, i));
        }
        {
            int n_buffers = node->u.n.n_children;
            // Recomputed from the buffered messages and compared with the stored value below.
            u_int32_t check_local_fingerprint = 0;
            for (i = 0; i < n_buffers; i++) {
                wbuf_int(&w, toku_fifo_n_entries(BNC_BUFFER(node, i)));
                FIFO_ITERATE(BNC_BUFFER(node, i), key, keylen, data, datalen, type, xid,
                             {
                                 assert(type >= 0 && type < 256);
                                 wbuf_char(&w, (unsigned char)type);
                                 wbuf_TXNID(&w, xid);
                                 wbuf_bytes(&w, key, keylen);
                                 wbuf_bytes(&w, data, datalen);
                                 check_local_fingerprint += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
                             });
            }
            if (check_local_fingerprint != node->local_fingerprint) printf(" %s:%d node=% " PRId64 " fingerprint expected=%08x actual=%08x \n ", __FILE__, __LINE__, node->thisnodename.b, check_local_fingerprint, node->local_fingerprint);
            assert(check_local_fingerprint == node->local_fingerprint);
        }
    } else {
        wbuf_uint(&w, toku_omt_size(node->u.l.buffer));
        toku_omt_iterate(node->u.l.buffer, wbufwriteleafentry, &w);
    }
    assert(w.ndone <= w.size);
#ifdef CRC_ATEND
    wbuf_int(&w, crc32(toku_null_crc, w.buf, w.ndone));
#endif
#ifdef CRC_INCR
    {
        u_int32_t checksum = x1764_finish(&w.checksum);
        wbuf_uint(&w, checksum);
    }
#endif
    if (calculated_size != w.ndone)
        printf(" %s:%d w.done=%u calculated_size=%u \n ", __FILE__, __LINE__, w.ndone, calculated_size);
    assert(calculated_size == w.ndone);

    // The uncompressed part of the on-disk header is
    //   tokuleaf(8),
    //   version(4),
    //   lsn(8),
    //   compressed_len(4),   [length of the compressed data only]
    //   uncompressed_len(4)  [length of the data that was compressed, not counting the header]
    // Everything after the first uncompressed_magic_len bytes of the image is compressed.
    uLongf uncompressed_len = calculated_size - uncompressed_magic_len;
    uLongf compressed_len = compressBound(uncompressed_len);
    char *MALLOC_N(compressed_len + uncompressed_magic_len + compression_header_len, compressed_buf);
    assert(compressed_buf);

    memcpy(compressed_buf, buf, uncompressed_magic_len);
    {
        // compress2 updates compressed_len to the number of bytes actually produced.
        int r = compress2(((Bytef *)compressed_buf) + uncompressed_magic_len + compression_header_len, &compressed_len,
                          ((Bytef *)buf) + uncompressed_magic_len, calculated_size - uncompressed_magic_len,
                          1);
        assert(r == Z_OK);
    }
    {
        // Store both length words in network byte order right after the uncompressed prefix.
        u_int32_t len_words[2];
        len_words[0] = htonl(compressed_len);
        len_words[1] = htonl(uncompressed_len);
        memcpy(compressed_buf + uncompressed_magic_len, len_words, sizeof(len_words));
    }

    {
        lock_for_pwrite();
        assert(blocknum.b >= 0);
        extend_block_translation(blocknum, h);
        // Release the block this blocknum previously mapped to, if any.
        if (h->block_translation[blocknum.b].size > 0) {
            block_allocator_free_block(h->block_allocator, h->block_translation[blocknum.b].diskoff);
            h->block_translation[blocknum.b].diskoff = 0;
            h->block_translation[blocknum.b].size = 0;
        }
        h->dirty = 1; // Allocating a block dirties the header.
        size_t n_to_write = uncompressed_magic_len + compression_header_len + compressed_len;
        u_int64_t offset;
        block_allocator_alloc_block(h->block_allocator, n_to_write, &offset);
        h->block_translation[blocknum.b].diskoff = offset;
        h->block_translation[blocknum.b].size = n_to_write;
        ssize_t r = toku_pwrite(fd, compressed_buf, n_to_write, offset);
        if (r < 0) printf(" r=%ld errno=%d \n ", (long)r, errno);
        assert(r == (ssize_t)n_to_write);
        unlock_for_pwrite();
    }
    assert(w.ndone == calculated_size);
    toku_free(buf);
    toku_free(compressed_buf);
}
2013-04-16 23:57:18 -04:00
/* Read block blocknum from fd, decompress it, and build an in-memory node.
 *
 * The block's file offset comes from h->block_translation.  The on-disk form
 * is an uncompressed prefix (magic, layout version, lsn), two 4-byte length
 * words in network order (compressed and uncompressed length), and the
 * zlib-compressed remainder.
 *
 * On success stores the new node in *brtnode and returns 0.  For leaves the
 * decompressed buffer becomes the backing store of the node's mempool; for
 * nonleaf nodes it is freed before returning.
 *
 * On failure returns errno (allocation of the node or a failed header read)
 * or DB_BADFORMAT (short header, implausible lengths, bad magic, unsupported
 * layout version, fingerprint or size mismatch, FIFO/OMT construction failure).
 * Everything allocated so far is released and *brtnode is left untouched.
 * A short read of the compressed data, a decompression failure, an allocation
 * failure of the working buffers, or a checksum mismatch trips an assertion.
 */
int toku_deserialize_brtnode_from(int fd, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h) {
    assert(0 <= blocknum.b && (u_int64_t)blocknum.b < h->translated_blocknum_limit);
    DISKOFF offset = h->block_translation[blocknum.b].diskoff;
    TAGMALLOC(BRTNODE, result);
    struct rbuf rc;
    int i;
    int j;
    int r;
    // Bookkeeping for the single cleanup path at the end of the function.
    int n_fifos_created = 0;   // child FIFOs successfully created (nonleaf)
    int n_pivots_created = 0;  // pivot keys allocated (nonleaf)
    int have_child_arrays = 0; // childinfos/childkeys arrays allocated (nonleaf)
    int have_omt = 0;          // leaf OMT created

    if (result == 0) {
        return errno;
    }
    rc.buf = NULL;
    result->ever_been_written = 1;

    char uncompressed_header[uncompressed_magic_len + compression_header_len];
    u_int32_t compressed_size;
    u_int32_t uncompressed_size;
    {
        // Read the uncompressed prefix together with the two length words.
        ssize_t n_read = pread(fd, uncompressed_header, sizeof(uncompressed_header), offset);
        if (n_read != (ssize_t)sizeof(uncompressed_header)) {
            r = (n_read == -1) ? errno : DB_BADFORMAT;
            goto died;
        }
        // Copy the length words out rather than dereferencing a cast pointer:
        // the char array carries no alignment guarantee for u_int32_t.
        u_int32_t len_words[2];
        memcpy(len_words, &uncompressed_header[uncompressed_magic_len], sizeof(len_words));
        compressed_size = ntohl(len_words[0]);
        if (compressed_size == 0 || compressed_size > (1 << 30)) { r = DB_BADFORMAT; goto died; }
        uncompressed_size = ntohl(len_words[1]);
        if (uncompressed_size == 0 || uncompressed_size > (1 << 30)) { r = DB_BADFORMAT; goto died; }
    }

    unsigned char *MALLOC_N(compressed_size, compressed_data);
    assert(compressed_data);
    {
        ssize_t rlen = pread(fd, compressed_data, compressed_size, offset + uncompressed_magic_len + compression_header_len);
        assert((size_t)rlen == compressed_size);
    }

    // Rebuild the full uncompressed image: the verbatim prefix followed by the inflated data.
    rc.size = uncompressed_size + uncompressed_magic_len;
    assert(rc.size > 0);
    rc.buf = toku_malloc(rc.size);
    assert(rc.buf);
    memcpy(rc.buf, uncompressed_header, uncompressed_magic_len);
    {
        uLongf destlen = uncompressed_size;
        r = uncompress(rc.buf + uncompressed_magic_len, &destlen,
                       compressed_data, compressed_size);
        assert(destlen == uncompressed_size);
        assert(r == Z_OK);
    }
    toku_free(compressed_data);
    rc.ndone = 0;

    {
        bytevec tmp;
        rbuf_literal_bytes(&rc, &tmp, 8);
        if (memcmp(tmp, "tokuleaf", 8) != 0
            && memcmp(tmp, "tokunode", 8) != 0) {
            r = DB_BADFORMAT;
            goto died;
        }
    }
    result->layout_version = rbuf_int(&rc);
    if (result->layout_version != BRT_LAYOUT_VERSION_9) {
        // Don't support older versions.
        r = DB_BADFORMAT;
        goto died;
    }
    result->disk_lsn.lsn = rbuf_ulonglong(&rc);
    result->nodesize = rbuf_int(&rc);
    result->log_lsn = result->disk_lsn;

    result->thisnodename = blocknum;
    result->flags = rbuf_int(&rc);
    result->height = rbuf_int(&rc);
    result->rand4fingerprint = rbuf_int(&rc);
    result->local_fingerprint = rbuf_int(&rc);
    result->dirty = 0;
    result->fullhash = fullhash;

    if (result->height > 0) {
        result->u.n.totalchildkeylens = 0;
        u_int32_t subtree_fingerprint = rbuf_int(&rc);
        u_int32_t check_subtree_fingerprint = 0;
        result->u.n.n_children = rbuf_int(&rc);
        // Validate the count before it is used to size allocations.
        assert(result->u.n.n_children >= 0);
        MALLOC_N(result->u.n.n_children + 1, result->u.n.childinfos);
        MALLOC_N(result->u.n.n_children, result->u.n.childkeys);
        have_child_arrays = 1;
        assert(result->u.n.childinfos);
        assert(result->u.n.n_children == 0 || result->u.n.childkeys);

        for (i = 0; i < result->u.n.n_children; i++) {
            u_int32_t childfp = rbuf_int(&rc);
            BNC_SUBTREE_FINGERPRINT(result, i) = childfp;
            check_subtree_fingerprint += childfp;
            BNC_SUBTREE_LEAFENTRY_ESTIMATE(result, i) = rbuf_ulonglong(&rc);
        }
        for (i = 0; i < result->u.n.n_children - 1; i++) {
            if (result->flags & TOKU_DB_DUPSORT) {
                bytevec keyptr, dataptr;
                unsigned int keylen, datalen;
                rbuf_bytes(&rc, &keyptr, &keylen);
                rbuf_bytes(&rc, &dataptr, &datalen);
                result->u.n.childkeys[i] = kv_pair_malloc(keyptr, keylen, dataptr, datalen);
            } else {
                bytevec childkeyptr;
                unsigned int cklen;
                rbuf_bytes(&rc, &childkeyptr, &cklen); /* Returns a pointer into the rbuf. */
                result->u.n.childkeys[i] = kv_pair_malloc((void *)childkeyptr, cklen, 0, 0);
            }
            n_pivots_created = i + 1;
            result->u.n.totalchildkeylens += toku_brtnode_pivot_key_len(result, result->u.n.childkeys[i]);
        }
        for (i = 0; i < result->u.n.n_children; i++) {
            BNC_BLOCKNUM(result, i) = rbuf_blocknum(&rc);
            BNC_HAVE_FULLHASH(result, i) = FALSE;
            BNC_NBYTESINBUF(result, i) = 0;
        }
        result->u.n.n_bytes_in_buffers = 0;
        for (i = 0; i < result->u.n.n_children; i++) {
            r = toku_fifo_create(&BNC_BUFFER(result, i));
            if (r != 0) {
                r = DB_BADFORMAT;
                goto died;
            }
            n_fifos_created = i + 1;
        }
        {
            int cnum;
            int msgnum;
            u_int32_t check_local_fingerprint = 0;
            for (cnum = 0; cnum < result->u.n.n_children; cnum++) {
                int n_in_this_hash = rbuf_int(&rc);
                for (msgnum = 0; msgnum < n_in_this_hash; msgnum++) {
                    int diff;
                    bytevec key; ITEMLEN keylen;
                    bytevec val; ITEMLEN vallen;
                    int type = rbuf_char(&rc);
                    TXNID xid = rbuf_ulonglong(&rc);
                    rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */
                    rbuf_bytes(&rc, &val, &vallen);
                    check_local_fingerprint += result->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, key, keylen, val, vallen);
                    r = toku_fifo_enq(BNC_BUFFER(result, cnum), key, keylen, val, vallen, type, xid); /* Copies the data into the fifo. */
                    if (r != 0) {
                        r = DB_BADFORMAT;
                        goto died;
                    }
                    diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
                    result->u.n.n_bytes_in_buffers += diff;
                    BNC_NBYTESINBUF(result, cnum) += diff;
                }
            }
            if (check_local_fingerprint != result->local_fingerprint) {
                fprintf(stderr, " %s:%d local fingerprint is wrong (found %8x calcualted %8x \n ", __FILE__, __LINE__, result->local_fingerprint, check_local_fingerprint);
                r = DB_BADFORMAT;
                goto died;
            }
            if (check_subtree_fingerprint + check_local_fingerprint != subtree_fingerprint) {
                fprintf(stderr, " %s:%d subtree fingerprint is wrong \n ", __FILE__, __LINE__);
                r = DB_BADFORMAT;
                goto died;
            }
        }
    } else {
        int n_in_buf = rbuf_int(&rc);
        result->u.l.n_bytes_in_buffer = 0;
        result->u.l.seqinsert = 0;

        // The leaf entries stay where they are inside rc.buf; the mempool is laid over that buffer.
        toku_mempool_init(&result->u.l.buffer_mempool, rc.buf, uncompressed_size + uncompressed_magic_len);
        u_int32_t actual_sum = 0;
        u_int32_t start_of_data = rc.ndone;
        OMTVALUE *MALLOC_N(n_in_buf, array);
        for (i = 0; i < n_in_buf; i++) {
            LEAFENTRY le = (LEAFENTRY)(&rc.buf[rc.ndone]);
            u_int32_t disksize = leafentry_disksize(le);
            rc.ndone += disksize;
            assert(rc.ndone <= rc.size);
            array[i] = (OMTVALUE)le;
            actual_sum += x1764_memory(le, disksize);
        }
        u_int32_t end_of_data = rc.ndone;
        result->u.l.n_bytes_in_buffer += end_of_data - start_of_data + n_in_buf * OMT_ITEM_OVERHEAD;
        actual_sum *= result->rand4fingerprint;
        r = toku_omt_create_from_sorted_array(&result->u.l.buffer, array, n_in_buf);
        toku_free(array);
        if (r != 0) {
            r = DB_BADFORMAT;
            goto died;
        }
        have_omt = 1;
        result->u.l.buffer_mempool.frag_size = start_of_data;
        result->u.l.buffer_mempool.free_offset = end_of_data;
        if (actual_sum != result->local_fingerprint) {
            r = DB_BADFORMAT;
            goto died;
        }
    }

    {
        // Exactly the 4-byte checksum must remain, and it must match the data read so far.
        unsigned int n_read_so_far = rc.ndone;
        if (n_read_so_far + 4 != rc.size) {
            r = DB_BADFORMAT;
            goto died;
        }
        uint32_t crc = x1764_memory(rc.buf, n_read_so_far);
        uint32_t storedcrc = rbuf_int(&rc);
        if (crc != storedcrc) {
            printf(" Bad CRC \n ");
            printf(" %s:%d crc=%08x stored=%08x \n ", __FILE__, __LINE__, crc, storedcrc);
            assert(0); // deliberately fatal in assert-enabled builds; the error return below is only reached when asserts are compiled out
            r = DB_BADFORMAT;
            goto died;
        }
    }

    if (result->height > 0) {
        // For height==0 the buffer lives on inside the leaf's mempool.
        toku_free(rc.buf);
    }
    *brtnode = result;
    return 0;

 died:
    // Release whatever was built before the failure.  The counters/flags are
    // only nonzero for the node kind that actually allocated the resource.
    for (j = 0; j < n_fifos_created; j++) {
        toku_fifo_free(&BNC_BUFFER(result, j));
    }
    for (j = 0; j < n_pivots_created; j++) {
        // NOTE(review): assumes kv_pair_malloc() storage is released with toku_free() -- confirm.
        if (result->u.n.childkeys[j]) toku_free((void *)result->u.n.childkeys[j]);
    }
    if (have_child_arrays) {
        if (result->u.n.childkeys)  toku_free(result->u.n.childkeys);
        if (result->u.n.childinfos) toku_free(result->u.n.childinfos);
    }
    if (have_omt) {
        toku_omt_destroy(&result->u.l.buffer);
    }
    if (rc.buf) {
        toku_free(rc.buf);
    }
    toku_free(result);
    return r;
}
2008-07-24 21:31:52 +00:00
/* Running totals collected by sum_item() for every leaf entry it visits. */
struct sum_info {
    unsigned int dsum;   /* sum of (OMT_ITEM_OVERHEAD + leafentry_disksize) over the entries */
    unsigned int msum;   /* sum of leafentry_memsize over the entries */
    unsigned int count;  /* number of entries visited */
    u_int32_t    fp;     /* sum of toku_le_crc over the entries */
};
2013-04-16 23:57:19 -04:00
// Iteration callback handed to toku_omt_iterate(): folds one leaf entry into
// the struct sum_info that vsi points at.  The index argument is unused.
// Always returns 0.
static int
sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) {
    struct sum_info *totals = vsi;
    LEAFENTRY entry = lev;

    totals->count += 1;
    totals->dsum  += OMT_ITEM_OVERHEAD + leafentry_disksize(entry);
    totals->msum  += leafentry_memsize(entry);
    totals->fp    += toku_le_crc(entry);
    return 0;
}
2007-11-19 23:54:17 +00:00
// Consistency check: asserts that the byte counts cached in a node (and, for
// a leaf, its local fingerprint) agree with what is actually stored in it.
// Any mismatch trips an assert.
void toku_verify_counts (BRTNODE node) {
    /*foo*/
    if (node->height != 0) {
        // Nonleaf: the per-child buffer sizes must add up to the node total.
        unsigned int total = 0;
        int child;
        for (child = 0; child < node->u.n.n_children; child++) {
            total += BNC_NBYTESINBUF(node, child);
        }
        // We don't really care if the later buffers have garbage in them.  Valgrind would do a better job noticing if we leave it uninitialized.
        // But for now the code always initializes the later tables so they are 0.
        assert(total == node->u.n.n_bytes_in_buffers);
        return;
    }

    // Leaf: walk the OMT and recompute every cached quantity.
    assert(node->u.l.buffer);
    struct sum_info totals = {0,0,0,0};
    toku_omt_iterate(node->u.l.buffer, sum_item, &totals);
    assert(totals.count == toku_omt_size(node->u.l.buffer));
    assert(totals.dsum == node->u.l.n_bytes_in_buffer);
    assert(totals.msum == node->u.l.buffer_mempool.free_offset - node->u.l.buffer_mempool.frag_size);

    u_int32_t expected_fingerprint = node->rand4fingerprint * totals.fp;
    assert(expected_fingerprint == node->local_fingerprint);
}
2007-11-21 13:07:49 +00:00
int toku_serialize_brt_header_size ( struct brt_header * h ) {
2008-05-22 21:28:00 +00:00
unsigned int size = ( + 8 // "tokudata"
+ 4 // size
+ 4 // version
2013-04-16 23:57:18 -04:00
+ 4 // tree's nodesize
+ 8 // free blocks
+ 8 // unused blocks
+ 4 // n_named_roots
+ 8 // max_blocknum_translated
+ 8 // block_translation_address_on_disk
) ;
2007-07-13 19:37:47 +00:00
if ( h - > n_named_roots < 0 ) {
2008-05-22 21:28:00 +00:00
size + = ( + 8 // diskoff
+ 4 // flags
) ;
2007-07-13 19:37:47 +00:00
} else {
2007-11-21 13:07:49 +00:00
int i ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
2008-05-22 21:28:00 +00:00
size + = ( + 8 // root diskoff
+ 4 // flags
+ 4 // length of null terminated string (including null)
+ 1 + strlen ( h - > names [ i ] ) // null-terminated string
) ;
2007-07-13 19:37:47 +00:00
}
}
2007-11-21 13:07:49 +00:00
return size ;
}
int toku_serialize_brt_header_to_wbuf ( struct wbuf * wbuf , struct brt_header * h ) {
unsigned int size = toku_serialize_brt_header_size ( h ) ; // !!! seems silly to recompute the size when the caller knew it. Do we really need the size?
2008-05-22 21:28:00 +00:00
wbuf_literal_bytes ( wbuf , " tokudata " , 8 ) ;
2007-11-21 13:07:49 +00:00
wbuf_int ( wbuf , size ) ;
2008-05-22 21:28:00 +00:00
wbuf_int ( wbuf , BRT_LAYOUT_VERSION ) ;
2007-11-21 13:07:49 +00:00
wbuf_int ( wbuf , h - > nodesize ) ;
2013-04-16 23:57:18 -04:00
wbuf_BLOCKNUM ( wbuf , h - > free_blocks ) ;
wbuf_BLOCKNUM ( wbuf , h - > unused_blocks ) ;
2007-11-21 13:07:49 +00:00
wbuf_int ( wbuf , h - > n_named_roots ) ;
2013-04-16 23:57:18 -04:00
if ( h - > block_translation_address_on_disk ! = 0 ) {
2013-04-16 23:57:18 -04:00
block_allocator_free_block ( h - > block_allocator , h - > block_translation_address_on_disk ) ;
2013-04-16 23:57:18 -04:00
}
2013-04-16 23:57:18 -04:00
block_allocator_alloc_block ( h - > block_allocator , 4 + 16 * h - > translated_blocknum_limit , & h - > block_translation_address_on_disk ) ;
//printf("%s:%d bta=%lu size=%lu\n", __FILE__, __LINE__, h->block_translation_address_on_disk, 4 + 16*h->translated_blocknum_limit);
2013-04-16 23:57:18 -04:00
wbuf_ulonglong ( wbuf , h - > translated_blocknum_limit ) ;
2013-04-16 23:57:18 -04:00
wbuf_DISKOFF ( wbuf , h - > block_translation_address_on_disk ) ;
2008-05-20 21:24:11 +00:00
if ( h - > n_named_roots > = 0 ) {
2007-11-21 13:07:49 +00:00
int i ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
char * s = h - > names [ i ] ;
unsigned int l = 1 + strlen ( s ) ;
2013-04-16 23:57:18 -04:00
wbuf_BLOCKNUM ( wbuf , h - > roots [ i ] ) ;
2008-05-22 21:28:00 +00:00
wbuf_int ( wbuf , h - > flags_array [ i ] ) ;
2007-11-21 13:07:49 +00:00
wbuf_bytes ( wbuf , s , l ) ;
2007-07-13 19:37:47 +00:00
assert ( l > 0 & & s [ l - 1 ] = = 0 ) ;
}
} else {
2013-04-16 23:57:18 -04:00
wbuf_BLOCKNUM ( wbuf , h - > roots [ 0 ] ) ;
2008-05-22 21:28:00 +00:00
wbuf_int ( wbuf , h - > flags_array [ 0 ] ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-21 13:07:49 +00:00
assert ( wbuf - > ndone < = wbuf - > size ) ;
return 0 ;
}
int toku_serialize_brt_header_to ( int fd , struct brt_header * h ) {
2013-04-16 23:57:18 -04:00
lock_for_pwrite ( ) ;
2007-07-13 19:37:47 +00:00
{
2013-04-16 23:57:18 -04:00
struct wbuf w ;
unsigned int size = toku_serialize_brt_header_size ( h ) ;
wbuf_init ( & w , toku_malloc ( size ) , size ) ;
int r = toku_serialize_brt_header_to_wbuf ( & w , h ) ;
assert ( r = = 0 ) ;
assert ( w . ndone = = size ) ;
2013-04-16 23:57:18 -04:00
ssize_t nwrote = toku_pwrite ( fd , w . buf , w . ndone , 0 ) ;
2007-11-21 13:07:49 +00:00
if ( nwrote < 0 ) perror ( " pwrite " ) ;
assert ( ( size_t ) nwrote = = w . ndone ) ;
2013-04-16 23:57:18 -04:00
toku_free ( w . buf ) ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:18 -04:00
{
struct wbuf w ;
2013-04-16 23:57:18 -04:00
u_int64_t size = 4 + h - > translated_blocknum_limit * 16 ; // 4 for the checksum
2013-04-16 23:57:18 -04:00
//printf("%s:%d writing translation table of size %ld at %ld\n", __FILE__, __LINE__, size, h->block_translation_address_on_disk);
2013-04-16 23:57:18 -04:00
wbuf_init ( & w , toku_malloc ( size ) , size ) ;
u_int64_t i ;
2013-04-16 23:57:18 -04:00
for ( i = 0 ; i < h - > translated_blocknum_limit ; i + + ) {
2013-04-16 23:57:18 -04:00
//printf("%s:%d %ld,%ld\n", __FILE__, __LINE__, h->block_translation[i].diskoff, h->block_translation[i].size);
2013-04-16 23:57:18 -04:00
wbuf_ulonglong ( & w , h - > block_translation [ i ] . diskoff ) ;
wbuf_ulonglong ( & w , h - > block_translation [ i ] . size ) ;
}
2013-04-16 23:57:18 -04:00
u_int32_t checksum = x1764_finish ( & w . checksum ) ;
wbuf_int ( & w , checksum ) ;
2013-04-16 23:57:18 -04:00
ssize_t nwrote = toku_pwrite ( fd , w . buf , size , h - > block_translation_address_on_disk ) ;
2013-04-16 23:57:18 -04:00
assert ( nwrote = = ( ssize_t ) size ) ;
toku_free ( w . buf ) ;
} ;
2013-04-16 23:57:18 -04:00
unlock_for_pwrite ( ) ;
2013-04-16 23:57:18 -04:00
return 0 ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:18 -04:00
// We only deserialize brt header once and then share everything with all the brts.
2013-04-16 23:57:19 -04:00
static int
deserialize_brtheader ( u_int32_t size , int fd , DISKOFF off , struct brt_header * * brth ) {
2008-05-22 21:28:00 +00:00
// We already know the first 8 bytes are "tokudata", and we read in the size.
struct brt_header * MALLOC ( h ) ;
if ( h = = 0 ) return errno ;
int ret = - 1 ;
if ( 0 ) { died0 : toku_free ( h ) ; return ret ; }
struct rbuf rc ;
rc . buf = toku_malloc ( size - 12 ) ; // we can skip the first 12 bytes.
if ( rc . buf = = NULL ) { ret = errno ; if ( 0 ) { died1 : toku_free ( rc . buf ) ; } goto died0 ; }
rc . size = size - 12 ;
if ( rc . size < = 0 ) { ret = EINVAL ; goto died1 ; }
rc . ndone = 0 ;
{
ssize_t r = pread ( fd , rc . buf , size - 12 , off + 12 ) ;
2008-05-27 12:26:08 +00:00
if ( r ! = ( ssize_t ) size - 12 ) { ret = EINVAL ; goto died1 ; }
2008-05-22 21:28:00 +00:00
}
h - > dirty = 0 ;
h - > layout_version = rbuf_int ( & rc ) ;
h - > nodesize = rbuf_int ( & rc ) ;
2013-04-16 23:57:18 -04:00
assert ( h - > layout_version = = BRT_LAYOUT_VERSION_9 ) ;
h - > free_blocks = rbuf_blocknum ( & rc ) ;
h - > unused_blocks = rbuf_blocknum ( & rc ) ;
2008-05-22 21:28:00 +00:00
h - > n_named_roots = rbuf_int ( & rc ) ;
2013-04-16 23:57:18 -04:00
h - > translated_blocknum_limit = rbuf_diskoff ( & rc ) ;
2013-04-16 23:57:18 -04:00
h - > block_translation_size_on_disk = 4 + 16 * h - > translated_blocknum_limit ;
2013-04-16 23:57:18 -04:00
h - > block_translation_address_on_disk = rbuf_diskoff ( & rc ) ;
// Set up the the block translation buffer.
2013-04-16 23:57:25 -04:00
create_block_allocator ( & h - > block_allocator , BLOCK_ALLOCATOR_HEADER_RESERVE , BLOCK_ALLOCATOR_ALIGNMENT ) ;
2013-04-16 23:57:18 -04:00
// printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, h->translated_blocknum_limit, h->block_translation_address_on_disk);
2013-04-16 23:57:18 -04:00
if ( h - > block_translation_address_on_disk = = 0 ) {
2013-04-16 23:57:18 -04:00
h - > block_translation = 0 ;
} else {
2013-04-16 23:57:18 -04:00
lock_for_pwrite ( ) ;
2013-04-16 23:57:18 -04:00
block_allocator_alloc_block_at ( h - > block_allocator , h - > block_translation_size_on_disk , h - > block_translation_address_on_disk ) ;
2013-04-16 23:57:18 -04:00
XMALLOC_N ( h - > translated_blocknum_limit , h - > block_translation ) ;
2013-04-16 23:57:18 -04:00
unsigned char * XMALLOC_N ( h - > block_translation_size_on_disk , tbuf ) ;
{
ssize_t r = pread ( fd , tbuf , h - > block_translation_size_on_disk , h - > block_translation_address_on_disk ) ;
assert ( r = = ( ssize_t ) h - > block_translation_size_on_disk ) ;
2013-04-16 23:57:18 -04:00
}
2013-04-16 23:57:18 -04:00
{
// check the checksum
u_int32_t x1764 = x1764_memory ( tbuf , h - > block_translation_size_on_disk - 4 ) ;
2013-04-16 23:57:18 -04:00
u_int64_t offset = h - > block_translation_size_on_disk - 4 ;
2013-04-16 23:57:18 -04:00
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, h->block_translation_address_on_disk, offset, h->block_translation_size_on_disk);
2013-04-16 23:57:18 -04:00
u_int32_t stored_x1764 = ntohl ( * ( int * ) ( tbuf + offset ) ) ;
2013-04-16 23:57:18 -04:00
assert ( x1764 = = stored_x1764 ) ;
}
// now read all that data.
u_int64_t i ;
struct rbuf rt ;
rt . buf = tbuf ;
rt . ndone = 0 ;
rt . size = h - > block_translation_size_on_disk - 4 ;
assert ( rt . size > 0 ) ;
2013-04-16 23:57:18 -04:00
for ( i = 0 ; i < h - > translated_blocknum_limit ; i + + ) {
2013-04-16 23:57:18 -04:00
h - > block_translation [ i ] . diskoff = rbuf_diskoff ( & rt ) ;
h - > block_translation [ i ] . size = rbuf_diskoff ( & rt ) ;
2013-04-16 23:57:18 -04:00
if ( h - > block_translation [ i ] . size > 0 )
block_allocator_alloc_block_at ( h - > block_allocator , h - > block_translation [ i ] . size , h - > block_translation [ i ] . diskoff ) ;
2013-04-16 23:57:18 -04:00
//printf("%s:%d %ld %ld\n", __FILE__, __LINE__, h->block_translation[i].diskoff, h->block_translation[i].size);
2013-04-16 23:57:18 -04:00
}
2013-04-16 23:57:18 -04:00
unlock_for_pwrite ( ) ;
2013-04-16 23:57:18 -04:00
toku_free ( tbuf ) ;
2013-04-16 23:57:18 -04:00
}
2008-05-22 21:28:00 +00:00
if ( h - > n_named_roots > = 0 ) {
int i ;
int n_to_malloc = ( h - > n_named_roots = = 0 ) ? 1 : h - > n_named_roots ;
MALLOC_N ( n_to_malloc , h - > flags_array ) ; if ( h - > flags_array = = 0 ) { ret = errno ; if ( 0 ) { died2 : free ( h - > flags_array ) ; } goto died1 ; }
MALLOC_N ( n_to_malloc , h - > roots ) ; if ( h - > roots = = 0 ) { ret = errno ; if ( 0 ) { died3 : if ( h - > n_named_roots > = 0 ) free ( h - > roots ) ; } goto died2 ; }
2008-06-18 21:38:01 +00:00
MALLOC_N ( n_to_malloc , h - > root_hashes ) ; if ( h - > root_hashes = = 0 ) { ret = errno ; if ( 0 ) { died4 : if ( h - > n_named_roots > = 0 ) free ( h - > root_hashes ) ; } goto died3 ; }
MALLOC_N ( n_to_malloc , h - > names ) ; if ( h - > names = = 0 ) { ret = errno ; if ( 0 ) { died5 : if ( h - > n_named_roots > = 0 ) free ( h - > names ) ; } goto died4 ; }
2008-05-22 21:28:00 +00:00
for ( i = 0 ; i < h - > n_named_roots ; i + + ) {
2008-06-18 21:38:01 +00:00
h - > root_hashes [ i ] . valid = FALSE ;
2013-04-16 23:57:18 -04:00
h - > roots [ i ] = rbuf_blocknum ( & rc ) ;
2008-05-22 21:28:00 +00:00
h - > flags_array [ i ] = rbuf_int ( & rc ) ;
bytevec nameptr ;
unsigned int len ;
rbuf_bytes ( & rc , & nameptr , & len ) ;
assert ( strlen ( nameptr ) + 1 = = len ) ;
h - > names [ i ] = toku_memdup ( nameptr , len ) ;
assert ( len = = 0 | | h - > names [ i ] ! = NULL ) ; // make sure the malloc worked. Give up if this malloc failed...
}
} else {
int n_to_malloc = 1 ;
MALLOC_N ( n_to_malloc , h - > flags_array ) ; if ( h - > flags_array = = 0 ) { ret = errno ; goto died1 ; }
MALLOC_N ( n_to_malloc , h - > roots ) ; if ( h - > roots = = 0 ) { ret = errno ; goto died2 ; }
2008-06-18 21:38:01 +00:00
MALLOC_N ( n_to_malloc , h - > root_hashes ) ; if ( h - > root_hashes = = 0 ) { ret = errno ; goto died3 ; }
2008-05-22 21:28:00 +00:00
h - > names = 0 ;
2013-04-16 23:57:18 -04:00
h - > roots [ 0 ] = rbuf_blocknum ( & rc ) ;
2008-06-18 21:38:01 +00:00
h - > root_hashes [ 0 ] . valid = FALSE ;
2008-05-22 21:28:00 +00:00
h - > flags_array [ 0 ] = rbuf_int ( & rc ) ;
}
2008-06-18 21:38:01 +00:00
if ( rc . ndone ! = rc . size ) { ret = EINVAL ; goto died5 ; }
2008-05-22 21:28:00 +00:00
toku_free ( rc . buf ) ;
2013-04-16 23:57:18 -04:00
{
int r ;
2013-04-16 23:57:18 -04:00
if ( ( r = deserialize_fifo_at ( fd , block_allocator_allocated_limit ( h - > block_allocator ) , & h - > fifo ) ) ) return r ;
2013-04-16 23:57:18 -04:00
}
2008-05-22 21:28:00 +00:00
* brth = h ;
return 0 ;
}
2013-04-16 23:57:18 -04:00
// Reads the brt header stored at the start of fd (blocknum must be 0).
// Loads the 12-byte prefix (8-byte "tokudata" magic + 4-byte big-endian
// size), then hands the rest to deserialize_brtheader().
// Returns -1 if the file is empty, errno if the read fails, EINVAL on a short
// read, otherwise whatever deserialize_brtheader() returns.  A wrong magic
// trips an assert.
int toku_deserialize_brtheader_from (int fd, BLOCKNUM blocknum, struct brt_header **brth) {
    //printf("%s:%d calling MALLOC\n", __FILE__, __LINE__);
    assert(blocknum.b == 0);
    DISKOFF offset = 0;
    //printf("%s:%d malloced %p\n", __FILE__, __LINE__, h);
    char magic[12];
    ssize_t r = pread(fd, magic, 12, offset);
    if (r == 0) return -1;
    if (r < 0) return errno;
    if (r != 12) return EINVAL;
    assert(memcmp(magic, "tokudata", 8) == 0);
    // It's version 7 or later, and the magic looks OK
    // Copy the size out with memcpy: reading it through an int* cast of a
    // char array violates strict aliasing and is not guaranteed to be aligned.
    u_int32_t size_be;
    memcpy(&size_be, &magic[8], sizeof size_be);
    return deserialize_brtheader(ntohl(size_be), fd, offset, brth);
}
2007-12-06 14:20:47 +00:00
// Length of the pivot key for pk in this brt: key plus value when the brt has
// TOKU_DB_DUPSORT set, key only otherwise.
unsigned int toku_brt_pivot_key_len (BRT brt, struct kv_pair *pk) {
    unsigned int len = kv_pair_keylen(pk);
    if (brt->flags & TOKU_DB_DUPSORT) {
        len += kv_pair_vallen(pk);
    }
    return len;
}
// Same as toku_brt_pivot_key_len(), but takes the TOKU_DB_DUPSORT flag from
// the node: key plus value when set, key only otherwise.
unsigned int toku_brtnode_pivot_key_len (BRTNODE node, struct kv_pair *pk) {
    unsigned int len = kv_pair_keylen(pk);
    if (node->flags & TOKU_DB_DUPSORT) {
        len += kv_pair_vallen(pk);
    }
    return len;
}
2008-04-09 02:45:27 +00:00
// To serialize the fifo, we just write it all at the end of the file.
// For now, just do all the writes as separate system calls.  This function is hardly ever called, and
// we might not be able to allocate a large enough buffer to hold everything,
// and it would be more complex to batch up several writes.
//
// Layout written at freeoff: a 4-byte entry count, then for every entry a
// 1-byte type, the transaction id, and the length-prefixed key and value.
// Returns 0 on success or an errno value if a write fails.  The pwrite lock
// is held for the duration and is released on every return path.
int toku_serialize_fifo_at (int fd, off_t freeoff, FIFO fifo) {
    //printf("%s:%d Serializing fifo at %" PRId64 " (count=%d)\n", __FILE__, __LINE__, freeoff, toku_fifo_n_entries(fifo));
    lock_for_pwrite();
    {
        enum { size = 4 };
        char buf[size];
        struct wbuf w;
        wbuf_init(&w, buf, size);
        wbuf_int(&w, toku_fifo_n_entries(fifo));
        ssize_t r = toku_pwrite(fd, w.buf, size, freeoff);
        if (r != size) {
            // A short write does not set errno, so report EIO instead of a
            // stale (possibly zero) value.  Capture the code before unlocking
            // and never return with the mutex still held.
            int e = (r < 0) ? errno : EIO;
            unlock_for_pwrite();
            return e;
        }
        freeoff += size;
    }
    FIFO_ITERATE(fifo, key, keylen, val, vallen, type, xid,
                 {
                     size_t size = keylen+vallen+1+8+4+4;
                     char *MALLOC_N(size, buf);
                     assert(buf != 0);
                     struct wbuf w;
                     wbuf_init(&w, buf, size);
                     assert(type >= 0 && type < 256);
                     wbuf_char(&w, (unsigned char)type);
                     wbuf_TXNID(&w, xid);
                     wbuf_bytes(&w, key, keylen);
                     //printf("%s:%d Writing %d bytes: %s\n", __FILE__, __LINE__, vallen, (char*)val);
                     wbuf_bytes(&w, val, vallen);
                     assert(w.ndone == size);
                     ssize_t r = toku_pwrite(fd, w.buf, (size_t)size, freeoff);
                     if (r < 0) {
                         int e = errno;   // save before the calls below can clobber it
                         toku_free(buf);  // don't leak the entry buffer on failure
                         unlock_for_pwrite();
                         return e;
                     }
                     assert(r == (ssize_t)size);
                     freeoff += size;
                     toku_free(buf);
                 });
    unlock_for_pwrite();
    return 0;
}
2013-04-16 23:57:19 -04:00
// Reads a 4-byte network-order (big-endian) integer from fd at offset *at.
// On success stores the host-order value in *result, advances *at by 4 and
// returns 0.  Returns errno if pread fails; a short read trips an assert.
static int
read_int (int fd, off_t *at, u_int32_t *result) {
    int raw;
    const ssize_t nread = pread(fd, &raw, 4, *at);
    if (nread < 0) {
        return errno;
    }
    assert(nread == 4);
    *result = ntohl(raw);
    *at += 4;
    return 0;
}
2013-04-16 23:57:19 -04:00
// Reads a single byte from fd at offset *at into *result.
// On success advances *at by 1 and returns 0.  Returns errno if pread fails;
// a short read trips an assert.
static int
read_char (int fd, off_t *at, char *result) {
    const ssize_t nread = pread(fd, result, 1, *at);
    if (nread < 0) {
        return errno;
    }
    assert(nread == 1);
    *at += 1;
    return 0;
}
2013-04-16 23:57:19 -04:00
// Reads a 64-bit value stored as two consecutive big-endian 32-bit words
// (high word first) from fd at offset *at, using read_int() for each half.
// On success stores the value in *result (with *at advanced by 8) and returns
// 0; otherwise returns the error from read_int().
static int
read_u_int64_t (int fd, off_t *at, u_int64_t *result) {
    u_int32_t hi = 0, lo = 0;
    int r = read_int(fd, at, &hi);
    if (r != 0) return r;
    r = read_int(fd, at, &lo);
    if (r != 0) return r;
    *result = (((u_int64_t)hi) << 32) | lo;
    return 0;
}
2013-04-16 23:57:19 -04:00
// Reads len bytes from fd at offset *at into a newly toku_malloc'd buffer.
// On success stores the buffer in *data (the caller frees it with toku_free),
// advances *at by len and returns 0.  Returns errno if the allocation or the
// pread fails, in which case nothing is handed to the caller.  A short read
// trips an assert.
static int
read_nbytes (int fd, off_t *at, char **data, u_int32_t len) {
    char *result = toku_malloc(len);
    if (result == 0) return errno;
    ssize_t r = pread(fd, result, len, *at);
    //printf("%s:%d read %d bytes at %" PRId64 ", which are %s\n", __FILE__, __LINE__, len, *at, result);
    if (r < 0) {
        int e = errno;       // save it: freeing may overwrite errno
        toku_free(result);   // don't leak the buffer on a failed read
        return e;
    }
    assert(r == (ssize_t)len);
    (*at) += len;
    *data = result;
    return 0;
}
2013-04-16 23:57:18 -04:00
// Rebuilds a FIFO from the image written by toku_serialize_fifo_at(): a
// 4-byte entry count at offset `at`, followed per entry by a 1-byte type, an
// 8-byte transaction id, and the length-prefixed key and value.
// On success stores the new FIFO in *fifo and returns 0.  On failure returns
// the error from the failing read or enqueue and leaves *fifo untouched.
// NOTE(review): the partially built FIFO is not released on the error
// returns because no FIFO destructor is visible from this part of the file --
// confirm and free `result` there if one exists.
static int deserialize_fifo_at (int fd, off_t at, FIFO *fifo) {
    FIFO result;
    int r = toku_fifo_create(&result);
    if (r) return r;
    u_int32_t count = 0;
    if ((r = read_int(fd, &at, &count))) return r;
    u_int32_t i;
    for (i = 0; i < count; i++) {
        char type;
        TXNID xid;
        u_int32_t keylen = 0, vallen = 0;
        char *key = 0, *val = 0;
        // Each step runs only if everything before it succeeded, so key/val
        // can be released in one place whether or not the entry was enqueued.
        r = read_char(fd, &at, &type);
        if (r == 0) r = read_u_int64_t(fd, &at, &xid);
        if (r == 0) r = read_int(fd, &at, &keylen);
        if (r == 0) r = read_nbytes(fd, &at, &key, keylen);
        if (r == 0) r = read_int(fd, &at, &vallen);
        if (r == 0) r = read_nbytes(fd, &at, &val, vallen);
        //printf("%s:%d read %d byte key, key=%s\n   dlen=%d data=%s\n", __FILE__, __LINE__, keylen, key, vallen, val);
        // The serializer writes the type as an unsigned byte in 0..255; go
        // through unsigned char so values >= 128 are not sign-extended.
        if (r == 0) r = toku_fifo_enq(result, key, keylen, val, vallen, (unsigned char)type, xid);
        if (key) toku_free(key);
        if (val) toku_free(val);
        if (r != 0) return r;
    }
    *fifo = result;
    //printf("%s:%d *fifo=%p\n", __FILE__, __LINE__, result);
    return 0;
}