2007-11-29 14:18:54 +00:00
/* -*- mode: C; c-basic-offset: 4 -*- */
2008-01-24 15:10:32 +00:00
# ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
2007-11-29 14:18:54 +00:00
2013-04-16 23:57:20 -04:00
# include "includes.h"
2007-11-14 17:58:38 +00:00
2013-04-16 23:57:18 -04:00
#if 0
2013-04-16 23:57:18 -04:00
// Swap the two 32-bit halves of V as they are laid out in memory:
// the first 32-bit word in memory becomes the high half of the result,
// and the second word becomes the low half.
static u_int64_t ntohll (u_int64_t v) {
    union {
        u_int64_t whole;
        u_int32_t half[2];
    } split;
    split.whole = v;
    u_int64_t upper = (u_int64_t) split.half[0];
    return (upper << 32) + split.half[1];
}
2013-04-16 23:57:18 -04:00
# endif
2013-04-16 23:57:18 -04:00
2013-04-16 23:57:18 -04:00
// Return the smaller of the two unsigned 64-bit values A and B.
static u_int64_t umin64 (u_int64_t a, u_int64_t b) {
    return (a < b) ? a : b;
}
// Round A up to the nearest multiple of B (B must be nonzero).
static inline u_int64_t alignup (u_int64_t a, u_int64_t b) {
    u_int64_t n_chunks = (a + b - 1) / b;  // number of B-sized chunks needed to cover A
    return n_chunks * b;
}
// This mutex protects pwrite from running in parallel, and also protects modifications to the block allocator.
2013-04-16 23:57:27 -04:00
static toku_pthread_mutex_t pwrite_mutex = TOKU_PTHREAD_MUTEX_INITIALIZER ;
2013-04-16 23:57:18 -04:00
static int pwrite_is_locked = 0 ;
2013-04-16 23:57:30 -04:00
// Initialize the pwrite mutex.  Asserts that the initialization succeeds.
void toku_pwrite_lock_init (void) {
    int init_rc = toku_pthread_mutex_init(&pwrite_mutex, NULL);
    assert(init_rc == 0);
}
// Destroy the pwrite mutex.  Asserts that the destruction succeeds.
void toku_pwrite_lock_destroy (void) {
    int destroy_rc = toku_pthread_mutex_destroy(&pwrite_mutex);
    assert(destroy_rc == 0);
}
2013-04-16 23:57:18 -04:00
static inline void
lock_for_pwrite ( void ) {
2013-04-16 23:57:41 -04:00
// Locks the pwrite_mutex.
2013-04-16 23:57:27 -04:00
int r = toku_pthread_mutex_lock ( & pwrite_mutex ) ;
2013-04-16 23:57:18 -04:00
assert ( r = = 0 ) ;
pwrite_is_locked = 1 ;
}
static inline void
unlock_for_pwrite ( void ) {
pwrite_is_locked = 0 ;
2013-04-16 23:57:27 -04:00
int r = toku_pthread_mutex_unlock ( & pwrite_mutex ) ;
2013-04-16 23:57:18 -04:00
assert ( r = = 0 ) ;
}
2013-04-16 23:57:47 -04:00
// Granularity (16MiB) used when growing or shrinking a file.
enum { FILE_CHANGE_INCREMENT = 16 * 1024 * 1024 };
void
toku_maybe_truncate_cachefile (CACHEFILE cf, u_int64_t size_used)
// Effect: If file size >= SIZE_USED+32MiB, reduce the file size to SIZE_USED
//  rounded up to a multiple of 32MiB.
//  (32 instead of 16.. hysteresis).
// Returns nothing.  Failures to stat or truncate the file trip an assert,
//  except that a failed stat on a /dev/null cachefile is silently ignored.
{
    // Amount of over-allocation that triggers a truncate, and also the
    // alignment of the resulting file size.
    const u_int64_t slack = 2 * FILE_CHANGE_INCREMENT;
    int64_t file_size;
    int r;

    //Check file size before taking pwrite lock to reduce likelihood of taking
    //the lock needlessly.
    r = toku_os_get_file_size(toku_cachefile_fd(cf), &file_size);
    if (r != 0 && toku_cachefile_is_dev_null(cf)) return;
    assert(r == 0);
    assert(file_size >= 0);
    if ((u_int64_t) file_size < size_used + slack) return;  // not overallocated by at least 32M

    lock_for_pwrite();
    //Check file size again after taking the lock to avoid race conditions.
    r = toku_os_get_file_size(toku_cachefile_fd(cf), &file_size);
    if (r != 0 && toku_cachefile_is_dev_null(cf)) goto cleanup;
    assert(r == 0);
    assert(file_size >= 0);
    if ((u_int64_t) file_size >= size_used + slack) {
        // Truncate to the aligned size_used.  Aligning file_size instead could never
        // yield something smaller than file_size, so the assert below would always fire.
        toku_off_t new_size = alignup(size_used, slack);
        assert(new_size < file_size);
        r = toku_cachefile_truncate(cf, new_size);
        assert(r == 0);
    }
cleanup:
    unlock_for_pwrite();
}
int
maybe_preallocate_in_file (int fd, u_int64_t size)
// Effect: If file size is less than SIZE, extend the file by writing
//  min(SIZE, 16MiB) zero bytes starting at the current file size rounded up to 4096.
// Return 0 on success, otherwise an error number.
// A failure to stat the file, or a short write, trips an assert.
{
    int64_t file_size;
    {
        int r = toku_os_get_file_size(fd, &file_size);
        assert(r == 0);
    }
    assert(file_size >= 0);
    if ((u_int64_t) file_size < size) {
        const int N = umin64(size, FILE_CHANGE_INCREMENT); // Number of zero bytes to append: SIZE or 16MiB, whichever is less.
        char *MALLOC_N(N, wbuf);
        if (wbuf == NULL) {
            // Report allocation failure to the caller rather than handing NULL to memset.
            int e = errno;
            return e ? e : ENOMEM;
        }
        memset(wbuf, 0, N);
        toku_off_t start_write = alignup(file_size, 4096);
        assert(start_write >= file_size);
        ssize_t r = toku_os_pwrite(fd, wbuf, N, start_write);
        if (r == -1) {
            int e = errno; // must save errno before calling toku_free.
            toku_free(wbuf);
            return e;
        }
        toku_free(wbuf);
        assert(r == N); // We don't handle short writes properly, which is the case where 0<= r < N.
    }
    return 0;
}
2013-04-16 23:57:38 -04:00
static int
toku_pwrite_extend (int fd, const void *buf, size_t count, toku_off_t offset, ssize_t *num_wrote)
// Requires: the pwrite lock is held (asserted via pwrite_is_locked).
// Effect: Make sure the file is big enough for OFFSET+COUNT (see maybe_preallocate_in_file),
//  then write COUNT bytes from BUF at OFFSET.
// Returns 0 on success (and stores the number of bytes written in *num_wrote).
// Returns a nonzero error number on failure (and stores 0 in *num_wrote).
{
    assert(pwrite_is_locked);

    int prealloc_err = maybe_preallocate_in_file(fd, offset + count);
    if (prealloc_err != 0) {
        *num_wrote = 0;
        return prealloc_err;
    }

    ssize_t n_written = toku_os_pwrite(fd, buf, count, offset);
    if (n_written < 0) {
        int write_err = errno;
        *num_wrote = 0;
        return write_err;
    }
    *num_wrote = n_written;
    return 0;
}
2013-04-16 23:57:46 -04:00
// Fixed number of bytes in every serialized node, excluding the compression header.
// The labels follow the order in which toku_serialize_brtnode_to writes the fields.
static const int brtnode_header_overhead =
    ( 8    // magic "tokunode" or "tokuleaf"
    + 4    // layout version
    + 8    // lsn
    + 4    // nodesize
    + 4    // flags
    + 4    // height
    + 4    // random for fingerprint
    + 4    // localfingerprint
    + 4 ); // checksum at the end
2007-11-14 17:58:38 +00:00
2013-04-16 23:57:19 -04:00
// OMT iteration callback: add the on-disk size of one leaf entry (plus the
// per-item OMT overhead) to the unsigned int counter that VP points to.
// Always returns 0.
static int
addupsize (OMTVALUE lev, u_int32_t UU(idx), void *vp)
{
    unsigned int *running_total = vp;
    LEAFENTRY entry = lev;
    *running_total += OMT_ITEM_OVERHEAD + leafentry_disksize(entry);
    return 0;
}
2008-04-09 19:11:15 +00:00
// Compute the serialized (uncompressed) size of NODE the slow way: by walking
// every buffered message (nonleaf) or every leaf entry (leaf), and cross-check
// the totals against the counters cached in the node.
static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
    unsigned int size = brtnode_header_overhead;

    if (node->height <= 0) {
        // Leaf: sum up the leaf entries.
        unsigned int leaf_bytes = 0;
        toku_omt_iterate(node->u.l.buffer, addupsize, &leaf_bytes);
        assert(leaf_bytes <= node->u.l.n_bytes_in_buffer);
        leaf_bytes += 4;     /* add n entries in buffer table. */
        leaf_bytes += 3 * 8; /* add the three leaf stats, but no exact bit. */
        return size + leaf_bytes;
    }

    // Nonleaf.
    unsigned int buffer_bytes = 0;   // bytes taken by the messages in the child buffers
    unsigned int pivot_bytes = 0;    // bytes taken by the pivot keys
    int i;

    size += 4; /* n_children */
    size += 4; /* subtree fingerprint. */
    size += 4 * (node->u.n.n_children - 1); /* key lengths*/
    if (node->flags & TOKU_DB_DUPSORT) {
        size += 4 * (node->u.n.n_children - 1);
    }
    for (i = 0; i < node->u.n.n_children - 1; i++) {
        pivot_bytes += toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
    }
    /* For each child, a child offset, a count for the number of hash table entries,
       the subtree fingerprint, and 3*8 for the subtree estimates and 1 for the
       exact bit for the estimates. */
    size += (8 + 4 + 4 + 1 + 3 * 8) * (node->u.n.n_children);

    int n_buffers = node->u.n.n_children;
    assert(0 <= n_buffers && n_buffers < TREE_FANOUT + 1);
    for (i = 0; i < n_buffers; i++) {
        FIFO_ITERATE(BNC_BUFFER(node, i),
                     key __attribute__((__unused__)), keylen,
                     data __attribute__((__unused__)), datalen,
                     type __attribute__((__unused__)), xid __attribute__((__unused__)),
                     (buffer_bytes += BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + keylen + datalen));
    }
    assert(buffer_bytes == node->u.n.n_bytes_in_buffers);
    assert(pivot_bytes == node->u.n.totalchildkeylens);
    return size + buffer_bytes + pivot_bytes;
}
2013-04-16 23:57:46 -04:00
// This is the size of the uncompressed data, not including the compression headers
2007-11-19 23:54:17 +00:00
unsigned int toku_serialize_brtnode_size ( BRTNODE node ) {
2007-11-14 17:58:38 +00:00
unsigned int result = brtnode_header_overhead ;
2013-04-16 23:57:27 -04:00
assert ( sizeof ( toku_off_t ) = = 8 ) ;
2007-07-13 19:37:47 +00:00
if ( node - > height > 0 ) {
2007-11-14 17:58:38 +00:00
result + = 4 ; /* subtree fingerpirnt */
2013-04-16 23:57:24 -04:00
result + = 4 ; /* n_children */
2008-01-08 21:43:11 +00:00
result + = 4 * ( node - > u . n . n_children - 1 ) ; /* key lengths*/
2007-11-27 18:16:45 +00:00
if ( node - > flags & TOKU_DB_DUPSORT ) result + = 4 * ( node - > u . n . n_children - 1 ) ; /* data lengths */
2013-04-16 23:57:24 -04:00
assert ( node - > u . n . totalchildkeylens < ( 1 < < 30 ) ) ;
2007-07-13 19:37:47 +00:00
result + = node - > u . n . totalchildkeylens ; /* the lengths of the pivot keys, without their key lengths. */
2013-04-16 23:57:47 -04:00
result + = ( 8 + 4 + 4 + 1 + 3 * 8 ) * ( node - > u . n . n_children ) ; /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and one for the exact bit. */
2008-01-11 14:03:33 +00:00
result + = node - > u . n . n_bytes_in_buffers ;
2007-07-13 19:37:47 +00:00
} else {
2008-04-22 20:39:50 +00:00
result + = 4 ; /* n_entries in buffer table. */
2013-04-16 23:57:47 -04:00
result + = 3 * 8 ; /* the three leaf stats. */
2007-07-13 19:37:47 +00:00
result + = node - > u . l . n_bytes_in_buffer ;
2007-11-29 15:34:49 +00:00
if ( toku_memory_check ) {
2007-11-19 23:54:17 +00:00
unsigned int slowresult = toku_serialize_brtnode_size_slow ( node ) ;
2013-04-16 23:57:19 -04:00
if ( result ! = slowresult ) printf ( " %s:%d result=%u slowresult=%u \n " , __FILE__ , __LINE__ , result , slowresult ) ;
2007-07-13 19:37:47 +00:00
assert ( result = = slowresult ) ;
}
}
return result ;
}
2013-04-16 23:57:19 -04:00
// OMT iteration callback: append one leaf entry to the struct wbuf that V
// points to.  Always returns 0.
static int
wbufwriteleafentry (OMTVALUE lev, u_int32_t UU(idx), void *v)
{
    struct wbuf *out = v;
    LEAFENTRY entry = lev;
    wbuf_LEAFENTRY(out, entry);
    return 0;
}
2013-04-16 23:57:20 -04:00
// Number of bytes at the start of a serialized block that are stored uncompressed.
enum {
    uncompressed_magic_len = 8    // tokuleaf or tokunode
                           + 4    // version
                           + 8    // lsn
};
// Byte offsets of the fields inside the uncompressed part of the block header.
enum {
    uncompressed_magic_offset   = 0,
    uncompressed_version_offset = uncompressed_magic_offset + 8,    // after the 8-byte magic
    uncompressed_lsn_offset     = uncompressed_version_offset + 4,  // after the 4-byte version
};
// One entry of the compression header: the sizes of a single sub block.
struct sub_block_sizes {
    u_int32_t compressed_size;     // bytes the sub block occupies once compressed
    u_int32_t uncompressed_size;   // bytes the sub block occupies before compression
};
// Round N up to a multiple of ALIGNMENT.  The bit trick used here is only
// correct when ALIGNMENT is a power of two.
static inline int roundup2 (int n, int alignment) {
    const int low_bits = alignment - 1;
    return (n + low_bits) & ~low_bits;
}
// Choose the number of sub blocks such that the sub block size
// is around 1 meg, with MAXN as an upper bound on the number of sub blocks.
// Fills in SIZES[] (uncompressed size, and compressBound() of it as the initial
// compressed size) and returns how many entries were filled in.
static int get_sub_block_sizes (int totalsize, int maxn, struct sub_block_sizes sizes[]) {
    const int meg = 1024 * 1024;
    const int alignment = 256;
    int n = totalsize / meg;
    int subsize;

    if (n == 0) {
        // Less than a meg: a single sub block holds everything.
        n = 1;
        subsize = totalsize;
    } else {
        n = (n > maxn) ? maxn : n;
        // Add sub blocks while they are still noticeably bigger than a meg.
        for (subsize = roundup2(totalsize / n, alignment);
             n < maxn && subsize >= meg + meg / 8;
             subsize = roundup2(totalsize / n, alignment)) {
            n++;
        }
    }

    // generate the sub block sizes: n-1 equally sized ones, then the remainder
    int filled = 0;
    int remaining = totalsize;
    while (filled < n - 1) {
        sizes[filled].uncompressed_size = subsize;
        sizes[filled].compressed_size = compressBound(subsize);
        remaining -= subsize;
        filled++;
    }
    if (filled == 0 || remaining > 0) {
        sizes[filled].uncompressed_size = remaining;
        sizes[filled].compressed_size = compressBound(remaining);
        filled++;
    }
    return filled;
}
// Size in bytes of the compression header for N sub blocks: one
// struct sub_block_sizes per sub block, preceded (from layout version 10 on)
// by a 32-bit sub block count.
static size_t get_compression_header_size (int layout_version, int n) {
    const size_t pairs_len = n * sizeof (struct sub_block_sizes);
    if (layout_version >= BRT_LAYOUT_VERSION_10) {
        return sizeof (u_int32_t) + pairs_len;
    }
    return pairs_len;
}
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:46 -04:00
// Add up the compressed_size fields of the first N entries of SIZES.
static size_t get_sum_compressed_size (int n, struct sub_block_sizes sizes[]) {
    size_t total = 0;
    int k = 0;
    while (k < n) {
        total += sizes[k].compressed_size;
        k++;
    }
    return total;
}
// Add up the uncompressed_size fields of the first N entries of SIZES.
static size_t get_sum_uncompressed_size (int n, struct sub_block_sizes sizes[]) {
    size_t total = 0;
    int k = 0;
    while (k < n) {
        total += sizes[k].uncompressed_size;
        k++;
    }
    return total;
}
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:38 -04:00
// Does nothing with its argument; lets a caller mention an int that is
// otherwise unused in the current build configuration.
static inline void
ignore_int (int UU(ignore_me))
{
}
2013-04-16 23:57:47 -04:00
// Effect: Serialize NODE, compress it, and write it to FD as block BLOCKNUM.
//  1. Serialize the node into a malloc'd buffer of toku_serialize_brtnode_size(node) bytes:
//     magic, layout version, lsn, nodesize, flags, height, rand4fingerprint, local fingerprint,
//     then the nonleaf-specific or leaf-specific body, then a checksum
//     (when CRC_ATEND or CRC_INCR is defined).
//  2. Leave the first uncompressed_magic_len bytes uncompressed, split the rest into sub blocks
//     (at most 4; exactly 1 for layout versions before BRT_LAYOUT_VERSION_10), and compress each
//     sub block with compress2().
//  3. Fill in the compression header with the real compressed sizes.
//  4. Get a disk offset from toku_blocknum_realloc_on_disk() and write the block while holding
//     the pwrite lock.
// N_WORKITEMS and N_THREADS only influence the compression level when ADAPTIVE_COMPRESSION is defined.
// Returns the result of toku_pwrite_extend(): 0 on success, otherwise an error number.
// Both temporary buffers are freed before returning.
int toku_serialize_brtnode_to ( int fd , BLOCKNUM blocknum , BRTNODE node , struct brt_header * h , int n_workitems , int n_threads , BOOL for_checkpoint ) {
2007-08-21 23:32:17 +00:00
struct wbuf w ;
2007-07-13 19:37:47 +00:00
int i ;
2013-04-16 23:57:46 -04:00
// serialize the node into buf
unsigned int calculated_size = toku_serialize_brtnode_size ( node ) ;
2013-04-16 23:57:23 -04:00
//printf("%s:%d serializing %" PRIu64 " size=%d\n", __FILE__, __LINE__, blocknum.b, calculated_size);
2008-04-17 03:11:55 +00:00
//assert(calculated_size<=size);
2007-10-16 21:02:53 +00:00
//char buf[size];
2013-04-16 23:57:16 -04:00
char * MALLOC_N ( calculated_size , buf ) ;
2008-04-07 01:30:25 +00:00
//toku_verify_counts(node);
2008-04-17 03:11:55 +00:00
//assert(size>0);
2007-07-13 19:37:47 +00:00
//printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]);
2013-04-16 23:57:23 -04:00
wbuf_init ( & w , buf , calculated_size ) ;
2007-11-14 17:58:38 +00:00
// magic: "toku" followed by "leaf" or "node"
wbuf_literal_bytes ( & w , " toku " , 4 ) ;
if ( node - > height = = 0 ) wbuf_literal_bytes ( & w , " leaf " , 4 ) ;
else wbuf_literal_bytes ( & w , " node " , 4 ) ;
2013-04-16 23:57:46 -04:00
assert ( node - > layout_version = = BRT_LAYOUT_VERSION_9 | | node - > layout_version = = BRT_LAYOUT_VERSION ) ;
wbuf_int ( & w , node - > layout_version ) ;
2007-11-24 03:50:28 +00:00
wbuf_ulonglong ( & w , node - > log_lsn . lsn ) ;
2007-11-14 17:58:38 +00:00
//printf("%s:%d %lld.calculated_size=%d\n", __FILE__, __LINE__, off, calculated_size);
2013-04-16 23:57:16 -04:00
// everything written from here on ends up in the compressed part of the block
wbuf_uint ( & w , node - > nodesize ) ;
2008-04-02 23:40:36 +00:00
wbuf_uint ( & w , node - > flags ) ;
2008-04-04 18:22:01 +00:00
wbuf_int ( & w , node - > height ) ;
2007-11-14 17:58:38 +00:00
//printf("%s:%d %lld rand=%08x sum=%08x height=%d\n", __FILE__, __LINE__, node->thisnodename, node->rand4fingerprint, node->subtree_fingerprint, node->height);
2008-04-04 18:22:01 +00:00
wbuf_uint ( & w , node - > rand4fingerprint ) ;
wbuf_uint ( & w , node - > local_fingerprint ) ;
2008-04-02 23:40:36 +00:00
// printf("%s:%d wrote %08x for node %lld\n", __FILE__, __LINE__, node->local_fingerprint, (long long)node->thisnodename);
2007-11-14 17:58:38 +00:00
//printf("%s:%d local_fingerprint=%8x\n", __FILE__, __LINE__, node->local_fingerprint);
2007-07-13 19:37:47 +00:00
//printf("%s:%d w.ndone=%d n_children=%d\n", __FILE__, __LINE__, w.ndone, node->n_children);
2007-11-14 17:58:38 +00:00
if ( node - > height > 0 ) {
2008-01-29 21:43:08 +00:00
assert ( node - > u . n . n_children > 0 ) ;
2007-11-14 17:58:38 +00:00
// The subtree fingerprint is computed here rather than read from the node:
// it is the local fingerprint plus the sum of the children's subtree fingerprints.
{
u_int32_t subtree_fingerprint = node - > local_fingerprint ;
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
subtree_fingerprint + = BNC_SUBTREE_FINGERPRINT ( node , i ) ;
2007-11-14 17:58:38 +00:00
}
2008-04-04 18:22:01 +00:00
wbuf_uint ( & w , subtree_fingerprint ) ;
2007-11-14 17:58:38 +00:00
}
2007-07-13 19:37:47 +00:00
wbuf_int ( & w , node - > u . n . n_children ) ;
2007-11-14 17:58:38 +00:00
// per child: subtree fingerprint, the three subtree estimates, and the exact bit
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2008-04-04 18:22:01 +00:00
wbuf_uint ( & w , BNC_SUBTREE_FINGERPRINT ( node , i ) ) ;
2013-04-16 23:57:47 -04:00
struct subtree_estimates * se = & ( BNC_SUBTREE_ESTIMATES ( node , i ) ) ;
wbuf_ulonglong ( & w , se - > nkeys ) ;
wbuf_ulonglong ( & w , se - > ndata ) ;
wbuf_ulonglong ( & w , se - > dsize ) ;
2013-04-16 23:57:47 -04:00
wbuf_char ( & w , ( char ) se - > exact ) ;
2007-11-14 17:58:38 +00:00
}
2007-07-13 19:37:47 +00:00
//printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone);
// the n_children-1 pivot keys (key and value for DUPSORT trees, key only otherwise)
for ( i = 0 ; i < node - > u . n . n_children - 1 ; i + + ) {
2007-11-27 15:22:56 +00:00
if ( node - > flags & TOKU_DB_DUPSORT ) {
2007-11-27 18:16:45 +00:00
wbuf_bytes ( & w , kv_pair_key ( node - > u . n . childkeys [ i ] ) , kv_pair_keylen ( node - > u . n . childkeys [ i ] ) ) ;
wbuf_bytes ( & w , kv_pair_val ( node - > u . n . childkeys [ i ] ) , kv_pair_vallen ( node - > u . n . childkeys [ i ] ) ) ;
} else {
2007-12-06 14:20:47 +00:00
wbuf_bytes ( & w , kv_pair_key ( node - > u . n . childkeys [ i ] ) , toku_brtnode_pivot_key_len ( node , node - > u . n . childkeys [ i ] ) ) ;
2007-11-27 18:16:45 +00:00
}
2007-07-13 19:37:47 +00:00
//printf("%s:%d w.ndone=%d (childkeylen[%d]=%d\n", __FILE__, __LINE__, w.ndone, i, node->childkeylens[i]);
}
// the block number of each child
for ( i = 0 ; i < node - > u . n . n_children ; i + + ) {
2013-04-16 23:57:18 -04:00
wbuf_BLOCKNUM ( & w , BNC_BLOCKNUM ( node , i ) ) ;
2007-07-13 19:37:47 +00:00
//printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone);
}
{
2008-01-11 14:03:33 +00:00
// each child's buffer: entry count, then every message (type, xid, key, data).
// The local fingerprint is recomputed from the messages and checked against the node's.
int n_buffers = node - > u . n . n_children ;
2007-11-14 17:58:38 +00:00
u_int32_t check_local_fingerprint = 0 ;
2008-01-11 14:03:33 +00:00
for ( i = 0 ; i < n_buffers ; i + + ) {
2007-07-13 19:37:47 +00:00
//printf("%s:%d p%d=%p n_entries=%d\n", __FILE__, __LINE__, i, node->mdicts[i], mdict_n_entries(node->mdicts[i]));
2008-01-31 22:05:43 +00:00
wbuf_int ( & w , toku_fifo_n_entries ( BNC_BUFFER ( node , i ) ) ) ;
2008-02-05 18:25:23 +00:00
FIFO_ITERATE ( BNC_BUFFER ( node , i ) , key , keylen , data , datalen , type , xid ,
2013-04-16 23:57:21 -04:00
{
2013-04-16 23:57:20 -04:00
assert ( type > = 0 & & type < 256 ) ;
2013-04-16 23:57:19 -04:00
wbuf_char ( & w , ( unsigned char ) type ) ;
2008-02-05 18:25:23 +00:00
wbuf_TXNID ( & w , xid ) ;
2007-11-14 17:58:38 +00:00
wbuf_bytes ( & w , key , keylen ) ;
wbuf_bytes ( & w , data , datalen ) ;
2008-07-27 22:16:49 +00:00
check_local_fingerprint + = node - > rand4fingerprint * toku_calc_fingerprint_cmd ( type , xid , key , keylen , data , datalen ) ;
2013-04-16 23:57:21 -04:00
} ) ;
2007-07-13 19:37:47 +00:00
}
2007-11-14 17:58:38 +00:00
//printf("%s:%d check_local_fingerprint=%8x\n", __FILE__, __LINE__, check_local_fingerprint);
2013-04-16 23:57:18 -04:00
if ( check_local_fingerprint ! = node - > local_fingerprint ) printf ( " %s:%d node=% " PRId64 " fingerprint expected=%08x actual=%08x \n " , __FILE__ , __LINE__ , node - > thisnodename . b , check_local_fingerprint , node - > local_fingerprint ) ;
2007-11-14 17:58:38 +00:00
assert ( check_local_fingerprint = = node - > local_fingerprint ) ;
2007-07-13 19:37:47 +00:00
}
} else {
2008-04-07 01:30:25 +00:00
//printf("%s:%d writing node %lld n_entries=%d\n", __FILE__, __LINE__, node->thisnodename, toku_gpma_n_entries(node->u.l.buffer));
2013-04-16 23:57:47 -04:00
// leaf: the three leaf stats, the number of leaf entries, then the entries themselves
wbuf_ulonglong ( & w , node - > u . l . leaf_stats . nkeys ) ;
wbuf_ulonglong ( & w , node - > u . l . leaf_stats . ndata ) ;
wbuf_ulonglong ( & w , node - > u . l . leaf_stats . dsize ) ;
2008-04-22 20:39:50 +00:00
wbuf_uint ( & w , toku_omt_size ( node - > u . l . buffer ) ) ;
toku_omt_iterate ( node - > u . l . buffer , wbufwriteleafentry , & w ) ;
2007-07-13 19:37:47 +00:00
}
assert ( w . ndone < = w . size ) ;
2007-11-14 17:58:38 +00:00
# ifdef CRC_ATEND
2013-04-16 23:57:41 -04:00
wbuf_int ( & w , crc32 ( toku_null_crc , w . buf , w . ndone ) ) ;
2007-11-14 17:58:38 +00:00
# endif
# ifdef CRC_INCR
2008-07-27 22:16:49 +00:00
{
u_int32_t checksum = x1764_finish ( & w . checksum ) ;
wbuf_uint ( & w , checksum ) ;
}
2007-11-14 17:58:38 +00:00
# endif
2013-04-16 23:57:16 -04:00
// the serialized image must be exactly as long as predicted
if ( calculated_size ! = w . ndone )
2013-04-16 23:57:19 -04:00
printf ( " %s:%d w.done=%u calculated_size=%u \n " , __FILE__ , __LINE__ , w . ndone , calculated_size ) ;
2013-04-16 23:57:16 -04:00
assert ( calculated_size = = w . ndone ) ;
2013-04-16 23:57:29 -04:00
// The uncompressed part of the block header is
2013-04-16 23:57:16 -04:00
// tokuleaf(8),
// version(4),
// lsn(8),
2013-04-16 23:57:46 -04:00
// n_sub_blocks(4), followed by n length pairs
// compressed_len(4)
// uncompressed_len(4)
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:46 -04:00
// select the number of sub blocks and their sizes.
// impose an upper bound on the number of sub blocks.
int max_sub_blocks = 4 ;
if ( node - > layout_version < BRT_LAYOUT_VERSION_10 )
max_sub_blocks = 1 ;
struct sub_block_sizes sub_block_sizes [ max_sub_blocks ] ;
int n_sub_blocks = get_sub_block_sizes ( calculated_size - uncompressed_magic_len , max_sub_blocks , sub_block_sizes ) ;
assert ( 0 < n_sub_blocks & & n_sub_blocks < = max_sub_blocks ) ;
if ( 0 & & n_sub_blocks ! = 1 ) {
printf ( " %s:%d %d: " , __FUNCTION__ , __LINE__ , n_sub_blocks ) ;
for ( i = 0 ; i < n_sub_blocks ; i + + )
printf ( " %u " , sub_block_sizes [ i ] . uncompressed_size ) ;
printf ( " \n " ) ;
}
// at this point compressed_len is the sum of the compressBound() estimates, so the
// output buffer is big enough for every sub block
size_t compressed_len = get_sum_compressed_size ( n_sub_blocks , sub_block_sizes ) ;
size_t compression_header_len = get_compression_header_size ( node - > layout_version , n_sub_blocks ) ;
2013-04-16 23:57:16 -04:00
char * MALLOC_N ( compressed_len + uncompressed_magic_len + compression_header_len , compressed_buf ) ;
memcpy ( compressed_buf , buf , uncompressed_magic_len ) ;
if ( 0 ) printf ( " First 4 bytes before compressing data are %02x%02x%02x%02x \n " ,
2013-04-16 23:57:46 -04:00
buf [ uncompressed_magic_len ] , buf [ uncompressed_magic_len + 1 ] ,
buf [ uncompressed_magic_len + 2 ] , buf [ uncompressed_magic_len + 3 ] ) ;
// compress each sub block in turn, packing the results back to back right after
// the space reserved for the compression header
char * uncompressed_ptr = buf + uncompressed_magic_len ;
char * compressed_base_ptr = compressed_buf + uncompressed_magic_len + compression_header_len ;
char * compressed_ptr = compressed_base_ptr ;
for ( i = 0 ; i < n_sub_blocks ; i + + ) {
uLongf uncompressed_len = sub_block_sizes [ i ] . uncompressed_size ;
uLongf real_compressed_len = sub_block_sizes [ i ] . compressed_size ;
{
2013-04-16 23:57:38 -04:00
# ifdef ADAPTIVE_COMPRESSION
2013-04-16 23:57:46 -04:00
// Marketing has expressed concern that this algorithm will make customers go crazy.
int compression_level ;
if ( n_workitems < = n_threads ) compression_level = 5 ;
else if ( n_workitems < = 2 * n_threads ) compression_level = 4 ;
else if ( n_workitems < = 3 * n_threads ) compression_level = 3 ;
else if ( n_workitems < = 4 * n_threads ) compression_level = 2 ;
else compression_level = 1 ;
2013-04-16 23:57:38 -04:00
# else
2013-04-16 23:57:46 -04:00
int compression_level = 5 ;
ignore_int ( n_workitems ) ; ignore_int ( n_threads ) ;
2013-04-16 23:57:38 -04:00
# endif
2013-04-16 23:57:46 -04:00
//printf("compress(%d) n_workitems=%d n_threads=%d\n", compression_level, n_workitems, n_threads);
int r = compress2 ( ( Bytef * ) compressed_ptr , & real_compressed_len ,
( Bytef * ) uncompressed_ptr , uncompressed_len ,
compression_level ) ;
assert ( r = = Z_OK ) ;
sub_block_sizes [ i ] . compressed_size = real_compressed_len ; // replace the compressed size estimate with the real size
uncompressed_ptr + = uncompressed_len ; // update the uncompressed and compressed buffer pointers
compressed_ptr + = real_compressed_len ;
}
2013-04-16 23:57:16 -04:00
}
2013-04-16 23:57:46 -04:00
// from here on compressed_len is the real total size of the compressed sub blocks
compressed_len = compressed_ptr - compressed_base_ptr ;
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:46 -04:00
if ( 0 ) printf ( " Block % " PRId64 " Size before compressing %u, after compression % " PRIu64 " \n " , blocknum . b , calculated_size - uncompressed_magic_len , ( uint64_t ) compressed_len ) ;
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:46 -04:00
// write out the compression header
uint32_t * compressed_header_ptr = ( uint32_t * ) ( compressed_buf + uncompressed_magic_len ) ;
if ( node - > layout_version > = BRT_LAYOUT_VERSION_10 )
2013-04-16 23:57:46 -04:00
* compressed_header_ptr + + = toku_htod32 ( n_sub_blocks ) ;
2013-04-16 23:57:46 -04:00
for ( i = 0 ; i < n_sub_blocks ; i + + ) {
2013-04-16 23:57:46 -04:00
compressed_header_ptr [ 0 ] = toku_htod32 ( sub_block_sizes [ i ] . compressed_size ) ;
compressed_header_ptr [ 1 ] = toku_htod32 ( sub_block_sizes [ i ] . uncompressed_size ) ;
2013-04-16 23:57:46 -04:00
compressed_header_ptr + = 2 ;
}
2008-03-18 12:08:56 +00:00
2007-11-14 17:58:38 +00:00
//write_now: printf("%s:%d Writing %d bytes\n", __FILE__, __LINE__, w.ndone);
2013-04-16 23:57:38 -04:00
int r ;
2007-07-13 19:37:47 +00:00
{
2008-06-15 17:09:14 +00:00
// If the node has never been written, then write the whole buffer, including the zeros
2013-04-16 23:57:18 -04:00
assert ( blocknum . b > = 0 ) ;
2013-04-16 23:57:18 -04:00
//printf("%s:%d h=%p\n", __FILE__, __LINE__, h);
//printf("%s:%d translated_blocknum_limit=%lu blocknum.b=%lu\n", __FILE__, __LINE__, h->translated_blocknum_limit, blocknum.b);
//printf("%s:%d allocator=%p\n", __FILE__, __LINE__, h->block_allocator);
//printf("%s:%d bt=%p\n", __FILE__, __LINE__, h->block_translation);
2013-04-16 23:57:16 -04:00
size_t n_to_write = uncompressed_magic_len + compression_header_len + compressed_len ;
2013-04-16 23:57:47 -04:00
DISKOFF offset ;
2013-04-16 23:57:44 -04:00
//h will be dirtied
2013-04-16 23:57:47 -04:00
toku_blocknum_realloc_on_disk ( h - > blocktable , blocknum , n_to_write , & offset ,
h , for_checkpoint ) ;
2013-04-16 23:57:38 -04:00
ssize_t n_wrote ;
2013-04-16 23:57:47 -04:00
// the write itself happens with the pwrite lock held; its error code (if any) is returned to the caller
lock_for_pwrite ( ) ;
2013-04-16 23:57:38 -04:00
r = toku_pwrite_extend ( fd , compressed_buf , n_to_write , offset , & n_wrote ) ;
if ( r ) {
2013-04-16 23:57:38 -04:00
// fprintf(stderr, "%s:%d: Error writing data to file. errno=%d (%s)\n", __FILE__, __LINE__, r, strerror(r));
2013-04-16 23:57:38 -04:00
} else {
r = 0 ;
}
2013-04-16 23:57:18 -04:00
unlock_for_pwrite ( ) ;
2007-07-13 19:37:47 +00:00
}
//printf("%s:%d wrote %d bytes for %lld size=%lld\n", __FILE__, __LINE__, w.ndone, off, size);
2013-04-16 23:57:23 -04:00
assert ( w . ndone = = calculated_size ) ;
2007-10-16 21:02:53 +00:00
toku_free ( buf ) ;
2013-04-16 23:57:16 -04:00
toku_free ( compressed_buf ) ;
2013-04-16 23:57:38 -04:00
return r ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:46 -04:00
# define DO_DECOMPRESS_WORKER 1
// Description of one sub block to decompress, plus the thread that does it.
struct decompress_work {
    toku_pthread_t id;           // worker thread, when the work is started on one
    void *compress_ptr;          // start of the compressed input
    void *uncompress_ptr;        // where the uncompressed output goes
    u_int32_t compress_size;     // number of compressed input bytes
    u_int32_t uncompress_size;   // number of output bytes expected
};
// initialize the decompression work
static void init_decompress_work ( struct decompress_work * w ,
void * compress_ptr , u_int32_t compress_size ,
void * uncompress_ptr , u_int32_t uncompress_size ) {
w - > id = 0 ;
w - > compress_ptr = compress_ptr ; w - > compress_size = compress_size ;
w - > uncompress_ptr = uncompress_ptr ; w - > uncompress_size = uncompress_size ;
}
// do the decompression work
static void do_decompress_work ( struct decompress_work * w ) {
uLongf destlen = w - > uncompress_size ;
int r = uncompress ( w - > uncompress_ptr , & destlen ,
w - > compress_ptr , w - > compress_size ) ;
assert ( destlen = = w - > uncompress_size ) ;
assert ( r = = Z_OK ) ;
}
# if DO_DECOMPRESS_WORKER
static void * decompress_worker ( void * ) ;
static void start_decompress_work ( struct decompress_work * w ) {
int r = toku_pthread_create ( & w - > id , NULL , decompress_worker , w ) ; assert ( r = = 0 ) ;
}
static void wait_decompress_work ( struct decompress_work * w ) {
void * ret ;
int r = toku_pthread_join ( w - > id , & ret ) ; assert ( r = = 0 ) ;
}
// Thread entry point: ARG is a struct decompress_work; decompress it and
// hand the same pointer back as the thread's result.
static void *decompress_worker (void *arg) {
    do_decompress_work((struct decompress_work *) arg);
    return arg;
}
# endif
# define DO_TOKU_TRACE 0
# if DO_TOKU_TRACE
static int toku_trace_fd = - 1 ;
// Write the LEN bytes at CP to the trace file descriptor.
// The result of write() is not examined.
static inline void
do_toku_trace (const char *cp, int len)
{
    write(toku_trace_fd, cp, len);
}
# define toku_trace(a) do_toku_trace(a, strlen(a))
# else
# define toku_trace(a)
# endif
2013-04-16 23:57:18 -04:00
int toku_deserialize_brtnode_from ( int fd , BLOCKNUM blocknum , u_int32_t fullhash , BRTNODE * brtnode , struct brt_header * h ) {
2013-04-16 23:57:25 -04:00
if ( 0 ) printf ( " Deserializing Block % " PRId64 " \n " , blocknum . b ) ;
2013-04-16 23:57:38 -04:00
if ( h - > panic ) return h - > panic ;
2013-04-16 23:57:46 -04:00
# if DO_TOKU_TRACE
if ( toku_trace_fd = = - 1 )
toku_trace_fd = open ( " /dev/null " , O_WRONLY ) ;
toku_trace ( " deserial start " ) ;
# endif
// get the file offset and block size for the block
2013-04-16 23:57:42 -04:00
DISKOFF offset , size ;
2013-04-16 23:57:47 -04:00
toku_translate_blocknum_to_offset_size ( h - > blocktable , blocknum , & offset , & size ) ;
2007-07-13 19:37:47 +00:00
TAGMALLOC ( BRTNODE , result ) ;
2007-08-21 23:32:17 +00:00
struct rbuf rc ;
2007-07-13 19:37:47 +00:00
int i ;
int r ;
2008-04-07 15:54:58 +00:00
if ( result = = 0 ) {
2007-07-13 19:37:47 +00:00
r = errno ;
2007-07-20 18:00:14 +00:00
if ( 0 ) { died0 : toku_free ( result ) ; }
2007-07-13 19:37:47 +00:00
return r ;
}
2013-04-16 23:57:41 -04:00
result - > ever_been_written = 1 ;
2013-04-16 23:57:42 -04:00
unsigned char * MALLOC_N ( size , compressed_block ) ;
2013-04-16 23:57:23 -04:00
2013-04-16 23:57:46 -04:00
// read the compressed block
2013-04-16 23:57:42 -04:00
ssize_t rlen = pread ( fd , compressed_block , size , offset ) ;
assert ( ( DISKOFF ) rlen = = size ) ;
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:46 -04:00
// get the layout_version
2013-04-16 23:57:42 -04:00
unsigned char * uncompressed_header = compressed_block ;
2013-04-16 23:57:46 -04:00
int layout_version = toku_dtoh32 ( * ( uint32_t * ) ( uncompressed_header + uncompressed_version_offset ) ) ;
2013-04-16 23:57:46 -04:00
// get the number of compressed sub blocks
int n_sub_blocks ;
int compression_header_offset ;
if ( layout_version < BRT_LAYOUT_VERSION_10 ) {
n_sub_blocks = 1 ;
compression_header_offset = uncompressed_magic_len ;
} else {
2013-04-16 23:57:46 -04:00
n_sub_blocks = toku_dtoh32 ( * ( u_int32_t * ) ( & uncompressed_header [ uncompressed_magic_len ] ) ) ;
2013-04-16 23:57:46 -04:00
compression_header_offset = uncompressed_magic_len + 4 ;
}
assert ( 0 < n_sub_blocks ) ;
// verify the sizes of the compressed sub blocks
if ( 0 & & n_sub_blocks ! = 1 ) printf ( " %s:%d %d \n " , __FUNCTION__ , __LINE__ , n_sub_blocks ) ;
struct sub_block_sizes sub_block_sizes [ n_sub_blocks ] ;
for ( i = 0 ; i < n_sub_blocks ; i + + ) {
2013-04-16 23:57:46 -04:00
u_int32_t compressed_size = toku_dtoh32 ( * ( u_int32_t * ) ( & uncompressed_header [ compression_header_offset + 8 * i ] ) ) ;
2013-04-16 23:57:46 -04:00
if ( compressed_size < = 0 | | compressed_size > ( 1 < < 30 ) ) { r = toku_db_badformat ( ) ; goto died0 ; }
2013-04-16 23:57:46 -04:00
u_int32_t uncompressed_size = toku_dtoh32 ( * ( u_int32_t * ) ( & uncompressed_header [ compression_header_offset + 8 * i + 4 ] ) ) ;
2013-04-16 23:57:46 -04:00
if ( 0 ) printf ( " Block % " PRId64 " Compressed size = %u, uncompressed size=%u \n " , blocknum . b , compressed_size , uncompressed_size ) ;
if ( uncompressed_size < = 0 | | uncompressed_size > ( 1 < < 30 ) ) { r = toku_db_badformat ( ) ; goto died0 ; }
2013-04-16 23:57:42 -04:00
2013-04-16 23:57:46 -04:00
sub_block_sizes [ i ] . compressed_size = compressed_size ;
sub_block_sizes [ i ] . uncompressed_size = uncompressed_size ;
}
unsigned char * compressed_data = compressed_block + uncompressed_magic_len + get_compression_header_size ( layout_version , n_sub_blocks ) ;
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:46 -04:00
size_t uncompressed_size = get_sum_uncompressed_size ( n_sub_blocks , sub_block_sizes ) ;
rc . size = uncompressed_magic_len + uncompressed_size ;
2013-04-16 23:57:16 -04:00
assert ( rc . size > 0 ) ;
2013-04-16 23:57:16 -04:00
rc . buf = toku_malloc ( rc . size ) ;
assert ( rc . buf ) ;
2013-04-16 23:57:46 -04:00
// construct the uncompressed block from the header and compressed sub blocks
2013-04-16 23:57:16 -04:00
memcpy ( rc . buf , uncompressed_header , uncompressed_magic_len ) ;
2013-04-16 23:57:46 -04:00
// decompress the sub blocks
2013-04-16 23:57:46 -04:00
unsigned char * uncompressed_data = rc . buf + uncompressed_magic_len ;
2013-04-16 23:57:46 -04:00
struct decompress_work decompress_work [ n_sub_blocks ] ;
for ( i = 0 ; i < n_sub_blocks ; i + + ) {
init_decompress_work ( & decompress_work [ i ] , compressed_data , sub_block_sizes [ i ] . compressed_size , uncompressed_data , sub_block_sizes [ i ] . uncompressed_size ) ;
if ( i > 0 ) {
# if DO_DECOMPRESS_WORKER
start_decompress_work ( & decompress_work [ i ] ) ;
# else
do_decompress_work ( & decompress_work [ i ] ) ;
# endif
}
uncompressed_data + = sub_block_sizes [ i ] . uncompressed_size ;
compressed_data + = sub_block_sizes [ i ] . compressed_size ;
2013-04-16 23:57:16 -04:00
}
2013-04-16 23:57:46 -04:00
do_decompress_work ( & decompress_work [ 0 ] ) ;
# if DO_DECOMPRESS_WORKER
for ( i = 1 ; i < n_sub_blocks ; i + + )
wait_decompress_work ( & decompress_work [ i ] ) ;
# endif
toku_trace ( " decompress done " ) ;
2013-04-16 23:57:16 -04:00
if ( 0 ) printf ( " First 4 bytes of uncompressed data are %02x%02x%02x%02x \n " ,
rc . buf [ uncompressed_magic_len ] , rc . buf [ uncompressed_magic_len + 1 ] ,
rc . buf [ uncompressed_magic_len + 2 ] , rc . buf [ uncompressed_magic_len + 3 ] ) ;
2013-04-16 23:57:42 -04:00
toku_free ( compressed_block ) ;
2013-04-16 23:57:46 -04:00
// deserialize the uncompressed block
2013-04-16 23:57:16 -04:00
rc . ndone = 0 ;
//printf("Deserializing %lld datasize=%d\n", off, datasize);
2007-11-14 17:58:38 +00:00
{
bytevec tmp ;
rbuf_literal_bytes ( & rc , & tmp , 8 ) ;
if ( memcmp ( tmp , " tokuleaf " , 8 ) ! = 0
& & memcmp ( tmp , " tokunode " , 8 ) ! = 0 ) {
2013-04-16 23:57:33 -04:00
r = toku_db_badformat ( ) ;
2013-04-16 23:57:16 -04:00
return r ;
2007-11-14 17:58:38 +00:00
}
}
result - > layout_version = rbuf_int ( & rc ) ;
2008-05-22 21:28:00 +00:00
{
switch ( result - > layout_version ) {
2013-04-16 23:57:47 -04:00
case BRT_LAYOUT_VERSION_10 : goto ok_layout_version ;
// Don't support older versions.
2008-05-22 21:28:00 +00:00
}
2013-04-16 23:57:33 -04:00
r = toku_db_badformat ( ) ;
2013-04-16 23:57:16 -04:00
return r ;
2008-05-22 21:28:00 +00:00
ok_layout_version : ;
2007-11-14 17:58:38 +00:00
}
2007-11-24 03:50:28 +00:00
result - > disk_lsn . lsn = rbuf_ulonglong ( & rc ) ;
2013-04-16 23:57:16 -04:00
result - > nodesize = rbuf_int ( & rc ) ;
2007-11-24 03:50:28 +00:00
result - > log_lsn = result - > disk_lsn ;
2013-04-16 23:57:16 -04:00
2013-04-16 23:57:18 -04:00
result - > thisnodename = blocknum ;
2008-04-17 03:11:55 +00:00
result - > flags = rbuf_int ( & rc ) ;
2007-07-13 19:37:47 +00:00
result - > height = rbuf_int ( & rc ) ;
2007-11-14 17:58:38 +00:00
result - > rand4fingerprint = rbuf_int ( & rc ) ;
result - > local_fingerprint = rbuf_int ( & rc ) ;
2008-04-02 23:40:36 +00:00
// printf("%s:%d read %08x\n", __FILE__, __LINE__, result->local_fingerprint);
2007-09-18 16:09:55 +00:00
result - > dirty = 0 ;
2008-06-17 17:05:19 +00:00
result - > fullhash = fullhash ;
2007-07-13 19:37:47 +00:00
//printf("height==%d\n", result->height);
if ( result - > height > 0 ) {
result - > u . n . totalchildkeylens = 0 ;
2007-11-14 17:58:38 +00:00
u_int32_t subtree_fingerprint = rbuf_int ( & rc ) ;
u_int32_t check_subtree_fingerprint = 0 ;
2007-07-13 19:37:47 +00:00
result - > u . n . n_children = rbuf_int ( & rc ) ;
2008-03-06 22:48:07 +00:00
MALLOC_N ( result - > u . n . n_children + 1 , result - > u . n . childinfos ) ;
MALLOC_N ( result - > u . n . n_children , result - > u . n . childkeys ) ;
2007-07-13 19:37:47 +00:00
//printf("n_children=%d\n", result->n_children);
2013-04-16 23:57:23 -04:00
assert ( result - > u . n . n_children > = 0 ) ;
2007-11-14 17:58:38 +00:00
for ( i = 0 ; i < result - > u . n . n_children ; i + + ) {
u_int32_t childfp = rbuf_int ( & rc ) ;
2008-01-31 22:05:43 +00:00
BNC_SUBTREE_FINGERPRINT ( result , i ) = childfp ;
2007-11-14 17:58:38 +00:00
check_subtree_fingerprint + = childfp ;
2013-04-16 23:57:47 -04:00
struct subtree_estimates * se = & ( BNC_SUBTREE_ESTIMATES ( result , i ) ) ;
se - > nkeys = rbuf_ulonglong ( & rc ) ;
se - > ndata = rbuf_ulonglong ( & rc ) ;
se - > dsize = rbuf_ulonglong ( & rc ) ;
se - > exact = rbuf_char ( & rc ) ;
2007-11-14 17:58:38 +00:00
}
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < result - > u . n . n_children - 1 ; i + + ) {
2007-11-27 15:22:56 +00:00
if ( result - > flags & TOKU_DB_DUPSORT ) {
bytevec keyptr , dataptr ;
unsigned int keylen , datalen ;
rbuf_bytes ( & rc , & keyptr , & keylen ) ;
rbuf_bytes ( & rc , & dataptr , & datalen ) ;
2007-11-27 18:16:45 +00:00
result - > u . n . childkeys [ i ] = kv_pair_malloc ( keyptr , keylen , dataptr , datalen ) ;
2007-11-27 15:22:56 +00:00
} else {
bytevec childkeyptr ;
2007-12-06 14:30:33 +00:00
unsigned int cklen ;
rbuf_bytes ( & rc , & childkeyptr , & cklen ) ; /* Returns a pointer into the rbuf. */
result - > u . n . childkeys [ i ] = kv_pair_malloc ( ( void * ) childkeyptr , cklen , 0 , 0 ) ;
2007-11-27 15:22:56 +00:00
}
//printf(" key %d length=%d data=%s\n", i, result->childkeylens[i], result->childkeys[i]);
2007-12-06 14:20:47 +00:00
result - > u . n . totalchildkeylens + = toku_brtnode_pivot_key_len ( result , result - > u . n . childkeys [ i ] ) ;
2007-07-13 19:37:47 +00:00
}
for ( i = 0 ; i < result - > u . n . n_children ; i + + ) {
2013-04-16 23:57:18 -04:00
BNC_BLOCKNUM ( result , i ) = rbuf_blocknum ( & rc ) ;
2008-06-18 00:30:36 +00:00
BNC_HAVE_FULLHASH ( result , i ) = FALSE ;
2008-03-06 21:46:57 +00:00
BNC_NBYTESINBUF ( result , i ) = 0 ;
2007-07-13 19:37:47 +00:00
//printf("Child %d at %lld\n", i, result->children[i]);
}
2013-04-16 23:57:41 -04:00
result - > u . n . n_bytes_in_buffers = 0 ;
2007-07-24 01:32:03 +00:00
for ( i = 0 ; i < result - > u . n . n_children ; i + + ) {
2008-01-31 22:05:43 +00:00
r = toku_fifo_create ( & BNC_BUFFER ( result , i ) ) ;
2007-07-13 19:37:47 +00:00
if ( r ! = 0 ) {
int j ;
2008-01-11 14:03:33 +00:00
if ( 0 ) { died_12 : j = result - > u . n . n_bytes_in_buffers ; }
2008-01-31 22:05:43 +00:00
for ( j = 0 ; j < i ; j + + ) toku_fifo_free ( & BNC_BUFFER ( result , j ) ) ;
2013-04-16 23:57:33 -04:00
return toku_db_badformat ( ) ;
2007-07-13 19:37:47 +00:00
}
}
{
2007-07-24 01:32:03 +00:00
int cnum ;
2007-11-14 17:58:38 +00:00
u_int32_t check_local_fingerprint = 0 ;
2007-07-24 01:32:03 +00:00
for ( cnum = 0 ; cnum < result - > u . n . n_children ; cnum + + ) {
int n_in_this_hash = rbuf_int ( & rc ) ;
//printf("%d in hash\n", n_in_hash);
for ( i = 0 ; i < n_in_this_hash ; i + + ) {
int diff ;
2013-04-16 23:57:41 -04:00
bytevec key ; ITEMLEN keylen ;
2007-07-24 01:32:03 +00:00
bytevec val ; ITEMLEN vallen ;
2008-04-07 01:30:25 +00:00
//toku_verify_counts(result);
2008-02-05 18:25:23 +00:00
int type = rbuf_char ( & rc ) ;
TXNID xid = rbuf_ulonglong ( & rc ) ;
2007-07-24 01:32:03 +00:00
rbuf_bytes ( & rc , & key , & keylen ) ; /* Returns a pointer into the rbuf. */
rbuf_bytes ( & rc , & val , & vallen ) ;
2008-07-27 22:16:49 +00:00
check_local_fingerprint + = result - > rand4fingerprint * toku_calc_fingerprint_cmd ( type , xid , key , keylen , val , vallen ) ;
2007-09-06 21:36:45 +00:00
//printf("Found %s,%s\n", (char*)key, (char*)val);
2007-07-24 01:32:03 +00:00
{
2008-02-05 18:25:23 +00:00
r = toku_fifo_enq ( BNC_BUFFER ( result , cnum ) , key , keylen , val , vallen , type , xid ) ; /* Copies the data into the hash table. */
2007-07-24 01:32:03 +00:00
if ( r ! = 0 ) { goto died_12 ; }
}
2007-09-06 21:36:45 +00:00
diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD ;
2008-01-11 14:03:33 +00:00
result - > u . n . n_bytes_in_buffers + = diff ;
2008-01-31 22:05:43 +00:00
BNC_NBYTESINBUF ( result , cnum ) + = diff ;
2007-07-24 01:32:03 +00:00
//printf("Inserted\n");
2007-07-13 19:37:47 +00:00
}
}
2007-11-14 17:58:38 +00:00
if ( check_local_fingerprint ! = result - > local_fingerprint ) {
fprintf ( stderr , " %s:%d local fingerprint is wrong (found %8x calcualted %8x \n " , __FILE__ , __LINE__ , result - > local_fingerprint , check_local_fingerprint ) ;
2013-04-16 23:57:33 -04:00
return toku_db_badformat ( ) ;
2007-11-14 17:58:38 +00:00
}
if ( check_subtree_fingerprint + check_local_fingerprint ! = subtree_fingerprint ) {
fprintf ( stderr , " %s:%d subtree fingerprint is wrong \n " , __FILE__ , __LINE__ ) ;
2013-04-16 23:57:33 -04:00
return toku_db_badformat ( ) ;
2007-11-14 17:58:38 +00:00
}
2007-07-13 19:37:47 +00:00
}
} else {
2013-04-16 23:57:47 -04:00
result - > u . l . leaf_stats . nkeys = rbuf_ulonglong ( & rc ) ;
result - > u . l . leaf_stats . ndata = rbuf_ulonglong ( & rc ) ;
result - > u . l . leaf_stats . dsize = rbuf_ulonglong ( & rc ) ;
result - > u . l . leaf_stats . exact = TRUE ;
2007-07-13 19:37:47 +00:00
int n_in_buf = rbuf_int ( & rc ) ;
result - > u . l . n_bytes_in_buffer = 0 ;
2008-06-18 10:02:06 +00:00
result - > u . l . seqinsert = 0 ;
2008-05-29 03:12:59 +00:00
2013-04-16 23:57:41 -04:00
//printf("%s:%d r PMA= %p\n", __FILE__, __LINE__, result->u.l.buffer);
2013-04-16 23:57:16 -04:00
toku_mempool_init ( & result - > u . l . buffer_mempool , rc . buf , uncompressed_size + uncompressed_magic_len ) ;
2008-03-17 02:40:59 +00:00
u_int32_t actual_sum = 0 ;
2008-05-29 03:12:59 +00:00
u_int32_t start_of_data = rc . ndone ;
OMTVALUE * MALLOC_N ( n_in_buf , array ) ;
2007-07-13 19:37:47 +00:00
for ( i = 0 ; i < n_in_buf ; i + + ) {
2008-05-29 03:12:59 +00:00
LEAFENTRY le = ( LEAFENTRY ) ( & rc . buf [ rc . ndone ] ) ;
u_int32_t disksize = leafentry_disksize ( le ) ;
rc . ndone + = disksize ;
assert ( rc . ndone < = rc . size ) ;
array [ i ] = ( OMTVALUE ) le ;
2008-07-27 22:16:49 +00:00
actual_sum + = x1764_memory ( le , disksize ) ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:46 -04:00
toku_trace ( " fill array " ) ;
2008-05-29 03:12:59 +00:00
u_int32_t end_of_data = rc . ndone ;
result - > u . l . n_bytes_in_buffer + = end_of_data - start_of_data + n_in_buf * OMT_ITEM_OVERHEAD ;
actual_sum * = result - > rand4fingerprint ;
2013-04-16 23:57:45 -04:00
r = toku_omt_create_steal_sorted_array ( & result - > u . l . buffer , & array , n_in_buf , n_in_buf ) ;
2013-04-16 23:57:46 -04:00
toku_trace ( " create omt " ) ;
2008-05-29 03:12:59 +00:00
if ( r ! = 0 ) {
2013-04-16 23:57:45 -04:00
toku_free ( array ) ;
2008-05-29 03:12:59 +00:00
if ( 0 ) { died_21 : toku_omt_destroy ( & result - > u . l . buffer ) ; }
2013-04-16 23:57:33 -04:00
return toku_db_badformat ( ) ;
2008-05-29 03:12:59 +00:00
}
2013-04-16 23:57:45 -04:00
assert ( array = = NULL ) ;
2013-04-16 23:57:41 -04:00
r = toku_leaflock_borrow ( & result - > u . l . leaflock ) ;
if ( r ! = 0 ) goto died_21 ;
2008-05-29 03:12:59 +00:00
result - > u . l . buffer_mempool . frag_size = start_of_data ;
result - > u . l . buffer_mempool . free_offset = end_of_data ;
2008-03-17 02:40:59 +00:00
if ( r ! = 0 ) goto died_21 ;
if ( actual_sum ! = result - > local_fingerprint ) {
//fprintf(stderr, "%s:%d Corrupted checksum stored=%08x rand=%08x actual=%08x height=%d n_keys=%d\n", __FILE__, __LINE__, result->rand4fingerprint, result->local_fingerprint, actual_sum, result->height, n_in_buf);
2013-04-16 23:57:33 -04:00
return toku_db_badformat ( ) ;
2013-04-16 23:57:19 -04:00
// goto died_21;
2008-03-17 02:40:59 +00:00
} else {
//fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height);
}
2013-04-16 23:57:41 -04:00
2008-04-07 01:30:25 +00:00
//toku_verify_counts(result);
2007-07-13 19:37:47 +00:00
}
2007-11-14 17:58:38 +00:00
{
unsigned int n_read_so_far = rc . ndone ;
if ( n_read_so_far + 4 ! = rc . size ) {
2013-04-16 23:57:33 -04:00
r = toku_db_badformat ( ) ; goto died_21 ;
2007-11-14 17:58:38 +00:00
}
2013-04-16 23:57:46 -04:00
toku_trace ( " x1764 start " ) ;
2008-07-27 22:16:49 +00:00
uint32_t crc = x1764_memory ( rc . buf , n_read_so_far ) ;
2013-04-16 23:57:46 -04:00
toku_trace ( " x1764 " ) ;
2008-07-23 03:47:05 +00:00
uint32_t storedcrc = rbuf_int ( & rc ) ;
2007-11-14 17:58:38 +00:00
if ( crc ! = storedcrc ) {
printf ( " Bad CRC \n " ) ;
2008-07-27 22:16:49 +00:00
printf ( " %s:%d crc=%08x stored=%08x \n " , __FILE__ , __LINE__ , crc , storedcrc ) ;
2007-11-14 17:58:38 +00:00
assert ( 0 ) ; //this is wrong!!!
2013-04-16 23:57:33 -04:00
r = toku_db_badformat ( ) ;
2007-11-14 17:58:38 +00:00
goto died_21 ;
}
}
2007-07-13 19:37:47 +00:00
//printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children);
2008-05-29 03:12:59 +00:00
if ( result - > height > 0 ) {
// For height==0 we used the buf inside the OMT
toku_free ( rc . buf ) ;
}
2013-04-16 23:57:46 -04:00
toku_trace ( " deserial done " ) ;
2007-07-13 19:37:47 +00:00
* brtnode = result ;
2008-04-07 01:30:25 +00:00
//toku_verify_counts(result);
2007-07-13 19:37:47 +00:00
return 0 ;
}
2008-07-24 21:31:52 +00:00
// Running totals accumulated by sum_item() over the entries of a leaf's OMT.
// Field order matters: toku_verify_counts() initializes this positionally.
struct sum_info {
    unsigned int dsum;   // sum of (OMT_ITEM_OVERHEAD + leafentry_disksize) per entry
    unsigned int msum;   // sum of leafentry_memsize per entry
    unsigned int count;  // number of entries visited
    u_int32_t    fp;     // sum of toku_le_crc per entry
};
2013-04-16 23:57:19 -04:00
// OMT iteration callback used by toku_verify_counts().
// Effect: Adds one leaf entry's contribution to the struct sum_info passed in VSI.
//   lev  the leaf entry being visited
//   idx  position in the OMT (unused)
//   vsi  pointer to the struct sum_info accumulator
// Returns 0 unconditionally.
static int
sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) {
    LEAFENTRY        entry  = lev;
    struct sum_info *totals = vsi;
    totals->count += 1;
    totals->dsum  += OMT_ITEM_OVERHEAD + leafentry_disksize(entry);
    totals->msum  += leafentry_memsize(entry);
    totals->fp    += toku_le_crc(entry);
    return 0;
}
2007-11-19 23:54:17 +00:00
// Effect: Assert that the byte counts (and, for leaves, the fingerprint) cached in NODE
//  agree with what is actually stored in the node.
//  Nonleaf (height>0): the per-child BNC_NBYTESINBUF values must add up to n_bytes_in_buffers.
//  Leaf (height==0):   the OMT is walked with sum_item() and the entry count, disk-size total,
//                      memory-size total and fingerprint are each checked.
void toku_verify_counts (BRTNODE node) {
    if (node->height != 0) {
        // Only the first n_children buffers are summed.
        unsigned int total = 0;
        int child;
        for (child = 0; child < node->u.n.n_children; child++) {
            total += BNC_NBYTESINBUF(node, child);
        }
        assert(total == node->u.n.n_bytes_in_buffers);
        return;
    }

    assert(node->u.l.buffer);
    struct sum_info totals = {0, 0, 0, 0};
    toku_omt_iterate(node->u.l.buffer, sum_item, &totals);
    assert(totals.count == toku_omt_size(node->u.l.buffer));
    assert(totals.dsum == node->u.l.n_bytes_in_buffer);
    assert(totals.msum == node->u.l.buffer_mempool.free_offset - node->u.l.buffer_mempool.frag_size);

    // The stored fingerprint is the sum of the entry crcs scaled by rand4fingerprint.
    u_int32_t expected_fp = node->rand4fingerprint * totals.fp;
    assert(expected_fp == node->local_fingerprint);
}
2013-04-16 23:57:41 -04:00
2013-04-16 23:57:47 -04:00
int toku_serialize_brt_header_size ( struct brt_header * UU ( h ) ) {
2013-04-16 23:57:41 -04:00
unsigned int size = ( + 8 // "tokudata"
2008-05-22 21:28:00 +00:00
+ 4 // size
+ 4 // version
2013-04-16 23:57:46 -04:00
+ 8 // byte order verification
2013-04-16 23:57:47 -04:00
+ 8 // checkpoint_count
+ 8 // checkpoint_lsn
2013-04-16 23:57:18 -04:00
+ 4 // tree's nodesize
2013-04-16 23:57:47 -04:00
+ 8 // translation_size_on_disk
+ 8 // translation_address_on_disk
+ 4 // checksum
2013-04-16 23:57:18 -04:00
) ;
2013-04-16 23:57:47 -04:00
size + = ( + 8 // diskoff
+ 4 // flags
) ;
2013-04-16 23:57:47 -04:00
assert ( size < = BLOCK_ALLOCATOR_HEADER_RESERVE ) ;
2007-11-21 13:07:49 +00:00
return size ;
}
2013-04-16 23:57:47 -04:00
// Effect: Append the serialized form of header H to WBUF.
//  translation_location_on_disk / translation_size_on_disk describe where the block
//  translation table lives in the file; they are recorded inside the header.
//  The field order below is the on-disk format and must not be changed.
// Returns 0.
int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h, DISKOFF translation_location_on_disk, DISKOFF translation_size_on_disk) {
    unsigned int header_size = toku_serialize_brt_header_size(h);

    // Prefix: these fields have a fixed representation regardless of disk byte order.
    wbuf_literal_bytes(wbuf, "tokudata", 8);
    wbuf_network_int  (wbuf, header_size);               // MUST be in network order
    wbuf_network_int  (wbuf, h->layout_version);         // MUST be in network order
    wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8);  // raw bytes, never byte-swapped

    // Checkpoint information and tree parameters.
    wbuf_ulonglong(wbuf, h->checkpoint_count);
    wbuf_LSN      (wbuf, h->checkpoint_lsn);
    wbuf_int      (wbuf, h->nodesize);

    // Where to find the block translation table.
    wbuf_DISKOFF(wbuf, translation_location_on_disk);
    wbuf_DISKOFF(wbuf, translation_size_on_disk);

    wbuf_BLOCKNUM(wbuf, h->root);
    wbuf_int     (wbuf, h->flags);

    // Finish the checksum kept in wbuf->checksum and append it as the last field.
    wbuf_int(wbuf, x1764_finish(&wbuf->checksum));

    assert(wbuf->ndone <= wbuf->size);
    return 0;
}
int toku_serialize_brt_header_to ( int fd , struct brt_header * h ) {
2013-04-16 23:57:38 -04:00
int rr = 0 ;
if ( h - > panic ) return h - > panic ;
2013-04-16 23:57:47 -04:00
assert ( h - > type = = BRTHEADER_CHECKPOINT_INPROGRESS ) ;
2013-04-16 23:57:44 -04:00
toku_block_lock_for_multiple_operations ( h - > blocktable ) ;
2013-04-16 23:57:47 -04:00
struct wbuf w_translation ;
int64_t size_translation ;
int64_t address_translation ;
{
//Must serialize translation first, to get address,size for header.
toku_serialize_translation_to_wbuf_unlocked ( h - > blocktable , & w_translation ,
& address_translation ,
& size_translation ) ;
assert ( size_translation = = w_translation . size ) ;
}
2013-04-16 23:57:41 -04:00
struct wbuf w_main ;
unsigned int size_main = toku_serialize_brt_header_size ( h ) ;
2007-07-13 19:37:47 +00:00
{
2013-04-16 23:57:41 -04:00
wbuf_init ( & w_main , toku_malloc ( size_main ) , size_main ) ;
2013-04-16 23:57:38 -04:00
{
2013-04-16 23:57:47 -04:00
int r = toku_serialize_brt_header_to_wbuf ( & w_main , h , address_translation , size_translation ) ;
2013-04-16 23:57:38 -04:00
assert ( r = = 0 ) ;
}
2013-04-16 23:57:41 -04:00
assert ( w_main . ndone = = size_main ) ;
}
2013-04-16 23:57:47 -04:00
toku_block_unlock_for_multiple_operations ( h - > blocktable ) ;
char * writing_what ;
2013-04-16 23:57:47 -04:00
lock_for_pwrite ( ) ;
2013-04-16 23:57:41 -04:00
{
2013-04-16 23:57:47 -04:00
//Actual Write translation table
ssize_t nwrote ;
rr = toku_pwrite_extend ( fd , w_translation . buf ,
size_translation , address_translation , & nwrote ) ;
if ( rr ) {
writing_what = " translation " ;
goto panic ;
}
assert ( nwrote = = size_translation ) ;
2013-04-16 23:57:41 -04:00
}
{
//Actual Write main header
2013-04-16 23:57:38 -04:00
ssize_t nwrote ;
2013-04-16 23:57:47 -04:00
//Alternate writing header to two locations:
// Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
toku_off_t main_offset ;
//TODO: #1623 uncomment next line when ready for 2 headers
main_offset = ( h - > checkpoint_count & 0x1 ) ? 0 : BLOCK_ALLOCATOR_HEADER_RESERVE ;
rr = toku_pwrite_extend ( fd , w_main . buf , w_main . ndone , main_offset , & nwrote ) ;
2013-04-16 23:57:38 -04:00
if ( rr ) {
2013-04-16 23:57:47 -04:00
writing_what = " header " ;
panic :
2013-04-16 23:57:38 -04:00
if ( h - > panic = = 0 ) {
2013-04-16 23:57:42 -04:00
char * e = strerror ( rr ) ;
int l = 200 + strlen ( e ) ;
char s [ l ] ;
2013-04-16 23:57:38 -04:00
h - > panic = rr ;
2013-04-16 23:57:47 -04:00
snprintf ( s , l - 1 , " %s:%d: Error writing %s to data file. errno=%d (%s) \n " , __FILE__ , __LINE__ , writing_what , rr , e ) ;
2013-04-16 23:57:38 -04:00
h - > panic_string = toku_strdup ( s ) ;
}
goto finish ;
}
2013-04-16 23:57:41 -04:00
assert ( ( u_int64_t ) nwrote = = size_main ) ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:38 -04:00
finish :
2013-04-16 23:57:47 -04:00
toku_free ( w_main . buf ) ;
2013-04-16 23:57:41 -04:00
toku_free ( w_translation . buf ) ;
2013-04-16 23:57:18 -04:00
unlock_for_pwrite ( ) ;
2013-04-16 23:57:38 -04:00
return rr ;
2007-07-13 19:37:47 +00:00
}
2013-04-16 23:57:47 -04:00
//Descriptor is written to disk during toku_brt_open iff we have a new (or changed)
//descriptor.
//Descriptors are NOT written during the header checkpoint process.
2013-04-16 23:57:47 -04:00
int
toku_serialize_descriptor_contents_to_fd ( int fd , DBT * desc , DISKOFF offset ) {
int r ;
// make the checksum
int64_t size = desc - > size + 4 ; //4 for checksum
struct wbuf w ;
wbuf_init ( & w , toku_xmalloc ( size ) , size ) ;
wbuf_literal_bytes ( & w , desc - > data , desc - > size ) ;
u_int32_t checksum = x1764_finish ( & w . checksum ) ;
wbuf_int ( & w , checksum ) ;
assert ( w . ndone = = w . size ) ;
{
lock_for_pwrite ( ) ;
//Actual Write translation table
ssize_t nwrote ;
r = toku_pwrite_extend ( fd , w . buf , size , offset , & nwrote ) ;
unlock_for_pwrite ( ) ;
if ( r = = 0 ) assert ( nwrote = = size ) ;
}
toku_free ( w . buf ) ;
return r ;
}
static void
2013-04-16 23:57:48 -04:00
deserialize_descriptor_from ( int fd , struct brt_header * h , DBT * desc ) {
2013-04-16 23:57:47 -04:00
DISKOFF offset ;
DISKOFF size ;
toku_get_descriptor_offset_size ( h - > blocktable , & offset , & size ) ;
2013-04-16 23:57:47 -04:00
memset ( desc , 0 , sizeof ( * desc ) ) ;
2013-04-16 23:57:47 -04:00
if ( size > 0 ) {
2013-04-16 23:57:47 -04:00
assert ( size > = 4 ) ; //4 for checksum
{
unsigned char * XMALLOC_N ( size , dbuf ) ;
{
lock_for_pwrite ( ) ;
ssize_t r = pread ( fd , dbuf , size , offset ) ;
assert ( r = = size ) ;
unlock_for_pwrite ( ) ;
}
{
// check the checksum
u_int32_t x1764 = x1764_memory ( dbuf , size - 4 ) ;
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
u_int32_t stored_x1764 = toku_dtoh32 ( * ( int * ) ( dbuf + size - 4 ) ) ;
assert ( x1764 = = stored_x1764 ) ;
}
2013-04-16 23:57:48 -04:00
desc - > size = size - 4 ;
2013-04-16 23:57:47 -04:00
desc - > data = dbuf ; //Uses 4 extra bytes, but fast.
2013-04-16 23:57:47 -04:00
}
}
}
2013-04-16 23:57:18 -04:00
// We only deserialize brt header once and then share everything with all the brts.
2013-04-16 23:57:19 -04:00
static int
2013-04-16 23:57:47 -04:00
deserialize_brtheader ( int fd , struct rbuf * rb , struct brt_header * * brth ) {
// We already know:
// we have an rbuf representing the header.
// The checksum has been validated
//Steal rbuf (used to simplify merge/reduce diff size/keep old code)
struct rbuf rc = * rb ;
memset ( rb , 0 , sizeof ( * rb ) ) ;
2013-04-16 23:57:47 -04:00
struct brt_header * CALLOC ( h ) ;
2008-05-22 21:28:00 +00:00
if ( h = = 0 ) return errno ;
int ret = - 1 ;
2013-04-16 23:57:47 -04:00
if ( 0 ) { died1 : toku_free ( h ) ; return ret ; }
h - > type = BRTHEADER_CURRENT ;
h - > checkpoint_header = NULL ;
2008-05-22 21:28:00 +00:00
h - > dirty = 0 ;
2013-04-16 23:57:38 -04:00
h - > panic = 0 ;
h - > panic_string = 0 ;
2013-04-16 23:57:46 -04:00
//version MUST be in network order on disk regardless of disk order
h - > layout_version = rbuf_network_int ( & rc ) ;
assert ( h - > layout_version = = BRT_LAYOUT_VERSION_10 ) ;
bytevec tmp_byte_order_check ;
rbuf_literal_bytes ( & rc , & tmp_byte_order_check , 8 ) ; //Must not translate byte order
int64_t byte_order_stored = * ( int64_t * ) tmp_byte_order_check ;
assert ( byte_order_stored = = toku_byte_order_host ) ;
2013-04-16 23:57:47 -04:00
assert ( h - > layout_version = = BRT_LAYOUT_VERSION_10 ) ;
2013-04-16 23:57:47 -04:00
h - > checkpoint_count = rbuf_ulonglong ( & rc ) ;
h - > checkpoint_lsn = rbuf_lsn ( & rc ) ;
h - > nodesize = rbuf_int ( & rc ) ;
DISKOFF translation_address_on_disk = rbuf_diskoff ( & rc ) ;
DISKOFF translation_size_on_disk = rbuf_diskoff ( & rc ) ;
assert ( translation_address_on_disk > 0 ) ;
assert ( translation_size_on_disk > 0 ) ;
2013-04-16 23:57:18 -04:00
// printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, h->translated_blocknum_limit, h->block_translation_address_on_disk);
2013-04-16 23:57:47 -04:00
//Load translation table
{
lock_for_pwrite ( ) ;
unsigned char * XMALLOC_N ( translation_size_on_disk , tbuf ) ;
{
// This cast is messed up in 32-bits if the block translation table is ever more than 4GB. But in that case, the translation table itself won't fit in main memory.
ssize_t r = pread ( fd , tbuf , translation_size_on_disk , translation_address_on_disk ) ;
assert ( r = = translation_size_on_disk ) ;
}
2013-04-16 23:57:47 -04:00
unlock_for_pwrite ( ) ;
2013-04-16 23:57:47 -04:00
// Create table and read in data.
toku_blocktable_create_from_buffer ( & h - > blocktable ,
translation_address_on_disk ,
translation_size_on_disk ,
tbuf ) ;
toku_free ( tbuf ) ;
2013-04-16 23:57:18 -04:00
}
2013-04-16 23:57:47 -04:00
2013-04-16 23:57:47 -04:00
h - > root = rbuf_blocknum ( & rc ) ;
h - > root_hash . valid = FALSE ;
h - > flags = rbuf_int ( & rc ) ;
2013-04-16 23:57:47 -04:00
deserialize_descriptor_from ( fd , h , & h - > descriptor ) ;
2013-04-16 23:57:47 -04:00
( void ) rbuf_int ( & rc ) ; //Read in checksum and ignore (already verified).
2013-04-16 23:57:47 -04:00
if ( rc . ndone ! = rc . size ) { ret = EINVAL ; goto died1 ; }
2008-05-22 21:28:00 +00:00
toku_free ( rc . buf ) ;
2013-04-16 23:57:47 -04:00
rc . buf = NULL ;
2008-05-22 21:28:00 +00:00
* brth = h ;
return 0 ;
}
2013-04-16 23:57:47 -04:00
// Effect: Read the brt header stored at OFFSET in FD into RB and sanity-check it.
//  On success (return 0) rb->buf holds the whole header (caller must free it), rb->ndone is
//  positioned just after the magic and size fields, and *checkpoint_count is the header's
//  checkpoint count.
//  On any failure rb->buf is NULL and nothing needs to be freed.
// Returns:
//   0                           header is usable
//   -1                          we can overwrite everything in the file AND the header is useless
//                               (nothing there, leading bytes are zero, or bad checksum)
//   TOKUDB_DICTIONARY_TOO_OLD   layout version is not BRT_LAYOUT_VERSION_10
//   EINVAL                      not a tokudb file, implausible/short header, or wrong byte order
//   ENOMEM / errno              allocation or read failure
static int
deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *rb, u_int64_t *checkpoint_count) {
    enum {
        prefix_size      = 8    // magic ("tokudata")
                         + 4,   // size
        // Smallest header from which the version can be read.
        min_versioned    = prefix_size + 4,
        // Smallest header for which every field examined below lies inside the buffer:
        // prefix, version, byte order check, checkpoint_count, checksum.
        min_checked_size = prefix_size + 4 + 8 + 8 + 4
    };
    int r = 0;
    char prefix[prefix_size];
    rb->buf = NULL;

    int64_t n = pread(fd, prefix, prefix_size, offset);
    if (n==0) return -1;
    if (n<0)  return errno;
    if (n!=prefix_size) return EINVAL;
    if (memcmp(prefix, "tokudata", 8)!=0) {
        u_int64_t leading;
        memcpy(&leading, prefix, sizeof(leading));
        if (leading==0) return -1;  // Could be a tokudb file but header never written
        return EINVAL;              // Not a tokudb file! Do not use.
    }

    // It's version 7 or later, and the magic looks OK
    // Size must be stored in network order regardless of DISK_ORDER
    u_int32_t size_on_disk;
    memcpy(&size_on_disk, prefix+8, sizeof(size_on_disk));
    u_int32_t size = toku_ntohl(size_on_disk);
    // The size comes straight from disk: reject values too small to even hold the version.
    if (size < (u_int32_t)min_versioned) return EINVAL;

    rb->size  = size;
    rb->ndone = prefix_size;
    rb->buf   = toku_malloc(rb->size);
    if (!rb->buf) return ENOMEM;

    n = pread(fd, rb->buf, rb->size, offset);
    if (n!=(int64_t)size) r = EINVAL; // Header might be useless (wrong size) or could be an error.
    if (r==0) {
        // check version (before checksum, since older versions didn't have checksums)
        int version = rbuf_network_int(rb);
        if (version!=BRT_LAYOUT_VERSION_10) r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use
    }
    if (r==0 && size < (u_int32_t)min_checked_size) {
        r = EINVAL; // too short for the remaining checks to stay inside the buffer
    }
    if (r==0) {
        u_int32_t calculated_x1764 = x1764_memory(rb->buf, size-4);
        u_int32_t stored_raw;
        memcpy(&stored_raw, rb->buf+size-4, sizeof(stored_raw));
        u_int32_t stored_x1764 = toku_dtoh32(stored_raw);
        if (calculated_x1764!=stored_x1764) r = -1; // Header useless
    }
    if (r==0) {
        // Verify byte order
        bytevec tmp_byte_order_check;
        rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); // Must not translate byte order
        int64_t byte_order_stored;
        memcpy(&byte_order_stored, tmp_byte_order_check, sizeof(byte_order_stored));
        if (byte_order_stored != toku_byte_order_host) r = EINVAL; // Cannot use
    }
    if (r==0) {
        *checkpoint_count = rbuf_ulonglong(rb);
        // Restart after 'size'
        rb->ndone = prefix_size;
    }
    if (r!=0) {
        toku_free(rb->buf);
        rb->buf = NULL;   // do not hand a dangling pointer back to the caller
    }
    return r;
}
//TODO:
// * read in whole thing, do checksum
// * switch to using rbuf
// * read in size, version, LSN and checkpoint count (pre)
// * read in LSN and checkpoint count to save in header object
int toku_deserialize_brtheader_from ( int fd , struct brt_header * * brth ) {
struct rbuf rb_0 ;
struct rbuf rb_1 ;
u_int64_t checkpoint_count_0 ;
u_int64_t checkpoint_count_1 ;
int r0 ;
int r1 ;
{
toku_off_t header_0_off = 0 ;
r0 = deserialize_brtheader_from_fd_into_rbuf ( fd , header_0_off , & rb_0 , & checkpoint_count_0 ) ;
}
{
toku_off_t header_1_off = BLOCK_ALLOCATOR_HEADER_RESERVE ;
r1 = deserialize_brtheader_from_fd_into_rbuf ( fd , header_1_off , & rb_1 , & checkpoint_count_1 ) ;
}
struct rbuf * rb = NULL ;
if ( r0 = = 0 ) rb = & rb_0 ;
if ( r1 = = 0 & & ( r0 ! = 0 | | checkpoint_count_1 > checkpoint_count_0 ) ) rb = & rb_1 ;
int r = 0 ;
if ( rb = = NULL ) {
r = r0 ;
2013-04-16 23:57:47 -04:00
if ( r1 = = TOKUDB_DICTIONARY_TOO_OLD ) r = r1 ;
2013-04-16 23:57:47 -04:00
assert ( r ! = 0 ) ;
}
if ( r = = 0 ) r = deserialize_brtheader ( fd , rb , brth ) ;
2013-04-16 23:57:47 -04:00
if ( r0 = = 0 & & rb_0 . buf ) toku_free ( rb_0 . buf ) ;
if ( r1 = = 0 & & rb_1 . buf ) toku_free ( rb_1 . buf ) ;
2013-04-16 23:57:47 -04:00
return r ;
2008-05-22 21:28:00 +00:00
}
2007-12-06 14:20:47 +00:00
// Effect: Return the length of pivot key PK as counted for BRT:
//  key length plus value length when the brt has TOKU_DB_DUPSORT set, key length alone otherwise.
unsigned int toku_brt_pivot_key_len (BRT brt, struct kv_pair *pk) {
    int counts_value = (brt->flags & TOKU_DB_DUPSORT) != 0;
    return counts_value ? kv_pair_keylen(pk) + kv_pair_vallen(pk)
                        : kv_pair_keylen(pk);
}
// Effect: Return the length of pivot key PK as counted for NODE:
//  key length plus value length when the node has TOKU_DB_DUPSORT set, key length alone otherwise.
unsigned int toku_brtnode_pivot_key_len (BRTNODE node, struct kv_pair *pk) {
    int counts_value = (node->flags & TOKU_DB_DUPSORT) != 0;
    return counts_value ? kv_pair_keylen(pk) + kv_pair_vallen(pk)
                        : kv_pair_keylen(pk);
}
2008-04-09 02:45:27 +00:00
2013-04-16 23:57:19 -04:00
static int
2013-04-16 23:57:27 -04:00
read_int ( int fd , toku_off_t * at , u_int32_t * result ) {
2008-04-09 02:45:27 +00:00
int v ;
ssize_t r = pread ( fd , & v , 4 , * at ) ;
if ( r < 0 ) return errno ;
assert ( r = = 4 ) ;
2013-04-16 23:57:46 -04:00
* result = toku_dtoh32 ( v ) ;
2008-04-09 02:45:27 +00:00
( * at ) + = 4 ;
return 0 ;
}
2013-04-16 23:57:48 -04:00
static int read_u_int64_t UU ( ( int fd , toku_off_t * at , u_int64_t * result ) ) ;
2013-04-16 23:57:19 -04:00
static int
2013-04-16 23:57:27 -04:00
read_u_int64_t ( int fd , toku_off_t * at , u_int64_t * result ) {
2008-05-04 16:56:15 +00:00
u_int32_t v1 = 0 , v2 = 0 ;
2008-04-09 02:45:27 +00:00
int r ;
if ( ( r = read_int ( fd , at , & v1 ) ) ) return r ;
if ( ( r = read_int ( fd , at , & v2 ) ) ) return r ;
* result = ( ( ( u_int64_t ) v1 ) < < 32 ) + v2 ;
return 0 ;
}
2013-04-16 23:57:33 -04:00
// Returns DB_BADFORMAT.
// The deserialization code calls this function wherever it detects malformed data,
// instead of using the constant directly.
int toku_db_badformat(void) {
    return DB_BADFORMAT;
}