2013-04-16 23:57:47 -04:00
/* -*- mode: C; c-basic-offset: 4 -*- */
# ident "$Id$"
2013-04-16 23:59:09 -04:00
# ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
2013-04-16 23:57:48 -04:00
# ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11 / 760379 and to the patents and / or patent applications resulting from it."
2013-04-16 23:57:41 -04:00
2013-04-16 23:57:53 -04:00
# include <toku_portability.h>
2013-04-16 23:57:47 -04:00
# include "brt-internal.h" // ugly but pragmatic, need access to dirty bits while holding translation lock
2013-04-16 23:57:41 -04:00
# include "brttypes.h"
# include "block_table.h"
# include "memory.h"
# include "toku_assert.h"
# include "toku_pthread.h"
# include "block_allocator.h"
# include "rbuf.h"
# include "wbuf.h"
2013-04-16 23:57:47 -04:00
//When the translation (btt) is stored on disk:
// In Header:
// size_on_disk
// location_on_disk
// In block translation table (in order):
// smallest_never_used_blocknum
// blocknum_freelist_head
// array
// a checksum
/* The BTT (block translation table): per-blocknum disk location and size. */
struct translation {
    enum translation_type type;
    // Number of elements in block_translation.
    // Invariant (asserted throughout this file): length_of_array >= smallest_never_used_blocknum.b
    int64_t length_of_array;
    BLOCKNUM smallest_never_used_blocknum;
    // Head of the free list: the next previously used, currently unused blocknum.
    BLOCKNUM blocknum_freelist_head;
    struct block_translation_pair *block_translation;
    // Where the translation vector itself is stored on disk, and how big it is:
    //   size_on_disk     is block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
    //   location_on_disk is block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
};
// Sentinel values stored in a struct block_translation_pair.

// In a free list, marks the end of the list.
static const BLOCKNUM freelist_null = {-1};
// Value of block_translation_pair.size when the blocknum is unused.
static const DISKOFF size_is_free = (DISKOFF)-1;
// Value of block_translation_pair.u.diskoff when the blocknum is used
// but does not yet have a disk block.
static const DISKOFF diskoff_unused = (DISKOFF)-2;
/********
 *  There are three copies of the translation table (btt) in the block table:
 *
 *    checkpointed   Is initialized by deserializing from disk,
 *                   and is the only version ever read from disk.
 *                   When read from disk it is copied to current.
 *                   It is immutable. It can be replaced by an inprogress btt.
 *
 *    inprogress     Is only filled by copying from current,
 *                   and is the only version ever serialized to disk.
 *                   (It is serialized to disk on checkpoint and clean shutdown.)
 *                   At end of checkpoint it replaces 'checkpointed'.
 *                   During a checkpoint, any 'pending' dirty writes will update
 *                   inprogress.
 *
 *    current        Is initialized by copying from checkpointed,
 *                   is the only version ever modified while the database is in use,
 *                   and is the only version ever copied to inprogress.
 *                   It is never stored on disk.
 ********/
struct block_table {
struct translation current ; // The current translation is the one used by client threads. It is not represented on disk.
struct translation inprogress ; // the translation used by the checkpoint currently in progress. If the checkpoint thread allocates a block, it must also update the current translation.
struct translation checkpointed ; // the translation for the data that shall remain inviolate on disk until the next checkpoint finishes, after which any blocks used only in this translation can be freed.
// The in-memory data structure for block allocation. There is no on-disk data structure for block allocation.
// Note: This is *allocation* not *translation*. The block_allocator is unaware of which blocks are used for which translation, but simply allocates and deallocates blocks.
2013-04-16 23:57:41 -04:00
BLOCK_ALLOCATOR block_allocator ;
2013-04-16 23:57:44 -04:00
toku_pthread_mutex_t mutex ;
int is_locked ;
2013-04-16 23:57:47 -04:00
BOOL checkpoint_skipped ;
BOOL checkpoint_failed ;
2013-04-16 23:57:41 -04:00
} ;
2013-04-16 23:57:47 -04:00
//forward decls
static int64_t
calculate_size_on_disk(struct translation *t);

static inline BOOL
translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair);

static inline void
lock_for_blocktable(BLOCK_TABLE bt);

static inline void
unlock_for_blocktable(BLOCK_TABLE bt);
// Mark a header dirty. The block table must be locked and h must be the CURRENT header.
// When for_checkpoint is set, the in-progress checkpoint header is marked
// instead of h itself.
static void
brtheader_set_dirty(struct brt_header *h, BOOL for_checkpoint) {
    assert(h->blocktable->is_locked);
    assert(h->type == BRTHEADER_CURRENT);
    if (!for_checkpoint) {
        h->dirty = 1;
        return;
    }
    assert(h->checkpoint_header->type == BRTHEADER_CHECKPOINT_INPROGRESS);
    h->checkpoint_header->dirty = 1;
}
2013-04-16 23:59:02 -04:00
//fd is protected (must be holding fdlock)
2013-04-16 23:57:47 -04:00
static void
2013-04-16 23:59:02 -04:00
maybe_truncate_cachefile ( BLOCK_TABLE bt , int fd , struct brt_header * h , u_int64_t size_needed_before ) {
2013-04-16 23:57:47 -04:00
assert ( bt - > is_locked ) ;
u_int64_t new_size_needed = block_allocator_allocated_limit ( bt - > block_allocator ) ;
//Save a call to toku_os_get_file_size (kernel call) if unlikely to be useful.
if ( new_size_needed < size_needed_before )
2013-04-16 23:59:02 -04:00
toku_maybe_truncate_cachefile ( h - > cf , fd , new_size_needed ) ;
2013-04-16 23:57:47 -04:00
}
2013-04-16 23:59:02 -04:00
//fd is protected (must be holding fdlock)
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:59:02 -04:00
toku_maybe_truncate_cachefile_on_open ( BLOCK_TABLE bt , int fd , struct brt_header * h ) {
2013-04-16 23:57:47 -04:00
lock_for_blocktable ( bt ) ;
u_int64_t size_needed = block_allocator_allocated_limit ( bt - > block_allocator ) ;
2013-04-16 23:59:02 -04:00
toku_maybe_truncate_cachefile ( h - > cf , fd , size_needed ) ;
2013-04-16 23:57:47 -04:00
unlock_for_blocktable ( bt ) ;
}
2013-04-16 23:57:41 -04:00
static void
2013-04-16 23:57:47 -04:00
copy_translation ( struct translation * dst , struct translation * src , enum translation_type newtype ) {
assert ( src - > length_of_array > = src - > smallest_never_used_blocknum . b ) ; //verify invariant
assert ( newtype = = TRANSLATION_DEBUG | |
( src - > type = = TRANSLATION_CURRENT & & newtype = = TRANSLATION_INPROGRESS ) | |
( src - > type = = TRANSLATION_CHECKPOINTED & & newtype = = TRANSLATION_CURRENT ) ) ;
dst - > type = newtype ;
dst - > smallest_never_used_blocknum = src - > smallest_never_used_blocknum ;
dst - > blocknum_freelist_head = src - > blocknum_freelist_head ;
// destination btt is of fixed size. Allocate+memcpy the exact length necessary.
dst - > length_of_array = dst - > smallest_never_used_blocknum . b ;
XMALLOC_N ( dst - > length_of_array , dst - > block_translation ) ;
memcpy ( dst - > block_translation ,
src - > block_translation ,
dst - > length_of_array * sizeof ( * dst - > block_translation ) ) ;
//New version of btt is not yet stored on disk.
dst - > block_translation [ RESERVED_BLOCKNUM_TRANSLATION ] . size = 0 ;
dst - > block_translation [ RESERVED_BLOCKNUM_TRANSLATION ] . u . diskoff = diskoff_unused ;
}
2013-04-16 23:59:17 -04:00
// Count the non-reserved blocknums of the current translation whose size is
// not size_is_free.
int64_t
toku_block_get_blocks_in_use_unlocked(BLOCK_TABLE bt) {
    struct translation *cur = &bt->current;
    int64_t in_use = 0;
    int64_t blk;
    // Reserved blocknums are skipped: the scan starts at RESERVED_BLOCKNUMS.
    for (blk = RESERVED_BLOCKNUMS; blk < cur->smallest_never_used_blocknum.b; blk++) {
        if (cur->block_translation[blk].size == size_is_free) {
            continue;
        }
        in_use++;
    }
    return in_use;
}
2013-04-16 23:59:04 -04:00
static void
maybe_optimize_translation ( struct translation * t ) {
//Reduce 'smallest_never_used_blocknum.b' (completely free blocknums instead of just
//on a free list. Doing so requires us to regenerate the free list.
//This is O(n) work, so do it only if you're already doing that.
BLOCKNUM b ;
assert ( t - > smallest_never_used_blocknum . b > = RESERVED_BLOCKNUMS ) ;
//Calculate how large the free suffix is.
int64_t freed ;
{
for ( b . b = t - > smallest_never_used_blocknum . b ; b . b > RESERVED_BLOCKNUMS ; b . b - - ) {
if ( t - > block_translation [ b . b - 1 ] . size ! = size_is_free ) {
break ;
}
}
freed = t - > smallest_never_used_blocknum . b - b . b ;
}
if ( freed > 0 ) {
t - > smallest_never_used_blocknum . b = b . b ;
if ( t - > length_of_array / 4 > t - > smallest_never_used_blocknum . b ) {
//We're using more memory than necessary to represent this now. Reduce.
u_int64_t new_length = t - > smallest_never_used_blocknum . b * 2 ;
XREALLOC_N ( new_length , t - > block_translation ) ;
t - > length_of_array = new_length ;
//No need to zero anything out.
}
//Regenerate free list.
t - > blocknum_freelist_head . b = freelist_null . b ;
for ( b . b = RESERVED_BLOCKNUMS ; b . b < t - > smallest_never_used_blocknum . b ; b . b + + ) {
if ( t - > block_translation [ b . b ] . size = = size_is_free ) {
t - > block_translation [ b . b ] . u . next_free_blocknum = t - > blocknum_freelist_head ;
t - > blocknum_freelist_head = b ;
}
}
}
}
2013-04-16 23:57:47 -04:00
// block table must be locked by caller of this function
// Begin a checkpoint: snapshot the current translation into inprogress and
// clear the skipped/failed flags. The caller must hold the block table lock.
void
toku_block_translation_note_start_checkpoint_unlocked(BLOCK_TABLE bt) {
    struct translation *cur = &bt->current;
    struct translation *snapshot = &bt->inprogress;

    assert(bt->is_locked);
    // There must be no checkpoint already in progress.
    assert(snapshot->block_translation == NULL);

    // Copying is O(n) anyway, so the O(n) optimization pass is affordable here.
    maybe_optimize_translation(cur);
    copy_translation(snapshot, cur, TRANSLATION_INPROGRESS);

    bt->checkpoint_skipped = FALSE;
    bt->checkpoint_failed = FALSE;
}
//#define PRNTF(str, b, siz, ad, bt) printf("%s[%d] %s %"PRId64" %"PRId64" %"PRId64"\n", __FUNCTION__, __LINE__, str, b, siz, ad); fflush(stdout); if (bt) block_allocator_validate(((BLOCK_TABLE)(bt))->block_allocator);
//Debugging function
// Defined with an empty expansion, so every PRNTF(...) call in this file compiles to nothing.
# define PRNTF(str, b, siz, ad, bt)
// Record, under the block table lock, that the checkpoint in progress failed.
// A checkpoint must be in progress.
void
toku_block_translation_note_failed_checkpoint(BLOCK_TABLE bt) {
    lock_for_blocktable(bt);
    {
        assert(bt->inprogress.block_translation);
        bt->checkpoint_failed = TRUE;
    }
    unlock_for_blocktable(bt);
}
// Record, under the block table lock, that the checkpoint in progress was
// skipped (e.g. for a non-dirty header). A checkpoint must be in progress.
void
toku_block_translation_note_skipped_checkpoint(BLOCK_TABLE bt) {
    lock_for_blocktable(bt);
    {
        assert(bt->inprogress.block_translation);
        bt->checkpoint_skipped = TRUE;
    }
    unlock_for_blocktable(bt);
}
// Discard the inprogress translation. Every disk block it references that is
// referenced by neither current nor checkpointed is returned to the allocator;
// the inprogress array is then freed and the struct zeroed.
static void
cleanup_failed_checkpoint(BLOCK_TABLE bt) {
    struct translation *doomed = &bt->inprogress;
    int64_t blk;
    for (blk = 0; blk < doomed->length_of_array; blk++) {
        struct block_translation_pair *entry = &doomed->block_translation[blk];
        if (entry->size <= 0) {
            continue;   // no disk block behind this blocknum
        }
        if (translation_prevents_freeing(&bt->current, make_blocknum(blk), entry)) {
            continue;
        }
        if (translation_prevents_freeing(&bt->checkpointed, make_blocknum(blk), entry)) {
            continue;
        }
        PRNTF(" free ", blk, entry->size, entry->u.diskoff, bt);
        block_allocator_free_block(bt->block_allocator, entry->u.diskoff);
    }
    toku_free(doomed->block_translation);
    memset(doomed, 0, sizeof(*doomed));
}
// Purpose: free disk space used by previous checkpoint, unless still in use by current.
// capture inprogress as new checkpointed.
// For each entry in checkpointBTT
// if offset does not match offset in inprogress
// assert offset does not match offset in current
// free (offset,len) from checkpoint
// move inprogress to checkpoint (resetting type)
// inprogress = NULL
2013-04-16 23:59:02 -04:00
//fd is protected (must be holding fdlock)
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:59:02 -04:00
toku_block_translation_note_end_checkpoint ( BLOCK_TABLE bt , int fd , struct brt_header * h ) {
2013-04-16 23:57:47 -04:00
// Free unused blocks
lock_for_blocktable ( bt ) ;
2013-04-16 23:57:47 -04:00
u_int64_t allocated_limit_at_start = block_allocator_allocated_limit ( bt - > block_allocator ) ;
2013-04-16 23:57:47 -04:00
assert ( bt - > inprogress . block_translation ) ;
if ( bt - > checkpoint_skipped | | bt - > checkpoint_failed ) {
cleanup_failed_checkpoint ( bt ) ;
goto end ;
}
//Make certain inprogress was allocated space on disk
assert ( bt - > inprogress . block_translation [ RESERVED_BLOCKNUM_TRANSLATION ] . size > 0 ) ;
assert ( bt - > inprogress . block_translation [ RESERVED_BLOCKNUM_TRANSLATION ] . u . diskoff > 0 ) ;
int64_t i ;
struct translation * t = & bt - > checkpointed ;
for ( i = 0 ; i < t - > length_of_array ; i + + ) {
struct block_translation_pair * pair = & t - > block_translation [ i ] ;
if ( pair - > size > 0 & & ! translation_prevents_freeing ( & bt - > inprogress , make_blocknum ( i ) , pair ) ) {
assert ( ! translation_prevents_freeing ( & bt - > current , make_blocknum ( i ) , pair ) ) ;
PRNTF ( " free " , i , pair - > size , pair - > u . diskoff , bt ) ;
block_allocator_free_block ( bt - > block_allocator , pair - > u . diskoff ) ;
}
}
toku_free ( bt - > checkpointed . block_translation ) ;
bt - > checkpointed = bt - > inprogress ;
bt - > checkpointed . type = TRANSLATION_CHECKPOINTED ;
memset ( & bt - > inprogress , 0 , sizeof ( bt - > inprogress ) ) ;
2013-04-16 23:59:02 -04:00
maybe_truncate_cachefile ( bt , fd , h , allocated_limit_at_start ) ;
2013-04-16 23:57:47 -04:00
end :
unlock_for_blocktable ( bt ) ;
}
// Assert that b lies in [0, smallest_never_used_blocknum) of t.
static inline void
verify_valid_blocknum(struct translation *t, BLOCKNUM b) {
    const int64_t blk = b.b;
    assert(blk >= 0);
    assert(blk < t->smallest_never_used_blocknum.b);
    // Sanity check: the array must cover every blocknum ever handed out.
    assert(t->length_of_array >= t->smallest_never_used_blocknum.b);
}
2013-04-16 23:57:47 -04:00
//Can be freed
2013-04-16 23:57:41 -04:00
static inline void
2013-04-16 23:57:47 -04:00
verify_valid_freeable_blocknum ( struct translation * t , BLOCKNUM b ) {
assert ( t - > type = = TRANSLATION_CURRENT ) ;
assert ( b . b > = RESERVED_BLOCKNUMS ) ;
assert ( b . b < t - > smallest_never_used_blocknum . b ) ;
//Sanity check: Verify invariant
assert ( t - > length_of_array > = t - > smallest_never_used_blocknum . b ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:44 -04:00
static void
2013-04-16 23:57:47 -04:00
blocktable_lock_init ( BLOCK_TABLE bt ) {
2013-04-16 23:57:44 -04:00
memset ( & bt - > mutex , 0 , sizeof ( bt - > mutex ) ) ;
int r = toku_pthread_mutex_init ( & bt - > mutex , NULL ) ; assert ( r = = 0 ) ;
bt - > is_locked = 0 ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:44 -04:00
static void
2013-04-16 23:57:47 -04:00
blocktable_lock_destroy ( BLOCK_TABLE bt ) {
2013-04-16 23:57:44 -04:00
int r = toku_pthread_mutex_destroy ( & bt - > mutex ) ; assert ( r = = 0 ) ;
2013-04-16 23:57:41 -04:00
}
static inline void
2013-04-16 23:57:47 -04:00
lock_for_blocktable ( BLOCK_TABLE bt ) {
2013-04-16 23:57:41 -04:00
// Locks the blocktable_mutex.
2013-04-16 23:57:44 -04:00
int r = toku_pthread_mutex_lock ( & bt - > mutex ) ;
2013-04-16 23:57:41 -04:00
assert ( r = = 0 ) ;
2013-04-16 23:57:44 -04:00
bt - > is_locked = 1 ;
2013-04-16 23:57:41 -04:00
}
static inline void
2013-04-16 23:57:47 -04:00
unlock_for_blocktable ( BLOCK_TABLE bt ) {
2013-04-16 23:57:44 -04:00
bt - > is_locked = 0 ;
int r = toku_pthread_mutex_unlock ( & bt - > mutex ) ;
2013-04-16 23:57:41 -04:00
assert ( r = = 0 ) ;
}
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:57:51 -04:00
toku_brtheader_lock ( struct brt_header * h ) {
BLOCK_TABLE bt = h - > blocktable ;
2013-04-16 23:57:47 -04:00
lock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:57:51 -04:00
toku_brtheader_unlock ( struct brt_header * h ) {
BLOCK_TABLE bt = h - > blocktable ;
2013-04-16 23:57:47 -04:00
assert ( bt - > is_locked ) ;
unlock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
// This is a special debugging function used only in the brt-serialize-test.
2013-04-16 23:57:41 -04:00
// Allocate `size` bytes straight from the block allocator, under the block
// table lock, and store the resulting offset in *offset.
// No translation table is updated.
void
toku_block_alloc(BLOCK_TABLE bt, u_int64_t size, u_int64_t *offset) {
    lock_for_blocktable(bt);
    {
        PRNTF(" allocSomethingUnknown ", 0L, (int64_t)size, 0L, bt);
        block_allocator_alloc_block(bt->block_allocator, size, offset);
        PRNTF(" allocSomethingUnknownd ", 0L, (int64_t)size, (int64_t)*offset, bt);
    }
    unlock_for_blocktable(bt);
}
2013-04-16 23:57:47 -04:00
// Also used only in brt-serialize-test.
2013-04-16 23:57:41 -04:00
// Return the block at `offset` straight to the block allocator, under the
// block table lock. No translation table is updated.
void
toku_block_free(BLOCK_TABLE bt, u_int64_t offset) {
    lock_for_blocktable(bt);
    {
        PRNTF(" freeSOMETHINGunknown ", 0L, 0L, offset, bt);
        block_allocator_free_block(bt->block_allocator, offset);
    }
    unlock_for_blocktable(bt);
}
2013-04-16 23:57:47 -04:00
// Number of bytes needed to serialize translation t:
// two 8-byte blocknums, 16 bytes per array entry, and a 4-byte checksum.
static int64_t
calculate_size_on_disk(struct translation *t) {
    int64_t total = 8;                                  // smallest_never_used_blocknum
    total += 8;                                         // blocknum_freelist_head
    total += t->smallest_never_used_blocknum.b * 16;    // array
    total += 4;                                         // checksum
    return total;
}
2013-04-16 23:57:47 -04:00
// We cannot free the disk space allocated to this blocknum if it is still in use by the given translation table.
// Returns true when translation t exists, covers blocknum b, and maps b to the
// same disk offset as old_pair. In that case the disk space cannot be freed.
static inline BOOL
translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair) {
    if (!t->block_translation) {
        return (BOOL)0;     // translation does not exist
    }
    if (!(b.b < t->smallest_never_used_blocknum.b)) {
        return (BOOL)0;     // blocknum is outside this translation
    }
    return (BOOL)(old_pair->u.diskoff == t->block_translation[b.b].u.diskoff);
}
2013-04-16 23:57:47 -04:00
static void
2013-04-16 23:57:47 -04:00
blocknum_realloc_on_disk_internal ( BLOCK_TABLE bt , BLOCKNUM b , DISKOFF size , DISKOFF * offset , struct brt_header * h , BOOL for_checkpoint ) {
2013-04-16 23:57:44 -04:00
assert ( bt - > is_locked ) ;
2013-04-16 23:57:47 -04:00
brtheader_set_dirty ( h , for_checkpoint ) ;
struct translation * t = & bt - > current ;
struct block_translation_pair old_pair = t - > block_translation [ b . b ] ;
PRNTF ( " old " , b . b , old_pair . size , old_pair . u . diskoff , bt ) ;
//Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint
2013-04-16 23:57:54 -04:00
BOOL cannot_free = ( BOOL )
( ( ! for_checkpoint & & translation_prevents_freeing ( & bt - > inprogress , b , & old_pair ) ) | |
translation_prevents_freeing ( & bt - > checkpointed , b , & old_pair ) ) ;
2013-04-16 23:57:47 -04:00
if ( ! cannot_free & & old_pair . u . diskoff ! = diskoff_unused ) {
PRNTF ( " Freed " , b . b , old_pair . size , old_pair . u . diskoff , bt ) ;
block_allocator_free_block ( bt - > block_allocator , old_pair . u . diskoff ) ;
}
2013-04-16 23:57:41 -04:00
2013-04-16 23:57:47 -04:00
u_int64_t allocator_offset ;
//Allocate a new block
block_allocator_alloc_block ( bt - > block_allocator , size , & allocator_offset ) ;
t - > block_translation [ b . b ] . u . diskoff = allocator_offset ;
t - > block_translation [ b . b ] . size = size ;
* offset = allocator_offset ;
PRNTF ( " New " , b . b , t - > block_translation [ b . b ] . size , t - > block_translation [ b . b ] . u . diskoff , bt ) ;
//Update inprogress btt if appropriate (if called because Pending bit is set).
if ( for_checkpoint ) {
assert ( b . b < bt - > inprogress . length_of_array ) ;
bt - > inprogress . block_translation [ b . b ] = t - > block_translation [ b . b ] ;
2013-04-16 23:57:41 -04:00
}
}
void
2013-04-16 23:57:47 -04:00
toku_blocknum_realloc_on_disk ( BLOCK_TABLE bt , BLOCKNUM b , DISKOFF size , DISKOFF * offset , struct brt_header * h , BOOL for_checkpoint ) {
lock_for_blocktable ( bt ) ;
2013-04-16 23:57:47 -04:00
struct translation * t = & bt - > current ;
verify_valid_freeable_blocknum ( t , b ) ;
blocknum_realloc_on_disk_internal ( bt , b , size , offset , h , for_checkpoint ) ;
2013-04-16 23:57:47 -04:00
unlock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
// Purpose of this function is to figure out where to put the inprogress btt on disk, allocate space for it there.
// Allocate the disk space that will hold the serialized inprogress translation
// and record it in the RESERVED_BLOCKNUM_TRANSLATION entry.
// The block table must be locked.
static void
blocknum_alloc_translation_on_disk_unlocked(BLOCK_TABLE bt) {
    assert(bt->is_locked);

    struct translation *snapshot = &bt->inprogress;
    assert(snapshot->block_translation);

    struct block_translation_pair *btt_entry = &snapshot->block_translation[RESERVED_BLOCKNUM_TRANSLATION];
    // Each inprogress translation is allocated only once.
    assert(btt_entry->size == 0 && btt_entry->u.diskoff == diskoff_unused);

    int64_t bytes_needed = calculate_size_on_disk(snapshot);
    u_int64_t where;
    block_allocator_alloc_block(bt->block_allocator, bytes_needed, &where);
    PRNTF(" blokAllokator ", 1L, bytes_needed, where, bt);
    btt_entry->u.diskoff = where;
    btt_entry->size = bytes_needed;
}
//Fills wbuf with bt
//A clean shutdown runs checkpoint start so that current and inprogress are copies.
2013-04-16 23:57:41 -04:00
void
2013-04-16 23:57:47 -04:00
toku_serialize_translation_to_wbuf_unlocked ( BLOCK_TABLE bt , struct wbuf * w ,
int64_t * address , int64_t * size ) {
2013-04-16 23:57:44 -04:00
assert ( bt - > is_locked ) ;
2013-04-16 23:57:47 -04:00
struct translation * t = & bt - > inprogress ;
BLOCKNUM b = make_blocknum ( RESERVED_BLOCKNUM_TRANSLATION ) ;
blocknum_alloc_translation_on_disk_unlocked ( bt ) ;
{
//Init wbuf
u_int64_t size_translation = calculate_size_on_disk ( t ) ;
assert ( ( int64_t ) size_translation = = t - > block_translation [ b . b ] . size ) ;
if ( 0 )
2013-04-16 23:57:47 -04:00
printf ( " %s:%d writing translation table of size_translation % " PRIu64 " at % " PRId64 " \n " , __FILE__ , __LINE__ , size_translation , t - > block_translation [ b . b ] . u . diskoff ) ;
2013-04-16 23:57:47 -04:00
wbuf_init ( w , toku_malloc ( size_translation ) , size_translation ) ;
assert ( w - > size = = size_translation ) ;
}
wbuf_BLOCKNUM ( w , t - > smallest_never_used_blocknum ) ;
wbuf_BLOCKNUM ( w , t - > blocknum_freelist_head ) ;
int64_t i ;
for ( i = 0 ; i < t - > smallest_never_used_blocknum . b ; i + + ) {
if ( 0 )
2013-04-16 23:57:47 -04:00
printf ( " %s:%d % " PRId64 " ,% " PRId64 " \n " , __FILE__ , __LINE__ , t - > block_translation [ i ] . u . diskoff , t - > block_translation [ i ] . size ) ;
2013-04-16 23:57:47 -04:00
wbuf_DISKOFF ( w , t - > block_translation [ i ] . u . diskoff ) ;
wbuf_DISKOFF ( w , t - > block_translation [ i ] . size ) ;
2013-04-16 23:57:41 -04:00
}
u_int32_t checksum = x1764_finish ( & w - > checksum ) ;
wbuf_int ( w , checksum ) ;
2013-04-16 23:57:47 -04:00
* address = t - > block_translation [ b . b ] . u . diskoff ;
* size = t - > block_translation [ b . b ] . size ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?)
2013-04-16 23:57:47 -04:00
// Look up blocknum b in the current translation. Either output pointer may be
// NULL, in which case that value is not reported.
static void
translate_blocknum_to_offset_size_unlocked(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF *offset, DISKOFF *size) {
    struct translation *cur = &bt->current;
    verify_valid_blocknum(cur, b);
    struct block_translation_pair *entry = &cur->block_translation[b.b];
    if (offset) {
        *offset = entry->u.diskoff;
    }
    if (size) {
        *size = entry->size;
    }
}
// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?)
// Locked wrapper around translate_blocknum_to_offset_size_unlocked.
void
toku_translate_blocknum_to_offset_size(BLOCK_TABLE bt, BLOCKNUM b, DISKOFF *offset, DISKOFF *size) {
    lock_for_blocktable(bt);
    {
        translate_blocknum_to_offset_size_unlocked(bt, b, offset, size);
    }
    unlock_for_blocktable(bt);
}
2013-04-16 23:57:47 -04:00
//Only called by toku_allocate_blocknum
static void
maybe_expand_translation ( struct translation * t ) {
// Effect: expand the array to maintain size invariant
// given that one more never-used blocknum will soon be used.
if ( t - > length_of_array < = t - > smallest_never_used_blocknum . b ) {
//expansion is necessary
u_int64_t new_length = t - > smallest_never_used_blocknum . b * 2 ;
XREALLOC_N ( new_length , t - > block_translation ) ;
u_int64_t i ;
for ( i = t - > length_of_array ; i < new_length ; i + + ) {
t - > block_translation [ i ] . u . next_free_blocknum = freelist_null ;
2013-04-16 23:57:47 -04:00
t - > block_translation [ i ] . size = size_is_free ;
2013-04-16 23:57:47 -04:00
}
t - > length_of_array = new_length ;
}
2013-04-16 23:57:42 -04:00
}
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:57:47 -04:00
toku_allocate_blocknum_unlocked ( BLOCK_TABLE bt , BLOCKNUM * res , struct brt_header * h ) {
assert ( bt - > is_locked ) ;
2013-04-16 23:57:41 -04:00
BLOCKNUM result ;
2013-04-16 23:57:47 -04:00
struct translation * t = & bt - > current ;
if ( t - > blocknum_freelist_head . b = = freelist_null . b ) {
// no previously used blocknums are available
// use a never used blocknum
maybe_expand_translation ( t ) ; //Ensure a never used blocknums is available
result = t - > smallest_never_used_blocknum ;
t - > smallest_never_used_blocknum . b + + ;
} else { // reuse a previously used blocknum
result = t - > blocknum_freelist_head ;
BLOCKNUM next = t - > block_translation [ result . b ] . u . next_free_blocknum ;
t - > blocknum_freelist_head = next ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
//Verify the blocknum is free
assert ( t - > block_translation [ result . b ] . size = = size_is_free ) ;
//blocknum is not free anymore
t - > block_translation [ result . b ] . u . diskoff = diskoff_unused ;
t - > block_translation [ result . b ] . size = 0 ;
verify_valid_freeable_blocknum ( t , result ) ;
2013-04-16 23:57:41 -04:00
* res = result ;
2013-04-16 23:57:47 -04:00
brtheader_set_dirty ( h , FALSE ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:57:47 -04:00
toku_allocate_blocknum ( BLOCK_TABLE bt , BLOCKNUM * res , struct brt_header * h ) {
lock_for_blocktable ( bt ) ;
toku_allocate_blocknum_unlocked ( bt , res , h ) ;
unlock_for_blocktable ( bt ) ;
}
// Free blocknum *bp in the current translation and push it on the free list.
// The caller's copy is overwritten with 0.
// If the blocknum had a disk block that neither the inprogress nor the
// checkpointed translation still references, that block is freed too.
// The header is marked dirty. The block table must be locked.
static void
free_blocknum_unlocked(BLOCK_TABLE bt, BLOCKNUM *bp, struct brt_header *h) {
    assert(bt->is_locked);
    BLOCKNUM victim = *bp;
    bp->b = 0;   // Remove caller's reference.

    struct translation *cur = &bt->current;
    verify_valid_freeable_blocknum(cur, victim);

    struct block_translation_pair *entry = &cur->block_translation[victim.b];
    struct block_translation_pair previous = *entry;
    assert(previous.size != size_is_free);
    PRNTF(" free_blocknum ", victim.b, entry->size, entry->u.diskoff, bt);

    // Push the blocknum on the free list.
    entry->size = size_is_free;
    entry->u.next_free_blocknum = cur->blocknum_freelist_head;
    cur->blocknum_freelist_head = victim;

    if (previous.size <= 0) {
        // Size 0 means no disk block has ever been assigned to this blocknum.
        assert(previous.size == 0 && previous.u.diskoff == diskoff_unused);
    }
    else {
        // Free the old block unless the checkpoint in progress or the
        // previous checkpoint still uses it.
        BOOL still_referenced = (BOOL)
            (translation_prevents_freeing(&bt->inprogress, victim, &previous) ||
             translation_prevents_freeing(&bt->checkpointed, victim, &previous));
        if (!still_referenced) {
            PRNTF(" free_blocknum_free ", victim.b, previous.size, previous.u.diskoff, bt);
            block_allocator_free_block(bt->block_allocator, previous.u.diskoff);
        }
    }
    brtheader_set_dirty(h, FALSE);
}
// Locked wrapper around free_blocknum_unlocked.
void
toku_free_blocknum(BLOCK_TABLE bt, BLOCKNUM *bp, struct brt_header *h) {
    lock_for_blocktable(bt);
    {
        free_blocknum_unlocked(bt, bp, h);
    }
    unlock_for_blocktable(bt);
}
2013-04-16 23:57:47 -04:00
2013-04-16 23:59:02 -04:00
//fd is protected (must be holding fdlock)
2013-04-16 23:57:47 -04:00
void
2013-04-16 23:59:02 -04:00
toku_block_translation_truncate_unlocked ( BLOCK_TABLE bt , int fd , struct brt_header * h ) {
2013-04-16 23:57:47 -04:00
assert ( bt - > is_locked ) ;
2013-04-16 23:57:47 -04:00
u_int64_t allocated_limit_at_start = block_allocator_allocated_limit ( bt - > block_allocator ) ;
2013-04-16 23:57:47 -04:00
brtheader_set_dirty ( h , FALSE ) ;
2013-04-16 23:57:47 -04:00
//Free all regular/data blocks (non reserved)
//Meta data is stored in reserved blocks
2013-04-16 23:57:47 -04:00
struct translation * t = & bt - > current ;
int64_t i ;
for ( i = RESERVED_BLOCKNUMS ; i < t - > smallest_never_used_blocknum . b ; i + + ) {
BLOCKNUM b = make_blocknum ( i ) ;
2013-04-16 23:57:47 -04:00
if ( t - > block_translation [ i ] . size > = 0 ) free_blocknum_unlocked ( bt , & b , h ) ;
2013-04-16 23:57:47 -04:00
}
2013-04-16 23:59:02 -04:00
maybe_truncate_cachefile ( bt , fd , h , allocated_limit_at_start ) ;
2013-04-16 23:57:47 -04:00
}
2013-04-16 23:57:41 -04:00
//Verify there are no free blocks.
void
2013-04-16 23:57:47 -04:00
toku_block_verify_no_free_blocknums ( BLOCK_TABLE bt ) {
assert ( bt - > current . blocknum_freelist_head . b = = freelist_null . b ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:59:05 -04:00
//Verify there are no data blocks except root.
// Assert that root is the highest blocknum in use in the current translation
// and that every non-reserved blocknum below it is free.
// Relies on the checkpoint having run the translation optimization.
void
toku_block_verify_no_data_blocks_except_root_unlocked(BLOCK_TABLE bt, BLOCKNUM root) {
    struct translation *cur = &bt->current;
    assert(root.b >= RESERVED_BLOCKNUMS);
    assert(cur->smallest_never_used_blocknum.b == root.b + 1);
    int64_t blk = RESERVED_BLOCKNUMS;
    while (blk < root.b) {
        BLOCKNUM candidate = make_blocknum(blk);
        assert(cur->block_translation[candidate.b].size == size_is_free);
        blk++;
    }
}
2013-04-16 23:57:47 -04:00
//Verify a blocknum is currently allocated.
2013-04-16 23:57:41 -04:00
void
2013-04-16 23:57:47 -04:00
toku_verify_blocknum_allocated ( BLOCK_TABLE bt , BLOCKNUM b ) {
2013-04-16 23:57:44 -04:00
lock_for_blocktable ( bt ) ;
2013-04-16 23:57:47 -04:00
struct translation * t = & bt - > current ;
verify_valid_blocknum ( t , b ) ;
assert ( t - > block_translation [ b . b ] . size ! = size_is_free ) ;
2013-04-16 23:57:44 -04:00
unlock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
//Only used by toku_dump_translation table (debug info)
static void
dump_translation ( FILE * f , struct translation * t ) {
if ( t - > block_translation ) {
BLOCKNUM b = make_blocknum ( RESERVED_BLOCKNUM_TRANSLATION ) ;
2013-04-16 23:57:47 -04:00
fprintf ( f , " length_of_array[% " PRId64 " ] " , t - > length_of_array ) ;
2013-04-16 23:57:47 -04:00
fprintf ( f , " smallest_never_used_blocknum[% " PRId64 " ] " , t - > smallest_never_used_blocknum . b ) ;
fprintf ( f , " blocknum_free_list_head[% " PRId64 " ] " , t - > blocknum_freelist_head . b ) ;
2013-04-16 23:57:47 -04:00
fprintf ( f , " size_on_disk[% " PRId64 " ] " , t - > block_translation [ b . b ] . size ) ;
2013-04-16 23:58:54 -04:00
fprintf ( f , " location_on_disk[% " PRId64 " ] \n " , t - > block_translation [ b . b ] . u . diskoff ) ;
2013-04-16 23:57:47 -04:00
int64_t i ;
for ( i = 0 ; i < t - > length_of_array ; i + + ) {
2013-04-16 23:58:54 -04:00
fprintf ( f , " % " PRId64 " : % " PRId64 " % " PRId64 " \n " , i , t - > block_translation [ i ] . u . diskoff , t - > block_translation [ i ] . size ) ;
2013-04-16 23:57:47 -04:00
}
fprintf ( f , " \n " ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
else fprintf ( f , " does not exist \n " ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
//Only used by toku_brt_dump which is only for debugging purposes
2013-04-16 23:57:41 -04:00
void
2013-04-16 23:57:47 -04:00
toku_dump_translation_table ( FILE * f , BLOCK_TABLE bt ) {
2013-04-16 23:57:44 -04:00
lock_for_blocktable ( bt ) ;
2013-04-16 23:57:47 -04:00
fprintf ( f , " Current block translation: " ) ;
dump_translation ( f , & bt - > current ) ;
fprintf ( f , " Checkpoint in progress block translation: " ) ;
dump_translation ( f , & bt - > inprogress ) ;
fprintf ( f , " Checkpointed block translation: " ) ;
dump_translation ( f , & bt - > checkpointed ) ;
2013-04-16 23:57:44 -04:00
unlock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
//Only used by brtdump
2013-04-16 23:57:41 -04:00
void
2013-04-16 23:57:47 -04:00
toku_blocknum_dump_translation ( BLOCK_TABLE bt , BLOCKNUM b ) {
2013-04-16 23:57:44 -04:00
lock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
2013-04-16 23:57:47 -04:00
struct translation * t = & bt - > current ;
if ( b . b < t - > length_of_array ) {
struct block_translation_pair * bx = & t - > block_translation [ b . b ] ;
printf ( " % " PRId64 " : % " PRId64 " % " PRId64 " \n " , b . b , bx - > u . diskoff , bx - > size ) ;
}
2013-04-16 23:57:44 -04:00
unlock_for_blocktable ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:44 -04:00
//Must not call this function when anything else is using the blocktable.
//No one may use the blocktable afterwards.
2013-04-16 23:57:41 -04:00
void
toku_blocktable_destroy ( BLOCK_TABLE * btp ) {
BLOCK_TABLE bt = * btp ;
* btp = NULL ;
2013-04-16 23:57:47 -04:00
if ( bt - > current . block_translation ) toku_free ( bt - > current . block_translation ) ;
if ( bt - > inprogress . block_translation ) toku_free ( bt - > inprogress . block_translation ) ;
if ( bt - > checkpointed . block_translation ) toku_free ( bt - > checkpointed . block_translation ) ;
2013-04-16 23:57:41 -04:00
destroy_block_allocator ( & bt - > block_allocator ) ;
2013-04-16 23:57:44 -04:00
blocktable_lock_destroy ( bt ) ;
2013-04-16 23:57:41 -04:00
toku_free ( bt ) ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
static BLOCK_TABLE
blocktable_create_internal ( void ) {
2013-04-16 23:57:44 -04:00
// Effect: Fill it in, including the translation table, which is uninitialized
BLOCK_TABLE XMALLOC ( bt ) ;
2013-04-16 23:57:47 -04:00
memset ( bt , 0 , sizeof ( * bt ) ) ;
2013-04-16 23:57:44 -04:00
blocktable_lock_init ( bt ) ;
2013-04-16 23:57:47 -04:00
//There are two headers, so we reserve space for two.
u_int64_t reserve_per_header = BLOCK_ALLOCATOR_HEADER_RESERVE ;
//Must reserve in multiples of BLOCK_ALLOCATOR_ALIGNMENT
//Round up the per-header usage if necessary.
//We want each header aligned.
u_int64_t remainder = BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT ;
if ( remainder ! = 0 ) {
reserve_per_header + = BLOCK_ALLOCATOR_ALIGNMENT ;
reserve_per_header - = remainder ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
assert ( 2 * reserve_per_header = = BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE ) ;
create_block_allocator ( & bt - > block_allocator ,
BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE ,
BLOCK_ALLOCATOR_ALIGNMENT ) ;
return bt ;
}
static void
translation_default ( struct translation * t ) { // destination into which to create a default translation
t - > type = TRANSLATION_CHECKPOINTED ;
2013-04-16 23:57:47 -04:00
t - > smallest_never_used_blocknum = make_blocknum ( RESERVED_BLOCKNUMS ) ;
t - > length_of_array = t - > smallest_never_used_blocknum . b ;
2013-04-16 23:57:47 -04:00
t - > blocknum_freelist_head = freelist_null ;
XMALLOC_N ( t - > length_of_array , t - > block_translation ) ;
int64_t i ;
for ( i = 0 ; i < t - > length_of_array ; i + + ) {
t - > block_translation [ i ] . size = 0 ;
t - > block_translation [ i ] . u . diskoff = diskoff_unused ;
2013-04-16 23:57:44 -04:00
}
2013-04-16 23:57:47 -04:00
}
2013-04-16 23:57:44 -04:00
2013-04-16 23:57:47 -04:00
// Deserialize a block translation table from a buffer read from disk.
//  t:                  destination into which to deserialize; its array is allocated here.
//  location_on_disk:   disk offset the buffer was read from (must be nonzero).
//  size_on_disk:       number of bytes in translation_buffer, including the
//                      trailing 4-byte checksum.
//  translation_buffer: buffer holding the serialized translation.
// Layout consumed (in order): smallest_never_used_blocknum, blocknum_freelist_head,
// then (diskoff, size) for each array entry, then the x1764 checksum.
// Asserts on a checksum mismatch or if the decoded table disagrees with the
// size and location the caller supplied.
static void
translation_deserialize_from_buffer(struct translation *t,
                                    DISKOFF location_on_disk,
                                    u_int64_t size_on_disk,
                                    unsigned char *translation_buffer) {
    assert(location_on_disk != 0);
    // size_on_disk is unsigned: without this guard a buffer shorter than the
    // checksum would make the subtraction below wrap to a huge length.
    assert(size_on_disk >= sizeof(u_int32_t));
    t->type = TRANSLATION_CHECKPOINTED;

    const u_int64_t payload_size = size_on_disk - sizeof(u_int32_t); // everything except the checksum
    {
        // check the checksum
        u_int32_t x1764 = x1764_memory(translation_buffer, payload_size);
        // Copy the stored checksum out bytewise instead of dereferencing a
        // casted pointer into the byte buffer.
        u_int32_t raw_stored;
        memcpy(&raw_stored, translation_buffer + payload_size, sizeof(raw_stored));
        u_int32_t stored_x1764 = toku_dtoh32(raw_stored);
        assert(x1764 == stored_x1764);
    }

    struct rbuf rt;
    rt.buf   = translation_buffer;
    rt.ndone = 0;
    rt.size  = payload_size;

    t->smallest_never_used_blocknum = rbuf_blocknum(&rt);
    t->length_of_array              = t->smallest_never_used_blocknum.b;
    assert(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
    t->blocknum_freelist_head       = rbuf_blocknum(&rt);

    XMALLOC_N(t->length_of_array, t->block_translation);
    int64_t i;
    for (i = 0; i < t->length_of_array; i++) {
        // On disk each entry is stored as diskoff followed by size.
        t->block_translation[i].u.diskoff = rbuf_diskoff(&rt);
        t->block_translation[i].size      = rbuf_diskoff(&rt);
        PRNTF(" ReadIn ", i, t->block_translation[i].size, t->block_translation[i].u.diskoff, NULL);
    }

    // The table must describe itself consistently with what the header told us.
    assert(calculate_size_on_disk(t) == (int64_t)size_on_disk);
    assert(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == (int64_t)size_on_disk);
    assert(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == location_on_disk);
}
// We just initialized a translation; inform the block allocator so it reserves
// space for each blocknum in use.
// This is where the space for reserved blocknums is reserved too (in addition
// to normal blocks).  See RESERVED_BLOCKNUMS.
//  allocator: block allocator to inform.
//  t:         translation whose in-use entries (size > 0) are registered.
static void
blocktable_note_translation(BLOCK_ALLOCATOR allocator, struct translation *t) {
    // Previously this added blocks one at a time.  Now we make an array and pass
    // it in so it can be sorted and merged.  See #3218.
    // Allocate with XMALLOC_N, as the other allocations in this file do: the
    // array is written immediately below, so a NULL result must never reach
    // the loop.
    struct block_allocator_blockpair *XMALLOC_N(t->smallest_never_used_blocknum.b, pairs);
    u_int64_t n_pairs = 0;
    for (int64_t i = 0; i < t->smallest_never_used_blocknum.b; i++) {
        struct block_translation_pair pair = t->block_translation[i];
        if (pair.size > 0) {
            // An entry with space on disk must have a real offset.
            assert(pair.u.diskoff != diskoff_unused);
            pairs[n_pairs++] = (struct block_allocator_blockpair){ .size   = pair.size,
                                                                   .offset = pair.u.diskoff };
        }
    }
    block_allocator_alloc_blocks_at(allocator, n_pairs, pairs);
    toku_free(pairs);
}
// Create a block table from a serialized translation read from disk.
// The translation read from disk is the last known checkpointed one, so it is
// kept as the checkpointed translation and then copied to current (which is
// never stored on disk) for ongoing use.
// The translation_buffer holds the translation only; the rest of the block
// table is created here.
//  btp:                receives the new block table.
//  location_on_disk:   location of translation_buffer on disk.
//  size_on_disk:       size of translation_buffer in bytes.
//  translation_buffer: the serialized translation.
void
toku_blocktable_create_from_buffer(BLOCK_TABLE *btp,
                                   DISKOFF location_on_disk,
                                   DISKOFF size_on_disk,
                                   unsigned char *translation_buffer) {
    BLOCK_TABLE bt = blocktable_create_internal();
    struct translation *checkpointed = &bt->checkpointed;

    translation_deserialize_from_buffer(checkpointed, location_on_disk, size_on_disk, translation_buffer);
    blocktable_note_translation(bt->block_allocator, checkpointed);
    // checkpointed is now filled in; duplicate it as the working (current) translation.
    copy_translation(&bt->current, checkpointed, TRANSLATION_CURRENT);

    *btp = bt;
}
2013-04-16 23:57:47 -04:00
// Create a brand-new block table.
// The checkpointed translation is a default one (empty except for reserved
// blocknums); it is registered with the block allocator and then copied to
// current.
//  btp: receives the new block table.
void
toku_blocktable_create_new(BLOCK_TABLE *btp) {
    BLOCK_TABLE bt = blocktable_create_internal();
    struct translation *checkpointed = &bt->checkpointed;

    translation_default(checkpointed);
    blocktable_note_translation(bt->block_allocator, checkpointed);
    // checkpointed is now a default table; duplicate it as the working (current) translation.
    copy_translation(&bt->current, checkpointed, TRANSLATION_CURRENT);

    *btp = bt;
}
// Call f on the entries of one of the block table's translations.
//  bt:        block table to inspect.
//  type:      which translation to walk (current, inprogress or checkpointed).
//  f:         callback, invoked as f(blocknum, size, diskoff, extra).
//  extra:     passed through to f untouched.
//  data_only: if true, skip the reserved blocknums.
//  used_only: if true, skip entries whose size is <= 0.
// The translation is copied while holding the blocktable lock; the callbacks
// run on that private copy after the lock has been released.
// Returns EINVAL for an unrecognized type, otherwise 0 or the first nonzero
// value returned by f (which stops the iteration).
int
toku_blocktable_iterate(BLOCK_TABLE bt, enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, BOOL data_only, BOOL used_only) {
    struct translation *src;
    if (type == TRANSLATION_CURRENT) {
        src = &bt->current;
    } else if (type == TRANSLATION_INPROGRESS) {
        src = &bt->inprogress;
    } else if (type == TRANSLATION_CHECKPOINTED) {
        src = &bt->checkpointed;
    } else {
        return EINVAL;
    }

    struct translation snapshot;
    lock_for_blocktable(bt);
    copy_translation(&snapshot, src, TRANSLATION_DEBUG);
    // Overwrite the copy's entry for the translation's own blocknum with the
    // source's entry (presumably copy_translation does not carry it over -- confirm).
    snapshot.block_translation[RESERVED_BLOCKNUM_TRANSLATION] =
        src->block_translation[RESERVED_BLOCKNUM_TRANSLATION];
    unlock_for_blocktable(bt);

    int r = 0;
    // With data_only, begin past the reserved blocknums instead of skipping them one by one.
    const int64_t first = data_only ? RESERVED_BLOCKNUMS : 0;
    for (int64_t blocknum = first;
         r == 0 && blocknum < snapshot.smallest_never_used_blocknum.b;
         blocknum++) {
        struct block_translation_pair pair = snapshot.block_translation[blocknum];
        if (used_only && pair.size <= 0) {
            continue;
        }
        r = f(make_blocknum(blocknum), pair.size, pair.u.diskoff, extra);
    }

    toku_free(snapshot.block_translation);
    return r;
}
2013-04-16 23:57:41 -04:00
2013-04-16 23:57:47 -04:00
// Accumulator handed to frag_helper (as its `extra` argument) while iterating a translation.
typedef struct {
int64_t used_space ; // running sum of the sizes of the blocks visited
int64_t total_space ; // largest (size + address) seen so far, i.e. the highest end offset of any visited block
} frag_extra ;
// Iteration callback that folds one block into a frag_extra accumulator.
//  b:       blocknum (unused).
//  size:    size of the block.
//  address: disk offset of the block.
//  extra:   pointer to the frag_extra being accumulated.
// total_space is raised to the block's end offset if that is larger, and size
// is added to used_space.  Always returns 0 so that iteration continues.
static int
frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extra) {
    frag_extra *totals = extra;
    const int64_t block_end = size + address;
    if (block_end > totals->total_space) {
        totals->total_space = block_end;
    }
    totals->used_space += size;
    return 0;
}
void
2013-04-16 23:57:47 -04:00
toku_blocktable_internal_fragmentation ( BLOCK_TABLE bt , int64_t * total_sizep , int64_t * used_sizep ) {
frag_extra info = { 0 , 0 } ;
int r = toku_blocktable_iterate ( bt , TRANSLATION_CHECKPOINTED , frag_helper , & info , FALSE , TRUE ) ;
assert ( r = = 0 ) ;
if ( total_sizep ) * total_sizep = info . total_space ;
if ( used_sizep ) * used_sizep = info . used_space ;
2013-04-16 23:57:41 -04:00
}
2013-04-16 23:57:47 -04:00
2013-04-16 23:57:47 -04:00
// Reallocate the on-disk space of the reserved descriptor blocknum.
// Holds the blocktable lock while delegating to blocknum_realloc_on_disk_internal
// (called with FALSE as its last argument; see that helper for its meaning).
//  bt:     block table.
//  size:   requested size for the descriptor.
//  offset: passed through to the helper (presumably receives the new disk offset -- confirm).
//  h:      header passed through to the helper.
void
toku_realloc_descriptor_on_disk(BLOCK_TABLE bt, DISKOFF size, DISKOFF *offset, struct brt_header *h) {
    lock_for_blocktable(bt);
    blocknum_realloc_on_disk_internal(bt, make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR), size, offset, h, FALSE);
    unlock_for_blocktable(bt);
}
// Look up the translation of the reserved descriptor blocknum.
// Holds the blocktable lock while delegating to translate_blocknum_to_offset_size_unlocked.
//  bt:     block table.
//  offset: passed through to the helper (presumably receives the descriptor's disk offset -- confirm).
//  size:   passed through to the helper (presumably receives the descriptor's size -- confirm).
void
toku_get_descriptor_offset_size(BLOCK_TABLE bt, DISKOFF *offset, DISKOFF *size) {
    lock_for_blocktable(bt);
    translate_blocknum_to_offset_size_unlocked(bt, make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR), offset, size);
    unlock_for_blocktable(bt);
}
2013-04-16 23:59:01 -04:00
void
toku_block_table_get_fragmentation_unlocked ( BLOCK_TABLE bt , TOKU_DB_FRAGMENTATION report ) {
//Requires: blocktable lock is held.
//Requires: report->file_size_bytes is already filled in.
//Count the headers.
report - > data_bytes = BLOCK_ALLOCATOR_HEADER_RESERVE ;
report - > data_blocks = 1 ;
report - > checkpoint_bytes_additional = BLOCK_ALLOCATOR_HEADER_RESERVE ;
report - > checkpoint_blocks_additional = 1 ;
struct translation * current = & bt - > current ;
int64_t i ;
for ( i = 0 ; i < current - > length_of_array ; i + + ) {
struct block_translation_pair * pair = & current - > block_translation [ i ] ;
if ( pair - > size > 0 ) {
report - > data_bytes + = pair - > size ;
report - > data_blocks + + ;
}
}
struct translation * checkpointed = & bt - > checkpointed ;
for ( i = 0 ; i < checkpointed - > length_of_array ; i + + ) {
struct block_translation_pair * pair = & checkpointed - > block_translation [ i ] ;
if ( pair - > size > 0 & &
! ( i < current - > length_of_array & &
current - > block_translation [ i ] . size > 0 & &
current - > block_translation [ i ] . u . diskoff = = pair - > u . diskoff )
) {
report - > checkpoint_bytes_additional + = pair - > size ;
report - > checkpoint_blocks_additional + + ;
}
}
struct translation * inprogress = & bt - > inprogress ;
for ( i = 0 ; i < inprogress - > length_of_array ; i + + ) {
struct block_translation_pair * pair = & inprogress - > block_translation [ i ] ;
if ( pair - > size > 0 & &
! ( i < current - > length_of_array & &
current - > block_translation [ i ] . size > 0 & &
current - > block_translation [ i ] . u . diskoff = = pair - > u . diskoff ) & &
! ( i < checkpointed - > length_of_array & &
checkpointed - > block_translation [ i ] . size > 0 & &
checkpointed - > block_translation [ i ] . u . diskoff = = pair - > u . diskoff )
) {
report - > checkpoint_bytes_additional + = pair - > size ;
report - > checkpoint_blocks_additional + + ;
}
}
block_allocator_get_unused_statistics ( bt - > block_allocator , report ) ;
}