free diskblocks. Addresses #1195.

git-svn-id: file:///svn/toku/tokudb.1195@7679 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
Bradley C. Kuszmaul 2013-04-16 23:57:25 -04:00 committed by Yoni Fogel
parent e9a8a72193
commit 6b654211c0
6 changed files with 78 additions and 22 deletions

View file

@ -115,8 +115,8 @@ struct remembered_hash {
};
// One entry of the block translation table (h->block_translation[]), indexed by block number.
struct block_translation_pair {
    DISKOFF diskoff; // Disk offset of the block.  When in free list, set to the next free block.  In this case it's really a BLOCKNUM.
    DISKOFF size;    // Size recorded for the block.  Set to 0xFFFFFFFFFFFFFFFF (i.e., (DISKOFF)-1) for free.
};
// The brt_header is not managed by the cachetable. Instead, it hangs off the cachefile as userdata.

View file

@ -294,7 +294,7 @@ void toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct
assert(r==Z_OK);
}
if (0) printf("Size before compressing %u, after compression %lu\n", calculated_size-uncompressed_magic_len, compressed_len);
if (0) printf("Block %" PRId64 " Size before compressing %u, after compression %lu\n", blocknum.b, calculated_size-uncompressed_magic_len, compressed_len);
((int32_t*)(compressed_buf+uncompressed_magic_len))[0] = htonl(compressed_len);
((int32_t*)(compressed_buf+uncompressed_magic_len))[1] = htonl(uncompressed_len);
@ -308,18 +308,7 @@ void toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct
//printf("%s:%d translated_blocknum_limit=%lu blocknum.b=%lu\n", __FILE__, __LINE__, h->translated_blocknum_limit, blocknum.b);
//printf("%s:%d allocator=%p\n", __FILE__, __LINE__, h->block_allocator);
//printf("%s:%d bt=%p\n", __FILE__, __LINE__, h->block_translation);
if (h->translated_blocknum_limit <= (u_int64_t)blocknum.b) {
if (h->block_translation == 0) assert(h->translated_blocknum_limit==0);
u_int64_t new_limit = blocknum.b + 1;
u_int64_t old_limit = h->translated_blocknum_limit;
u_int64_t j;
XREALLOC_N(new_limit, h->block_translation);
for (j=old_limit; j<new_limit; j++) {
h->block_translation[j].diskoff = 0;
h->block_translation[j].size = 0;
}
h->translated_blocknum_limit = new_limit;
}
extend_block_translation(blocknum, h);
if (h->block_translation[blocknum.b].size > 0) {
block_allocator_free_block(h->block_allocator, h->block_translation[blocknum.b].diskoff);
h->block_translation[blocknum.b].diskoff = 0;
@ -344,6 +333,7 @@ void toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct
}
int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h) {
if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);
assert(0 <= blocknum.b && (u_int64_t)blocknum.b < h->translated_blocknum_limit);
DISKOFF offset = h->block_translation[blocknum.b].diskoff;
TAGMALLOC(BRTNODE, result);
@ -371,8 +361,8 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash
compressed_size = ntohl(*(u_int32_t*)(&uncompressed_header[uncompressed_magic_len]));
if (compressed_size<=0 || compressed_size>(1<<30)) { r = DB_BADFORMAT; goto died0; }
uncompressed_size = ntohl(*(u_int32_t*)(&uncompressed_header[uncompressed_magic_len+4]));
if (0) printf("Block %" PRId64 " Compressed size = %u, uncompressed size=%u\n", blocknum.b, compressed_size, uncompressed_size);
if (uncompressed_size<=0 || uncompressed_size>(1<<30)) { r = DB_BADFORMAT; goto died0; }
if (0) printf("Compressed size = %u, uncompressed size=%u\n", compressed_size, uncompressed_size);
}
//printf("%s:%d serializing %" PRIu64 " size=%d\n", __FILE__, __LINE__, blocknum.b, uncompressed_size);

View file

@ -546,13 +546,59 @@ void toku_brtheader_free (struct brt_header *h) {
toku_free(h);
}
void
extend_block_translation (BLOCKNUM blocknum, struct brt_header *h)
// Effect: Make sure the translation table has a slot for blocknum.
//  If the table already covers blocknum, nothing is changed.
//  Otherwise the table is reallocated to hold blocknum.b+1 entries, every newly added entry gets
//  diskoff==0 and size==0, and translated_blocknum_limit is raised to the new entry count.
{
    if ((u_int64_t)blocknum.b < h->translated_blocknum_limit) {
        return; // the slot already exists
    }
    // A table that has never been allocated must report a limit of zero.
    assert(h->block_translation != 0 || h->translated_blocknum_limit == 0);

    u_int64_t first_new   = h->translated_blocknum_limit;
    u_int64_t grown_limit = blocknum.b + 1;
    XREALLOC_N(grown_limit, h->block_translation);

    // Zero out the slots that did not exist before the reallocation.
    u_int64_t idx = first_new;
    while (idx < grown_limit) {
        h->block_translation[idx].diskoff = 0;
        h->block_translation[idx].size    = 0;
        idx++;
    }
    h->translated_blocknum_limit = grown_limit;
}
// Sentinel values used by the free list of block numbers that is threaded through the block translation table.
const DISKOFF diskoff_is_null = (DISKOFF)-1; // in a freelist, this indicates end of list
const DISKOFF size_is_free = (DISKOFF)-1; // stored in a translation entry's size field to mark that entry as being on the free list
static int
allocate_diskblocknumber (BLOCKNUM *res, BRT brt, TOKULOGGER logger __attribute__((__unused__)))
// Effect: Allocate a block number and store it in *res.
//  If the header's free list is empty (free_blocks.b == diskoff_is_null), hand out the next never-used
//  block number and advance unused_blocks.  Otherwise pop the head of the free list and reset that
//  entry's translation size to 0 so that it is no longer marked free.
//  The header is marked dirty in either case.
// Returns: 0 (always).
{
    BLOCKNUM result;
    if (brt->h->free_blocks.b == diskoff_is_null) {
        // no blocks in the free list
        result = brt->h->unused_blocks;
        brt->h->unused_blocks.b++;
    } else {
        result = brt->h->free_blocks;
        // The head of the free list must be marked free.  This has to be a comparison: an assignment
        // here would overwrite the size field and the assertion could never fail.
        assert(brt->h->block_translation[result.b].size == size_is_free);
        brt->h->block_translation[result.b].size = 0;
        brt->h->free_blocks.b = brt->h->block_translation[result.b].diskoff; // pop the freelist
    }
    assert(result.b>0);
    *res = result;
    brt->h->dirty = 1;
    return 0;
}
static int
free_diskblocknumber (BLOCKNUM *b, struct brt_header *h, TOKULOGGER logger __attribute__((__unused__)))
// Effect: Free a diskblock
//  The block number *b is pushed onto the front of the header's free list: its translation entry is marked
//  with size_is_free and its diskoff field is reused to hold the previous head of the list.
//  On return b->b has been set to 0 and the header is marked dirty.
// Watch out for the case where the disk block was never yet written to disk and is beyond the translated_blocknum_limit.
//  (That is why the translation table is extended first.)
// Returns: 0 (always).
{
    extend_block_translation(*b, h);
    // The entry is indexed below, so the block number must be strictly less than the limit.
    assert((u_int64_t)b->b < h->translated_blocknum_limit);
    // Freeing a block that is already on the free list would corrupt the list.
    assert(h->block_translation[b->b].size != size_is_free);
    // NOTE(review): the old diskoff is overwritten here without releasing any disk space that may have been
    //  allocated for this block (size > 0) -- confirm whether the block allocator should be told about it.
    h->block_translation[b->b].size = size_is_free;
    h->block_translation[b->b].diskoff = h->free_blocks.b; // link to the previous head of the free list
    h->free_blocks.b = b->b;
    b->b = 0;
    h->dirty = 1;
    return 0;
}
@ -2080,14 +2126,18 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_io, TOKUL
// Unpin both, and return the first nonzero error code that is found
assert(node->dirty);
{
int rrb1 = 0;
int rra = toku_unpin_brtnode(t, childa);
int rrb;
if (did_merge) {
rrb = toku_cachetable_unpin_and_remove(t->cf, childb->thisnodename);
BLOCKNUM bn = childb->thisnodename;
rrb = toku_cachetable_unpin_and_remove(t->cf, bn);
rrb1 = free_diskblocknumber(&bn, t->h, logger);
} else {
rrb = toku_unpin_brtnode(t, childb);
}
if (rrb1) return rrb1;
if (rra) return rra;
if (rrb) return rrb;
}
@ -4125,6 +4175,12 @@ int toku_dump_brt (FILE *f, BRT brt) {
CACHEKEY *rootp;
assert(brt->h);
u_int32_t fullhash;
u_int64_t i;
fprintf(f, "Block translation:");
for (i=0; i<brt->h->translated_blocknum_limit; i++) {
fprintf(f, " %"PRIu64": %"PRId64" %"PRId64"", i, brt->h->block_translation[i].diskoff, brt->h->block_translation[i].size);
}
fprintf(f, "\n");
rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
return toku_dump_brtnode(f, brt, *rootp, 0, 0, 0, 0, 0);
}

View file

@ -113,4 +113,6 @@ enum brt_header_flags {
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater);
void extend_block_translation (BLOCKNUM blocknum, struct brt_header *h);
#endif

View file

@ -291,8 +291,15 @@ main (int argc, const char *argv[]) {
}
} else {
BLOCKNUM blocknum;
printf("Block translation:");
for (blocknum.b=0; blocknum.b<h->unused_blocks.b; blocknum.b++) {
printf(" %" PRIu64 ":", blocknum.b);
if (h->block_translation[blocknum.b].size == -1) printf("free");
else printf("%" PRIu64 ":%" PRIu64, h->block_translation[blocknum.b].diskoff, h->block_translation[blocknum.b].size);
}
for (blocknum.b=1; blocknum.b<h->unused_blocks.b; blocknum.b++) {
dump_node(f, blocknum, h);
if (h->block_translation[blocknum.b].size != -1)
dump_node(f, blocknum, h);
}
}
toku_brtheader_free(h);

View file

@ -1089,6 +1089,7 @@ static void test_new_brt_cursor_last(int n, int dup_mode) {
memcpy(&vv, val.data, val.size);
assert(vv == (int) htonl(i));
//if (n==512 && i<=360) { printf("i=%d\n", i); toku_dump_brt(stdout, t); }
r = toku_brt_cursor_delete(cursor, 0, null_txn); assert(r == 0);
}
assert(i == -1);