closes[t:2449] [t:2484] Merge #2449 changes to main.

Rollback logs are now checkpointed.  There are no rolltmp files.


git-svn-id: file:///svn/toku/tokudb@19167 c7de825b-a66e-492c-adef-691d508d4ae1
This commit is contained in:
Yoni Fogel 2013-04-16 23:59:05 -04:00
parent bf8e181e9e
commit 1bf7a7a403
64 changed files with 2018 additions and 1700 deletions

View file

@ -379,7 +379,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */

View file

@ -395,7 +395,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */

View file

@ -403,7 +403,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */

View file

@ -403,7 +403,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */

View file

@ -407,7 +407,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */

View file

@ -585,7 +585,7 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf("} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;\n");
printf("typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);\n");
printf("struct txn_stat {\n u_int64_t rolltmp_raw_count;\n};\n");
printf("struct txn_stat {\n u_int64_t rollback_raw_count;\n};\n");
const char *extra[] = {
"int (*txn_stat)(DB_TXN *, struct txn_stat **)",
"struct { void *next, *prev; } open_txns",

View file

@ -354,7 +354,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/;

View file

@ -64,7 +64,7 @@ build: build.tdb build.bdb
build.bdb: $(TARGET_BDB) $(SCANSCAN_BDB) $(WINDOWS_BDB_LIB_NAME)
build.tdb: $(TARGET_TDB) $(SCANSCAN_TDB)
check: check-default check-rowsize-dup check-rowsize check-xfast check-x check-no-rolltmp check-4G
check: check-default check-rowsize-dup check-rowsize check-xfast check-x check-no-rollback check-4G child.benchmark.dir
SUPPORT_KEYSIZE=$$((3*1024)) # at least 3KiB
SUPPORT_ROWSIZE=$$((80*1024)) # at least 80KiB
@ -96,8 +96,8 @@ check-xfast: $(TARGET_TDB)
./$(TARGET_TDB) $(VERBVERBOSE) --noserial -x --valsize 1000 --cachesize 8000000 --xcount 1000 --periter 20000 --env xfast.dir 1 $(SUMMARIZE_CMD)
# A relatively fast test that detects #853 (don't log changes to a dictionary created in the same txn)
check-no-rolltmp: $(TARGET_TDB)
./$(TARGET_TDB) $(VERBVERBOSE) --env no-rolltmp.dir --singlex --nolog --check_small_rolltmp $(SUMMARIZE_CMD)
check-no-rollback: $(TARGET_TDB)
./$(TARGET_TDB) $(VERBVERBOSE) --env no-rollback.dir --singlex --nolog --check_small_rollback $(SUMMARIZE_CMD)
# Check to make sure that if we make a file that's bigger than 4GB that we can read the file back out and get all the rows.
ifeq ($(TOKU_SKIP_4G),1)

View file

@ -53,7 +53,7 @@ int singlex_child = 0; // Do a single transaction, but do all work with a child
int singlex = 0; // Do a single transaction
int singlex_create = 0; // Create the db using the single transaction (only valid if singlex)
int insert1first = 0; // insert 1 before doing the rest
int check_small_rolltmp = 0; // verify that the rollback logs are small (only valid if singlex)
int check_small_rollback = 0; // verify that the rollback logs are small (only valid if singlex)
int do_transactions = 0;
int if_transactions_do_logging = DB_INIT_LOG; // set this to zero if we want no logging when transactions are used
int do_abort = 0;
@ -294,14 +294,14 @@ static void benchmark_shutdown (void) {
#endif
if (do_transactions && singlex && !insert1first && (singlex_create || prelock)) {
#if defined(TOKUDB)
//There should be a single 'truncate' in the rolltmp instead of many 'insert' entries.
//There should be a single 'truncate' in the rollback instead of many 'insert' entries.
struct txn_stat *s;
r = tid->txn_stat(tid, &s);
assert(r==0);
//TODO: #1125 Always do the test after performance testing is done.
if (singlex_child) fprintf(stderr, "SKIPPED 'small rolltmp' test for child txn\n");
if (singlex_child) fprintf(stderr, "SKIPPED 'small rollback' test for child txn\n");
else
assert(s->rolltmp_raw_count < 100); // gross test, not worth investigating details
assert(s->rollback_raw_count < 100); // gross test, not worth investigating details
os_free(s);
//system("ls -l bench.tokudb");
#endif
@ -487,7 +487,7 @@ static int print_usage (const char *argv0) {
fprintf(stderr, " --singlex-child (implies -x) Run the whole job as a single transaction, do all work a child of that transaction.\n");
fprintf(stderr, " --finish-child-first Commit/abort child before doing so to parent (no effect if no child).\n");
fprintf(stderr, " --singlex-create (implies --singlex) Create the file using the single transaction (Default is to use a different transaction to create.)\n");
fprintf(stderr, " --check_small_rolltmp (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n");
fprintf(stderr, " --check_small_rollback (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n");
fprintf(stderr, " --prelock Prelock the database.\n");
fprintf(stderr, " --prelockflag Prelock the database and send the DB_PRELOCKED_WRITE flag.\n");
fprintf(stderr, " --abort Abort the singlex after the transaction is over. (Requires --singlex.)\n");
@ -589,8 +589,8 @@ int main (int argc, const char *const argv[]) {
singlex = 1;
} else if (strcmp(arg, "--insert1first") == 0) {
insert1first = 1;
} else if (strcmp(arg, "--check_small_rolltmp") == 0) {
check_small_rolltmp = 1;
} else if (strcmp(arg, "--check_small_rollback") == 0) {
check_small_rollback = 1;
} else if (strcmp(arg, "--xcount") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
items_per_transaction = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0);
@ -685,8 +685,8 @@ int main (int argc, const char *const argv[]) {
fprintf(stderr, "--insert_multiple only works on the TokuDB (not BDB)\n");
return print_usage(argv[0]);
}
if (check_small_rolltmp) {
fprintf(stderr, "--check_small_rolltmp only works on the TokuDB (not BDB)\n");
if (check_small_rollback) {
fprintf(stderr, "--check_small_rollback only works on the TokuDB (not BDB)\n");
return print_usage(argv[0]);
}
#endif
@ -697,8 +697,8 @@ int main (int argc, const char *const argv[]) {
put_flagss[i] = put_flags;
}
}
if (check_small_rolltmp && !singlex) {
fprintf(stderr, "--check_small_rolltmp requires --singlex\n");
if (check_small_rollback && !singlex) {
fprintf(stderr, "--check_small_rollback requires --singlex\n");
return print_usage(argv[0]);
}
if (!do_transactions && insert_multiple) {

View file

@ -354,7 +354,7 @@ typedef struct __toku_txn_progress {
} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;
typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);
struct txn_stat {
u_int64_t rolltmp_raw_count;
u_int64_t rollback_raw_count;
};
struct __toku_db_txn {
DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/;

View file

@ -41,7 +41,6 @@ local: bins libs $(TEST_NEWBRT);
BRT_SOURCES = \
block_allocator \
block_table \
bread \
brt-serialize \
brt-verify \
brt \

View file

@ -596,6 +596,19 @@ toku_block_verify_no_free_blocknums(BLOCK_TABLE bt) {
assert(bt->current.blocknum_freelist_head.b == freelist_null.b);
}
// Verify there are no data blocks except root:
//   - root lies at or beyond the reserved blocknums,
//   - the smallest never-used blocknum is exactly root+1,
//   - every blocknum from the end of the reserved range up to (not including) root is marked free.
// Relies on checkpoint having used optimize_translation.
void
toku_block_verify_no_data_blocks_except_root_unlocked(BLOCK_TABLE bt, BLOCKNUM root) {
    const int64_t root_num = root.b;
    assert(root_num >= RESERVED_BLOCKNUMS);
    assert(bt->current.smallest_never_used_blocknum.b == root_num + 1);

    int64_t num = RESERVED_BLOCKNUMS;
    while (num < root_num) {
        BLOCKNUM candidate = make_blocknum(num);
        assert(bt->current.block_translation[candidate.b].size == size_is_free);
        num++;
    }
}
//Verify a blocknum is currently allocated.
void
toku_verify_blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b) {

View file

@ -35,6 +35,7 @@ void toku_allocate_blocknum(BLOCK_TABLE bt, BLOCKNUM *res, struct brt_header * h
void toku_allocate_blocknum_unlocked(BLOCK_TABLE bt, BLOCKNUM *res, struct brt_header * h);
void toku_free_blocknum(BLOCK_TABLE bt, BLOCKNUM *b, struct brt_header * h);
void toku_verify_blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b);
void toku_block_verify_no_data_blocks_except_root_unlocked(BLOCK_TABLE bt, BLOCKNUM root);
void toku_block_verify_no_free_blocknums(BLOCK_TABLE bt);
void toku_realloc_descriptor_on_disk(BLOCK_TABLE bt, DISKOFF size, DISKOFF *offset, struct brt_header * h);
void toku_get_descriptor_offset_size(BLOCK_TABLE bt, DISKOFF *offset, DISKOFF *size);

View file

@ -1,80 +0,0 @@
/* Buffered read. */
#ident "$Id$"
#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "includes.h"
// State of a backwards buffered reader.  Compressed blocks are pulled from fd one at a time,
// uncompressed into buf, and handed out from the tail of buf towards its head.
struct bread {
int64_t fileoff; // The byte before this offset is the next byte we will read (since we are reading backward)
int fd; // The file being read.  Closing the BREAD does not close it.
int bufoff; // The current offset in the buf. The next byte we will read is buf[bufoff-1] (assuming that bufoff>0).
char *buf; // A buffer with at least bufoff bytes in it.  It is 0 until the first block has been loaded.
};
BREAD create_bread_from_fd_initialize_at(int fd) {
BREAD XMALLOC(result);
int r = toku_os_get_file_size(fd, &result->fileoff);
assert(r==0);
result->fd=fd;
result->bufoff=0;
result->buf = 0;
return result;
}
// Effect: Release the BREAD's buffer and then the BREAD itself.  The fd is left open.
// Returns: 0, always.
int close_bread_without_closing_fd(BREAD br) {
    char *pending = br->buf;
    toku_free(pending);
    toku_free(br);
    return 0;
}
// Effect: Copy the nbytes that precede the current position into vbuf, moving the position backwards.
//  The bytes are stored in vbuf in file order: vbuf is filled from its tail towards its head.
//  Whenever the in-memory buffer is drained, the block that ends at br->fileoff is read from the
//  file, uncompressed into br->buf, and br->fileoff is moved to the start of that block.
// Returns: the number of bytes copied.  Every I/O or format problem is an assertion failure,
//  so a return always means nbytes were copied.
// Requires: at least nbytes of uncompressed data remain before the current position.
ssize_t bread_backwards(BREAD br, void *vbuf, size_t nbytes) {
    char *buf=vbuf;
    ssize_t result=0;
    const int i4 = sizeof(u_int32_t);
    while (nbytes > 0) {
        // read whatever we can out of the buffer.
        if (br->bufoff>0) {
            size_t to_copy = ((size_t)br->bufoff >= nbytes) ? nbytes : (size_t)br->bufoff;
            // The destination is filled from its end so that the caller sees the bytes in file order.
            memcpy(buf+nbytes-to_copy, &br->buf[br->bufoff-to_copy], to_copy);
            nbytes -= to_copy;
            result += to_copy;
            br->bufoff -= to_copy;
        }
        if (nbytes>0) {
            // The buffer is empty: load the block that ends at fileoff.
            assert(br->bufoff==0);
            u_int32_t compressed_length_n, uncompressed_length_n;
            assert(br->fileoff>=i4); // there better be the three lengths plus the compressed data.
            { ssize_t r = pread(br->fd, &compressed_length_n, i4, br->fileoff- i4); assert(r==i4); }
            u_int32_t compressed_length = toku_dtoh32(compressed_length_n);
            // compressed_length comes straight from disk.  Size the block in 64 bits so that a huge
            // (corrupt) value cannot wrap around in 32-bit arithmetic and slip past the bounds check.
            const int64_t block_length = (int64_t)compressed_length + 3*i4;
            assert(br->fileoff >= block_length);
            { ssize_t r = pread(br->fd, &uncompressed_length_n, i4, br->fileoff-2*i4); assert(r==i4); }
            u_int32_t uncompressed_length = toku_dtoh32(uncompressed_length_n);
            char *XMALLOC_N(compressed_length, zbuf);
            {
                ssize_t r = pread(br->fd, zbuf, compressed_length, br->fileoff- (int64_t)compressed_length -2*i4);
                assert(r==(ssize_t)compressed_length);
            }
            {
                // The compressed length is stored at both ends of the block; the two copies must agree.
                u_int32_t compressed_length_n_again;
                ssize_t r = pread(br->fd, &compressed_length_n_again, i4, br->fileoff-block_length); assert(r==i4);
                assert(compressed_length_n_again == compressed_length_n);
            }
            uLongf destlen = uncompressed_length;
            XREALLOC_N(uncompressed_length, br->buf);
            // Check zlib's verdict explicitly: on failure destlen is not a reliable indicator.
            int zr = uncompress((Bytef*)br->buf, &destlen, (Bytef*)zbuf, compressed_length);
            assert(zr==Z_OK);
            assert(destlen==uncompressed_length);
            toku_free(zbuf);
            br->bufoff = uncompressed_length;
            br->fileoff -= block_length;
        }
    }
    return result;
}
// Returns: 1 if anything is left to read -- either unread file contents before fileoff
//  or unconsumed bytes in the buffer -- and 0 otherwise.
int bread_has_more(BREAD br) {
    if (br->fileoff > 0) return 1;
    return br->bufoff > 0;
}

View file

@ -1,30 +0,0 @@
#ifndef BREAD_H
#define BREAD_H
#ident "$Id$"
#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// A BREAD reads a file backwards using buffered I/O.  BREAD stands for Buffered Read or Backwards Read.
// Conceivably, we could read forward too.
// The buffered I/O is buffered using a large buffer (e.g., something like a megabyte).
// Furthermore, data is compressed into blocks.  Each block is laid out as:
//    a 4-byte compressed length (in network order),
//    the compressed data,
//    a 4-byte uncompressed length (in network order),
//    a 4-byte compressed length (a repeat of the first field).
// NOTE(review): bread.c decodes the length fields with toku_dtoh32 -- confirm that this matches "network order".
// The compressed length appears twice so that the file can be read backward or forward.
// If not for the large-buffer requirement, as well as compression, as well as reading backward, we could have used a FILE.
#include <sys/types.h>
typedef struct bread *BREAD;
BREAD create_bread_from_fd_initialize_at(int fd);
// Effect:  Given a file descriptor, fd, create a BREAD positioned at the end of the file.
// Requires: The fd must be an open fd.
int close_bread_without_closing_fd(BREAD);
// Effect:  Close the BREAD (freeing its buffer), but don't close the underlying fd.  Returns 0.
ssize_t bread_backwards(BREAD, void *buf, size_t nbytes);
// Effect:  Read nbytes into buf, reading backwards.  The bytes are placed in buf in file order.
//  Returns the number of bytes read.  I/O and format errors are assertion failures.
int bread_has_more(BREAD);
// Is there more to read?  Returns nonzero if so.
#endif

View file

@ -217,6 +217,10 @@ int toku_serialize_brtnode_to_memory (BRTNODE node, int n_workitems, int n_threa
/*out*/ size_t *n_bytes_to_write,
/*out*/ char **bytes_to_write);
int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint);
int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log,
struct brt_header *h, int n_workitems, int n_threads,
BOOL for_checkpoint);
int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, TOKUTXN txn, struct brt_header *h);
int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brt_header *h);
unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);

View file

@ -168,7 +168,7 @@ toku_full_pwrite_extend (int fd, const void *buf, size_t count, toku_off_t offse
// Overhead calculated in same order fields are written to wbuf
enum {
node_header_overhead = (8+ // magic "tokunode" or "tokuleaf"
node_header_overhead = (8+ // magic "tokunode" or "tokuleaf" or "tokuroll"
4+ // layout_version
4), // layout_version_original
@ -430,48 +430,27 @@ serialize_node(BRTNODE node, char *buf, size_t calculated_size, int n_sub_blocks
assert(calculated_size==wb.ndone);
}
int
toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_threads), /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write) {
// get the size of the serialized node
unsigned int calculated_size = toku_serialize_brtnode_size(node);
// choose sub block parameters
int n_sub_blocks = 0, sub_block_size = 0;
size_t data_size = calculated_size - node_header_overhead;
choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &n_sub_blocks);
assert(0 < n_sub_blocks && n_sub_blocks <= max_sub_blocks);
assert(sub_block_size > 0);
// set the initial sub block size for all of the sub blocks
struct sub_block sub_block[n_sub_blocks];
for (int i = 0; i < n_sub_blocks; i++)
sub_block_init(&sub_block[i]);
set_all_sub_block_sizes(data_size, sub_block_size, n_sub_blocks, sub_block);
// alloocate space for the serialized node
char *MALLOC_N(calculated_size, buf);
//toku_verify_counts(node);
//assert(size>0);
//printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]);
// serialize the node into buf
serialize_node(node, buf, calculated_size, n_sub_blocks, sub_block);
// allocate space for the compressed buf
static void
serialize_uncompressed_block_to_memory(char * uncompressed_buf,
int n_sub_blocks,
struct sub_block sub_block[n_sub_blocks],
/*out*/ size_t *n_bytes_to_write,
/*out*/ char **bytes_to_write) {
// allocate space for the compressed uncompressed_buf
size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block);
size_t sub_block_header_len = sub_block_header_size(n_sub_blocks);
size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum
char *MALLOC_N(header_len + compressed_len, compressed_buf);
// copy the header
memcpy(compressed_buf, buf, node_header_overhead);
memcpy(compressed_buf, uncompressed_buf, node_header_overhead);
if (0) printf("First 4 bytes before compressing data are %02x%02x%02x%02x\n",
buf[node_header_overhead], buf[node_header_overhead+1],
buf[node_header_overhead+2], buf[node_header_overhead+3]);
uncompressed_buf[node_header_overhead], uncompressed_buf[node_header_overhead+1],
uncompressed_buf[node_header_overhead+2], uncompressed_buf[node_header_overhead+3]);
// compress all of the sub blocks
char *uncompressed_ptr = buf + node_header_overhead;
char *uncompressed_ptr = uncompressed_buf + node_header_overhead;
char *compressed_ptr = compressed_buf + header_len;
compressed_len = compress_all_sub_blocks(n_sub_blocks, sub_block, uncompressed_ptr, compressed_ptr, num_cores);
@ -494,9 +473,40 @@ toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_th
*n_bytes_to_write = header_len + compressed_len;
*bytes_to_write = compressed_buf;
}
// Effect: Serialize node into a newly allocated, compressed buffer that is ready to be written to disk.
//  n_workitems, n_threads: unused.
//  *n_bytes_to_write: set to the length of the compressed buffer.
//  *bytes_to_write:   set to the compressed buffer; the caller frees it with toku_free.
// Returns: 0.
int
toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_threads), /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write) {
    // get the size of the serialized node
    size_t calculated_size = toku_serialize_brtnode_size(node);

    // choose sub block parameters
    int n_sub_blocks = 0, sub_block_size = 0;
    size_t data_size = calculated_size - node_header_overhead;
    choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &n_sub_blocks);
    assert(0 < n_sub_blocks && n_sub_blocks <= max_sub_blocks);
    assert(sub_block_size > 0);

    // set the initial sub block size for all of the sub blocks
    struct sub_block sub_block[n_sub_blocks];
    for (int i = 0; i < n_sub_blocks; i++)
        sub_block_init(&sub_block[i]);
    set_all_sub_block_sizes(data_size, sub_block_size, n_sub_blocks, sub_block);

    // allocate space for the serialized node.
    // XMALLOC_N (not MALLOC_N): buf is written through immediately with no NULL check, so an
    // allocation failure must not be allowed to return NULL here.  The rollback-log serializer
    // in this file allocates its buffer the same way.
    char *XMALLOC_N(calculated_size, buf);

    // serialize the node into buf
    serialize_node(node, buf, calculated_size, n_sub_blocks, sub_block);

    //Compress and malloc buffer to write
    serialize_uncompressed_block_to_memory(buf, n_sub_blocks, sub_block,
                                           n_bytes_to_write, bytes_to_write);
    toku_free(buf);
    return 0;
}
@ -522,9 +532,8 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h
//printf("%s:%d bt=%p\n", __FILE__, __LINE__, h->block_translation);
DISKOFF offset;
//h will be dirtied
toku_blocknum_realloc_on_disk(h->blocktable, blocknum, n_to_write, &offset,
h, for_checkpoint);
h, for_checkpoint); //dirties h
lock_for_pwrite();
toku_full_pwrite_extend(fd, compressed_buf, n_to_write, offset);
unlock_for_pwrite();
@ -852,7 +861,7 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b
}
static int
decompress_brtnode_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) {
decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) {
toku_trace("decompress");
int r;
@ -914,14 +923,14 @@ decompress_brtnode_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb
}
static int
decompress_brtnode_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) {
decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) {
int r;
switch (version) {
case BRT_LAYOUT_VERSION_10:
r = decompress_brtnode_from_raw_block_into_rbuf_10(raw_block, rb, blocknum);
break;
case BRT_LAYOUT_VERSION:
r = decompress_brtnode_from_raw_block_into_rbuf(raw_block, rb, blocknum);
r = decompress_from_raw_block_into_rbuf(raw_block, rb, blocknum);
break;
default:
assert(FALSE);
@ -959,19 +968,16 @@ deserialize_brtnode_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u
return r;
}
// Read brt node from file into struct. Perform version upgrade if necessary.
int
toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h) {
toku_trace("deserial start");
static int
read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
struct brt_header *h,
struct rbuf *rb,
/* out */ int *layout_version_p) {
int r;
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);
if (h->panic) return h->panic;
toku_trace("deserial start");
toku_trace("deserial start nopanic");
// get the file offset and block size for the block
DISKOFF offset, size;
@ -986,7 +992,9 @@ toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BR
int layout_version;
{
u_int8_t *magic = raw_block + uncompressed_magic_offset;
if (memcmp(magic, "tokuleaf", 8)!=0 && memcmp(magic, "tokunode", 8)!=0) {
if (memcmp(magic, "tokuleaf", 8)!=0 &&
memcmp(magic, "tokunode", 8)!=0 &&
memcmp(magic, "tokuroll", 8)!=0) {
r = toku_db_badformat();
goto cleanup;
}
@ -1006,16 +1014,47 @@ toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BR
u_int32_t stored_xsum = toku_dtoh32(*(u_int32_t *)(raw_block + header_length));
assert(xsum == stored_xsum);
r = decompress_brtnode_from_raw_block_into_rbuf_versioned(layout_version, raw_block, &rb, blocknum);
r = decompress_from_raw_block_into_rbuf_versioned(layout_version, raw_block, rb, blocknum);
if (r!=0) goto cleanup;
*layout_version_p = layout_version;
cleanup:
if (r!=0) {
if (rb->buf) toku_free(rb->buf);
rb->buf = NULL;
}
if (raw_block) toku_free(raw_block);
return r;
}
// Read brt node from file into struct.  Perform version upgrade if necessary.
// Returns: 0 on success.  Otherwise the error from reading/decompressing the block,
//  toku_db_badformat() if the block's magic is neither "tokuleaf" nor "tokunode",
//  or the error from deserializing the node.
int
toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash,
                               BRTNODE *brtnode, struct brt_header *h) {
    toku_trace("deserial start");

    int r;
    struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};

    int layout_version;
    r = read_and_decompress_block_from_fd_into_rbuf(fd, blocknum, h, &rb, &layout_version);
    if (r!=0) goto cleanup;

    {
        // The shared block reader also accepts "tokuroll" blocks; those are not brt nodes.
        u_int8_t *magic = rb.buf + uncompressed_magic_offset;
        if (memcmp(magic, "tokuleaf", 8)!=0 &&
            memcmp(magic, "tokunode", 8)!=0) {
            r = toku_db_badformat();
            goto cleanup;
        }
    }

    r = deserialize_brtnode_from_rbuf_versioned(layout_version, blocknum, fullhash, brtnode, h, &rb);

    toku_trace("deserial done");

cleanup:
    // Only rb.buf can be owned here.  The raw (compressed) block is allocated and freed inside
    // read_and_decompress_block_from_fd_into_rbuf, so there is no raw_block to release in this function.
    if (rb.buf) toku_free(rb.buf);
    return r;
}
@ -1603,5 +1642,245 @@ toku_db_badformat(void) {
return DB_BADFORMAT;
}
// Returns: the number of bytes a rollback log node occupies when serialized (before compression):
//  the common node header, six 8-byte fields, and the resident rollback entries.
static size_t
serialize_rollback_log_size(ROLLBACK_LOG_NODE log) {
    size_t total = node_header_overhead; //8 "tokuroll", 4 version, 4 version_original
    total += 8; //TXNID
    total += 8; //sequence
    total += 8; //thislogname
    total += 8; //older (blocknum)
    total += 8; //resident_bytecount
    total += 8; //memarena_size_needed_to_load
    total += log->rollentry_resident_bytecount;
    return total;
}
// Effect: Write the uncompressed image of log into buf, which must be exactly calculated_size bytes.
//  Layout: "tokuroll" magic, layout_version, layout_version_original, txnid, sequence,
//  thislogname, older, rollentry_resident_bytecount, the memarena size in use, and then the
//  rollback entries starting from newest_logentry and following prev links.
//  n_sub_blocks, sub_block: unused.
static void
serialize_rollback_log_node_to_buf(ROLLBACK_LOG_NODE log, char *buf, size_t calculated_size, int UU(n_sub_blocks), struct sub_block UU(sub_block[])) {
    struct wbuf wb;
    wbuf_init(&wb, buf, calculated_size);

    // Node header: magic and layout versions.
    wbuf_nocrc_literal_bytes(&wb, "tokuroll", 8);
    assert(log->layout_version == BRT_LAYOUT_VERSION);
    wbuf_nocrc_int(&wb, log->layout_version);
    wbuf_nocrc_int(&wb, log->layout_version_original);

    // Fixed-size fields of the rollback log node.
    wbuf_nocrc_TXNID(&wb, log->txnid);
    wbuf_nocrc_ulonglong(&wb, log->sequence);
    wbuf_nocrc_BLOCKNUM(&wb, log->thislogname);
    wbuf_nocrc_BLOCKNUM(&wb, log->older);
    wbuf_nocrc_ulonglong(&wb, log->rollentry_resident_bytecount);
    // Write down memarena size needed to restore
    wbuf_nocrc_ulonglong(&wb, memarena_total_size_in_use(log->rollentry_arena));

    // Rollback entries; together they must occupy exactly rollentry_resident_bytecount bytes.
    const size_t entries_start = wb.ndone;
    struct roll_entry *entry = log->newest_logentry;
    while (entry) {
        toku_logger_rollback_wbuf_nocrc_write(&wb, entry);
        entry = entry->prev;
    }
    assert(entries_start + log->rollentry_resident_bytecount == wb.ndone);

    // The buffer must have been filled exactly.
    assert(wb.ndone == wb.size);
    assert(calculated_size==wb.ndone);
}
// Effect: Serialize log into a newly allocated, compressed buffer that is ready to be written to disk.
//  n_workitems, n_threads: unused.
//  *n_bytes_to_write: set to the length of the compressed buffer.
//  *bytes_to_write:   set to the compressed buffer.
// Returns: 0.
static int
toku_serialize_rollback_log_to_memory (ROLLBACK_LOG_NODE log,
                                       int UU(n_workitems), int UU(n_threads),
                                       /*out*/ size_t *n_bytes_to_write,
                                       /*out*/ char **bytes_to_write) {
    // Size of the uncompressed image, and of the part after the node header (the part that gets compressed).
    const size_t total_size   = serialize_rollback_log_size(log);
    const size_t payload_size = total_size - node_header_overhead;

    // Decide how the payload is split into sub blocks.
    int n_sub_blocks   = 0;
    int sub_block_size = 0;
    choose_sub_block_size(payload_size, max_sub_blocks, &sub_block_size, &n_sub_blocks);
    assert(0 < n_sub_blocks && n_sub_blocks <= max_sub_blocks);
    assert(sub_block_size > 0);

    // Initialize every sub block and give each its initial size.
    struct sub_block blocks[n_sub_blocks];
    int which = 0;
    while (which < n_sub_blocks) {
        sub_block_init(&blocks[which]);
        which++;
    }
    set_all_sub_block_sizes(payload_size, sub_block_size, n_sub_blocks, blocks);

    // Build the uncompressed image, compress it into the output buffer, and drop the image.
    char *XMALLOC_N(total_size, image);
    serialize_rollback_log_node_to_buf(log, image, total_size, n_sub_blocks, blocks);
    serialize_uncompressed_block_to_memory(image, n_sub_blocks, blocks,
                                           n_bytes_to_write, bytes_to_write);
    toku_free(image);
    return 0;
}
// Effect: Serialize log, (re)allocate disk space for blocknum in h's block table, write the
//  serialized image to fd at the resulting offset, and mark log clean.
// Returns: 0 on success, or the error from serializing to memory (in which case nothing is written).
int
toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log,
                                struct brt_header *h, int n_workitems, int n_threads,
                                BOOL for_checkpoint) {
    size_t image_len;
    char  *image;
    int r = toku_serialize_rollback_log_to_memory(log, n_workitems, n_threads, &image_len, &image);
    if (r!=0) return r;

    assert(blocknum.b>=0);
    DISKOFF where;
    toku_blocknum_realloc_on_disk(h->blocktable, blocknum, image_len, &where,
                                  h, for_checkpoint); //dirties h
    lock_for_pwrite();
    toku_full_pwrite_extend(fd, image, image_len, where);
    unlock_for_pwrite();

    toku_free(image);
    log->dirty = 0;  // See #1957.   Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
    return 0;
}
// Effect: Build a ROLLBACK_LOG_NODE from the uncompressed block held in rb.
//  blocknum, fullhash: where the caller expects the node to live.  The node's stored thislogname
//      must equal blocknum, and its cachetable hash must equal fullhash.
//  txn: when the node belongs to txn, its sequence number must not exceed txn->num_rollback_nodes.
//  h: supplies the cachefile used to hash blocknums.
// On success: *log_p is the new (clean) node, rb->buf has been freed and set to NULL, returns 0.
// On failure: the partially built node is released, rb->buf is left for the caller to free, and
//  the return value is errno (node allocation failed) or toku_db_badformat().
static int
deserialize_rollback_log_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *log_p,
                                    TOKUTXN txn, struct brt_header *h, struct rbuf *rb) {
    int r;
    size_t arena_initial_size;
    TAGMALLOC(ROLLBACK_LOG_NODE, result);
    if (result==NULL) return errno;

    //printf("Deserializing %lld datasize=%d\n", off, datasize);
    bytevec magic;
    rbuf_literal_bytes(rb, &magic, 8);
    assert(!memcmp(magic, "tokuroll", 8));

    result->layout_version    = rbuf_int(rb);
    assert(result->layout_version == BRT_LAYOUT_VERSION);
    result->layout_version_original = rbuf_int(rb);
    result->layout_version_read_from_disk = result->layout_version;
    result->dirty = FALSE;
    //TODO: Maybe add descriptor (or just descriptor version) here eventually?
    //TODO: This is hard.. everything is shared in a single dictionary.
    rbuf_TXNID(rb, &result->txnid);
    result->sequence = rbuf_ulonglong(rb);
    if (result->txnid == txn->txnid64 && result->sequence > txn->num_rollback_nodes) {
        r = toku_db_badformat();
        goto free_node;
    }
    result->thislogname = rbuf_blocknum(rb);
    if (result->thislogname.b != blocknum.b) {
        r = toku_db_badformat();
        goto free_node;
    }
    result->thishash    = toku_cachetable_hash(h->cf, result->thislogname);
    if (result->thishash != fullhash) {
        r = toku_db_badformat();
        goto free_node;
    }
    result->older       = rbuf_blocknum(rb);
    result->older_hash  = toku_cachetable_hash(h->cf, result->older);
    result->rollentry_resident_bytecount = rbuf_ulonglong(rb);

    // The serializer recorded how much arena space the entries need; size the arena up front.
    arena_initial_size = rbuf_ulonglong(rb);
    result->rollentry_arena = memarena_create_presized(arena_initial_size);

    //Load rollback entries
    assert(rb->size > 4);
    //Start with empty list
    result->oldest_logentry = result->newest_logentry = NULL;
    while (rb->ndone < rb->size) {
        struct roll_entry *item;
        uint32_t rollback_fsize = rbuf_int(rb); //Already read 4.  Rest is 4 smaller
        if (rollback_fsize < 4) {
            // A size smaller than its own 4-byte length field is corrupt; without this check
            // rollback_fsize-4 would wrap around to a huge unsigned length.
            r = toku_db_badformat();
            goto close_arena;
        }
        bytevec item_vec;
        rbuf_literal_bytes(rb, &item_vec, rollback_fsize-4);
        unsigned char* item_buf = (unsigned char*)item_vec;
        r = toku_parse_rollback(item_buf, rollback_fsize-4, &item, result->rollentry_arena);
        if (r!=0) {
            r = toku_db_badformat();
            goto close_arena;
        }
        // Entries are stored newest first, so each item is older than everything linked so far:
        // append it at the oldest end of the list.
        item->prev = NULL;
        if (result->oldest_logentry) {
            result->oldest_logentry->prev = item;
        }
        else {
            result->newest_logentry = item;
        }
        result->oldest_logentry = item;
    }

    toku_free(rb->buf);
    rb->buf = NULL;
    *log_p = result;
    return 0;

close_arena:
    memarena_close(&result->rollentry_arena);
free_node:
    toku_free(result);
    return r;
}
// Effect: Dispatch on the on-disk layout version and deserialize a rollback log node from rb.
//  BRT_LAYOUT_VERSION is the only version handled; any other version is an assertion failure.
// On success *log is set to the new node and 0 is returned; otherwise the deserialization error is returned.
static int
deserialize_rollback_log_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash,
                                              ROLLBACK_LOG_NODE *log,
                                              TOKUTXN txn, struct brt_header *h, struct rbuf *rb) {
    // Only one layout is handled here, so the upgrade flag is never set.
    const int upgraded = 0;
    ROLLBACK_LOG_NODE node = NULL;
    int r = 0;

    if (version != BRT_LAYOUT_VERSION) {
        assert(FALSE);
        return r;
    }

    if (!upgraded) {
        r = deserialize_rollback_log_from_rbuf(blocknum, fullhash, &node, txn, h, rb);
    }
    if (r != 0) {
        return r;
    }
    assert(node);
    *log = node;
    // A node that was upgraded while being read would need to be written back.
    if (upgraded) {
        (*log)->dirty = 1;
    }
    return r;
}
// Read rollback log node from file into struct.  Perform version upgrade if necessary.
// Returns: 0 on success.  Otherwise the error from reading/decompressing the block,
//  toku_db_badformat() if the block's magic is not "tokuroll", or the deserialization error.
int
toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash,
                                    ROLLBACK_LOG_NODE *logp, TOKUTXN txn, struct brt_header *h) {
    toku_trace("deserial start");

    struct rbuf block = {.buf = NULL, .size = 0, .ndone = 0};
    int version_on_disk;
    int r = read_and_decompress_block_from_fd_into_rbuf(fd, blocknum, h, &block, &version_on_disk);

    if (r == 0) {
        u_int8_t *magic = block.buf + uncompressed_magic_offset;
        if (memcmp(magic, "tokuroll", 8) == 0) {
            r = deserialize_rollback_log_from_rbuf_versioned(version_on_disk, blocknum, fullhash, logp, txn, h, &block);
            toku_trace("deserial done");
        } else {
            r = toku_db_badformat();
        }
    }

    // Release the uncompressed block if it is still held at this point.
    if (block.buf) toku_free(block.buf);
    return r;
}
// NOTE: Backwards compatibility functions are in the included .c file(s):
#include "backwards_10.c"

View file

@ -2650,13 +2650,6 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
return toku_brt_maybe_insert(brt, key, val, txn, FALSE, ZERO_LSN, TRUE, BRT_INSERT);
}
// Effect: Record on txn that it has done work.  Does nothing when txn is null.
static void
txn_note_doing_work(TOKUTXN txn) {
    if (!txn) {
        return;
    }
    txn->has_done_work = TRUE;
}
int
toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_iname, int do_fsync, int do_log, LSN *load_lsn) {
int r = 0;
@ -2665,12 +2658,9 @@ toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_ina
//before the (old) file is actually unlinked
TOKULOGGER logger = toku_txn_logger(txn);
BYTESTRING old_iname_bs = {.len=strlen(old_iname),
.data=toku_memdup_in_rollback(txn, old_iname, strlen(old_iname))};
BYTESTRING new_iname_bs = {.len=strlen(new_iname),
.data=toku_memdup_in_rollback(txn, new_iname, strlen(new_iname))};
r = toku_logger_save_rollback_load(txn, old_iname_bs, new_iname_bs);
BYTESTRING old_iname_bs = {.len=strlen(old_iname), .data=(char*)old_iname};
BYTESTRING new_iname_bs = {.len=strlen(new_iname), .data=(char*)new_iname};
r = toku_logger_save_rollback_load(txn, &old_iname_bs, &new_iname_bs);
if (r==0 && do_log && logger) {
TXNID xid = toku_txn_get_txnid(txn);
r = toku_log_load(logger, load_lsn, do_fsync, xid, old_iname_bs, new_iname_bs);
@ -2715,15 +2705,14 @@ int toku_brt_maybe_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn, BOOL oplsn_
int r = 0;
XIDS message_xids;
TXNID xid = toku_txn_get_txnid(txn);
txn_note_doing_work(txn);
if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) {
BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
BYTESTRING keybs = {key->size, key->data};
int need_data = (brt->flags&TOKU_DB_DUPSORT)!=0; // dupsorts don't need the data part
if (need_data) {
BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
r = toku_logger_save_rollback_cmdinsertboth(txn, toku_cachefile_filenum(brt->cf), keybs, databs);
BYTESTRING databs = {val->size, val->data};
r = toku_logger_save_rollback_cmdinsertboth(txn, toku_cachefile_filenum(brt->cf), &keybs, &databs);
} else {
r = toku_logger_save_rollback_cmdinsert (txn, toku_cachefile_filenum(brt->cf), keybs);
r = toku_logger_save_rollback_cmdinsert (txn, toku_cachefile_filenum(brt->cf), &keybs);
}
if (r!=0) return r;
r = toku_txn_note_brt(txn, brt);
@ -2788,10 +2777,9 @@ int toku_brt_maybe_delete(BRT brt, DBT *key, TOKUTXN txn, BOOL oplsn_valid, LSN
int r;
XIDS message_xids;
TXNID xid = toku_txn_get_txnid(txn);
txn_note_doing_work(txn);
if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) {
BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
r = toku_logger_save_rollback_cmddelete(txn, toku_cachefile_filenum(brt->cf), keybs);
BYTESTRING keybs = {key->size, key->data};
r = toku_logger_save_rollback_cmddelete(txn, toku_cachefile_filenum(brt->cf), &keybs);
if (r!=0) return r;
r = toku_txn_note_brt(txn, brt);
if (r!=0) return r;
@ -2975,6 +2963,20 @@ brtheader_log_fassociate_during_checkpoint (CACHEFILE cf, void *header_v) {
return r;
}
// Checkpoint callback for a brt cachefile: if the header records a transaction that created the
// dictionary (or locked it while empty), write a suppress_rollback log entry for that
// (filenum, xid) pair.  Nothing is logged when no such transaction is recorded.
// Returns 0 when nothing needs logging, otherwise the result of toku_log_suppress_rollback.
static int
brtheader_log_suppress_rollback_during_checkpoint (CACHEFILE cf, void *header_v) {
    struct brt_header *h = header_v;
    TXNID xid = h->txnid_that_created_or_locked_when_empty;
    if (xid == TXNID_NONE) {
        return 0; // nothing useful to log
    }

    TOKULOGGER logger  = toku_cachefile_logger(cf);
    FILENUM    filenum = toku_cachefile_filenum(cf);
    return toku_log_suppress_rollback(logger, NULL, 0, filenum, xid);
}
static int brtheader_note_pin_by_checkpoint (CACHEFILE cachefile, void *header_v);
static int brtheader_note_unpin_by_checkpoint (CACHEFILE cachefile, void *header_v);
@ -2997,6 +2999,7 @@ brt_init_header_partial (BRT t) {
toku_cachefile_set_userdata(t->cf,
t->h,
brtheader_log_fassociate_during_checkpoint,
brtheader_log_suppress_rollback_during_checkpoint,
toku_brtheader_close,
toku_brtheader_checkpoint,
toku_brtheader_begin_checkpoint,
@ -3074,6 +3077,7 @@ int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header
toku_cachefile_set_userdata(cf,
(void*)h,
brtheader_log_fassociate_during_checkpoint,
brtheader_log_suppress_rollback_during_checkpoint,
toku_brtheader_close,
toku_brtheader_checkpoint,
toku_brtheader_begin_checkpoint,
@ -3129,7 +3133,7 @@ verify_builtin_comparisons_consistent(BRT t, u_int32_t flags) {
// This is the actual open, used for various purposes, such as normal use, recovery, and redirect.
// fname_in_env is the iname, relative to the env_dir (data_dir is already in iname as prefix)
static int
brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, int recovery_force_fcreate, FILENUM use_filenum, DICTIONARY_ID use_dictionary_id) {
brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, FILENUM use_filenum, DICTIONARY_ID use_dictionary_id) {
int r;
BOOL txn_created = FALSE;
@ -3147,11 +3151,11 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET
assert(is_create || !only_create);
t->db = db;
BOOL log_fopen = FALSE; // set true if we're opening a pre-existing file
BOOL did_create = FALSE;
FILENUM reserved_filenum = use_filenum;
{
int fd = -1;
BOOL did_create = FALSE;
r = brt_open_file(fname_in_cwd, &fd);
FILENUM reserved_filenum = use_filenum;
int use_reserved_filenum = reserved_filenum.fileid != FILENUM_NONE.fileid;
if (r==ENOENT && is_create) {
toku_cachetable_reserve_filenum(cachetable, &reserved_filenum, use_reserved_filenum, reserved_filenum);
@ -3164,6 +3168,12 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET
if (use_reserved_filenum) assert(reserved_filenum.fileid == use_filenum.fileid);
did_create = TRUE;
mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO;
if (txn) {
BYTESTRING bs = { .len=strlen(fname_in_env), .data = (char*)fname_in_env };
r = toku_logger_save_rollback_fcreate(txn, reserved_filenum, &bs); // bs is a copy of the fname relative to the environment
if (r != 0) goto died1;
}
txn_created = (BOOL)(txn!=NULL);
r = toku_logger_log_fcreate(txn, fname_in_env, reserved_filenum, mode, t->flags, &(t->temp_descriptor));
if (r!=0) goto died1;
r = brt_create_file(t, fname_in_cwd, &fd);
@ -3176,14 +3186,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET
fname_in_env,
use_reserved_filenum||did_create, reserved_filenum, did_create);
if (r != 0) goto died1;
if (did_create || recovery_force_fcreate) {
if (txn) {
BYTESTRING bs = { .len=strlen(fname_in_env), .data = toku_strdup_in_rollback(txn, fname_in_env) };
r = toku_logger_save_rollback_fcreate(txn, toku_cachefile_filenum(t->cf), bs); // bs is a copy of the fname relative to the environment
if (r != 0) goto died_after_open;
}
txn_created = (BOOL)(txn!=NULL);
} else
if (!did_create)
log_fopen = TRUE; //Log of fopen must be delayed till flags are available
}
if (r!=0) {
@ -3294,7 +3297,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET
if (t->db) t->db->descriptor = &t->h->descriptor.dbt;
if (txn_created) {
assert(txn);
assert(t->h->txnid_that_created_or_locked_when_empty == 0); // Uses 0 for no transaction.
assert(t->h->txnid_that_created_or_locked_when_empty == TXNID_NONE);
t->h->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(txn);
r = toku_txn_note_brt(txn, t);
assert(r==0);
@ -3312,11 +3315,11 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET
// Open a brt for the purpose of recovery, which requires that the brt be open to a pre-determined FILENUM. (dict_id is assigned by the brt_open() function.)
int
toku_brt_open_recovery(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, int recovery_force_fcreate, FILENUM use_filenum) {
toku_brt_open_recovery(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, FILENUM use_filenum) {
int r;
assert(use_filenum.fileid != FILENUM_NONE.fileid);
r = brt_open(t, fname_in_env, is_create, only_create, cachetable,
txn, db, recovery_force_fcreate, use_filenum, DICTIONARY_ID_NONE);
txn, db, use_filenum, DICTIONARY_ID_NONE);
return r;
}
@ -3324,7 +3327,7 @@ toku_brt_open_recovery(BRT t, const char *fname_in_env, int is_create, int only_
int
toku_brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db) {
int r;
r = brt_open(t, fname_in_env, is_create, only_create, cachetable, txn, db, FALSE, FILENUM_NONE, DICTIONARY_ID_NONE);
r = brt_open(t, fname_in_env, is_create, only_create, cachetable, txn, db, FILENUM_NONE, DICTIONARY_ID_NONE);
return r;
}
@ -3359,7 +3362,7 @@ brt_open_for_redirect(BRT *new_brtp, const char *fname_in_env, TOKUTXN txn, BRT
assert(r==0);
}
CACHETABLE ct = toku_cachefile_get_cachetable(old_brt->cf);
r = brt_open(t, fname_in_env, 0, 0, ct, txn, old_brt->db, FALSE, FILENUM_NONE, old_h->dict_id);
r = brt_open(t, fname_in_env, 0, 0, ct, txn, old_brt->db, FILENUM_NONE, old_h->dict_id);
assert(r==0);
if (old_h->descriptor.version==0) {
assert(t->h->descriptor.version == 0);
@ -3400,7 +3403,7 @@ brt_redirect_db (BRT brt_to, BRT brt_from) {
}
static int
redirect_brt_close_delayed(DB *db, u_int32_t UU(flags)) {
fake_db_brt_close_delayed(DB *db, u_int32_t UU(flags)) {
BRT brt_to_close = db->api_internal;
char *error_string = NULL;
int r = toku_close_brt(brt_to_close, &error_string);
@ -3435,7 +3438,7 @@ toku_brt_header_close_redirected_brts(struct brt_header * h) {
assert(which == num_brts);
for (which = 0; which < num_brts; which++) {
int r;
r = toku_brt_db_delay_closed(brts[which], dbs[which], redirect_brt_close_delayed, 0);
r = toku_brt_db_delay_closed(brts[which], dbs[which], fake_db_brt_close_delayed, 0);
assert(r==0);
}
return 0;
@ -3590,7 +3593,6 @@ toku_dictionary_redirect (const char *dst_fname_in_env, BRT old_brt, TOKUTXN txn
}
if (txn) {
txn_note_doing_work(txn);
r = toku_txn_note_brt(txn, old_brt); // mark old brt as touched by this txn
assert(r==0);
}
@ -3618,6 +3620,8 @@ toku_dictionary_redirect (const char *dst_fname_in_env, BRT old_brt, TOKUTXN txn
assert(new_h->txnid_that_created_or_locked_when_empty == TXNID_NONE);
TXNID xid = toku_txn_get_txnid(txn);
new_h->txnid_that_created_or_locked_when_empty = xid;
r = toku_log_suppress_rollback(txn->logger, NULL, 0, new_filenum, xid);
assert(r==0);
}
cleanup:
@ -3856,6 +3860,8 @@ toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **malloc
if (h->panic) {
r = h->panic;
} else if (h->dictionary_opened) { //Otherwise header has never fully been created.
assert(h->cf == cachefile);
TOKULOGGER logger = toku_cachefile_logger(cachefile);
LSN lsn = ZERO_LSN;
//Get LSN
if (oplsn_valid) {
@ -3868,17 +3874,19 @@ toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **malloc
else {
//Get LSN from logger
lsn = ZERO_LSN; // if there is no logger, we use zero for the lsn
TOKULOGGER logger = toku_cachefile_logger(cachefile);
if (logger) {
//NEED NAME
char* fname_in_env = toku_cachefile_fname_in_env(cachefile);
assert(fname_in_env);
BYTESTRING bs = {.len=strlen(fname_in_env), .data=fname_in_env};
r = toku_log_fclose(logger, &lsn, h->dirty, bs, toku_cachefile_filenum(cachefile), h->flags); // flush the log on close (if new header is being written), otherwise it might not make it out.
r = toku_log_fclose(logger, &lsn, h->dirty, bs, toku_cachefile_filenum(cachefile)); // flush the log on close (if new header is being written), otherwise it might not make it out.
if (r!=0) return r;
}
}
if (h->dirty) { // this is the only place this bit is tested (in currentheader)
if (logger) { //Rollback cachefile MUST NOT BE CLOSED DIRTY
//It can be checkpointed only via 'checkpoint'
assert(logger->rollback_cachefile != cachefile);
}
int r2;
//assert(lsn.lsn!=0);
r2 = toku_brtheader_begin_checkpoint(cachefile, fd, lsn, header_v);
@ -5315,11 +5323,10 @@ int toku_brt_maybe_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn, BOOL op
int r;
XIDS message_xids;
TXNID xid = toku_txn_get_txnid(txn);
txn_note_doing_work(txn);
if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) {
BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
r = toku_logger_save_rollback_cmddeleteboth(txn, toku_cachefile_filenum(brt->cf), keybs, databs);
BYTESTRING keybs = {key->size, key->data};
BYTESTRING databs = {val->size, val->data};
r = toku_logger_save_rollback_cmddeleteboth(txn, toku_cachefile_filenum(brt->cf), &keybs, &databs);
if (r!=0) return r;
r = toku_txn_note_brt(txn, brt);
if (r!=0) return r;
@ -5671,8 +5678,8 @@ int toku_brt_destroy(void) {
}
//Return TRUE if empty, FALSE if not empty.
static BOOL
brt_is_empty (BRT brt) {
BOOL
toku_brt_is_empty (BRT brt) {
BRT_CURSOR cursor;
int r, r2;
BOOL is_empty;
@ -5687,12 +5694,12 @@ brt_is_empty (BRT brt) {
}
int
toku_brt_note_table_lock (BRT brt, TOKUTXN txn)
{
toku_brt_note_table_lock (BRT brt, TOKUTXN txn, BOOL ignore_not_empty) {
int r = 0;
if (brt->h->txnid_that_created_or_locked_when_empty != toku_txn_get_txnid(txn) &&
brt_is_empty(brt) &&
brt->h->txnid_that_created_or_locked_when_empty == 0) {
(ignore_not_empty || toku_brt_is_empty(brt)) &&
brt->h->txnid_that_created_or_locked_when_empty == TXNID_NONE)
{
brt->h->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(txn);
r = toku_txn_note_brt(txn, brt);
assert(r==0);
@ -5711,7 +5718,7 @@ LSN toku_brt_checkpoint_lsn(BRT brt) {
return brt->h->checkpoint_lsn;
}
static int toku_brt_header_set_panic(struct brt_header *h, int panic, char *panic_string) {
int toku_brt_header_set_panic(struct brt_header *h, int panic, char *panic_string) {
if (h->panic == 0) {
h->panic = panic;
if (h->panic_string)
@ -5743,7 +5750,7 @@ int toku_logger_log_fdelete (TOKUTXN txn, const char *fname, FILENUM filenum, u_
// Prepare to remove a dictionary from the database when this transaction is committed:
// - if cachetable has file open, mark it as in use so that cf remains valid until we're done
// - mark transaction as NEED fsync on commit
// - make entry in rolltmp log
// - make entry in rollback log
// - make fdelete entry in recovery log
int toku_brt_remove_on_commit(TOKUTXN txn, DBT* iname_in_env_dbt_p) {
assert(txn);
@ -5779,12 +5786,9 @@ int toku_brt_remove_on_commit(TOKUTXN txn, DBT* iname_in_env_dbt_p) {
toku_txn_force_fsync_on_commit(txn); //If the txn commits, the commit MUST be in the log
//before the file is actually unlinked
{
BYTESTRING iname_in_env_bs = {
.len=strlen(iname_in_env),
.data = toku_strdup_in_rollback(txn, iname_in_env)
};
// make entry in rolltmp log
r = toku_logger_save_rollback_fdelete(txn, was_open, filenum, iname_in_env_bs);
BYTESTRING iname_in_env_bs = { .len=strlen(iname_in_env), .data = (char*)iname_in_env };
// make entry in rollback log
r = toku_logger_save_rollback_fdelete(txn, was_open, filenum, &iname_in_env_bs);
assert(r==0); //On error we would need to remove the CF reference, which is complicated.
}
if (r==0)
@ -5794,7 +5798,7 @@ int toku_brt_remove_on_commit(TOKUTXN txn, DBT* iname_in_env_dbt_p) {
}
//
// Non-transaction version of fdelete
int toku_brt_remove_now(CACHETABLE ct, DBT* iname_in_env_dbt_p) {
int r;
const char *iname_in_env = iname_in_env_dbt_p->data;

View file

@ -52,7 +52,7 @@ int brt_set_cachetable(BRT, CACHETABLE);
int toku_brt_open(BRT, const char *fname_in_env,
int is_create, int only_create, CACHETABLE ct, TOKUTXN txn, DB *db);
int toku_brt_open_recovery(BRT, const char *fname_in_env,
int is_create, int only_create, CACHETABLE ct, TOKUTXN txn, DB *db, int recovery_force_fcreate, FILENUM use_filenum);
int is_create, int only_create, CACHETABLE ct, TOKUTXN txn, DB *db, FILENUM use_filenum);
int toku_brt_remove_subdb(BRT brt, const char *dbname, u_int32_t flags);
@ -206,12 +206,14 @@ void toku_maybe_truncate_cachefile (CACHEFILE cf, int fd, u_int64_t size_used);
int maybe_preallocate_in_file (int fd, u_int64_t size);
// Effect: If file size is less than SIZE, make it bigger by either doubling it or growing by 16MB whichever is less.
int toku_brt_note_table_lock (BRT brt, TOKUTXN txn);
int toku_brt_note_table_lock (BRT brt, TOKUTXN txn, BOOL ignore_not_empty);
// Effect: Record the fact that the BRT has a table lock (and thus no other txn will modify it until this txn completes). As a result, we can limit the amount of information in the rollback data structure.
int toku_brt_zombie_needed (BRT brt);
int toku_brt_get_fragmentation(BRT brt, TOKU_DB_FRAGMENTATION report);
int toku_brt_header_set_panic(struct brt_header *h, int panic, char *panic_string);
BOOL toku_brt_is_empty (BRT brt);
double get_tdiff(void) __attribute__((__visibility__("default")));

View file

@ -33,6 +33,7 @@ typedef u_int64_t TXNID;
#define TXNID_NONE ((TXNID)0)
// A block number.  It is wrapped in a struct so that mixing it up with a plain integer is a type error.
typedef struct s_blocknum { int64_t b; } BLOCKNUM;
// BLOCKNUM whose value is 0 (presumably the "no rollback log node" sentinel -- confirm against users).
#define ROLLBACK_NONE ((BLOCKNUM){0})
// Wrap a raw 64-bit block number in a BLOCKNUM.
static inline BLOCKNUM make_blocknum(int64_t b) {
    BLOCKNUM wrapped;
    wrapped.b = b;
    return wrapped;
}
@ -70,6 +71,7 @@ typedef enum __toku_bool { FALSE=0, TRUE=1} BOOL;
typedef struct tokulogger *TOKULOGGER;
#define NULL_LOGGER ((TOKULOGGER)0)
typedef struct tokutxn *TOKUTXN;
typedef struct txninfo *TXNINFO;
#define NULL_TXN ((TOKUTXN)0)
struct logged_btt_pair {
@ -121,5 +123,8 @@ typedef int (*generate_row_for_del_func)(DB *dest_db, DB *src_db, DBT *dest_val,
#define UU(x) x __attribute__((__unused__))
typedef struct memarena *MEMARENA;
typedef struct rollback_log_node *ROLLBACK_LOG_NODE;
#endif

View file

@ -100,8 +100,6 @@ struct ctpair {
PAIR next,prev; // In LRU list.
PAIR hash_chain;
LSN modified_lsn; // What was the LSN when modified (undefined if not dirty)
LSN written_lsn; // What was the LSN when written (we need to get this information when we fetch)
BOOL checkpoint_pending; // If this is on, then we have got to write the pair out to disk before modifying it.
PAIR pending_next;
@ -155,6 +153,8 @@ struct cachetable {
struct workqueue wq; // async work queue
THREADPOOL threadpool; // pool of worker threads
LSN lsn_of_checkpoint_in_progress;
u_int32_t checkpoint_num_files; // how many cachefiles are in the checkpoint
u_int32_t checkpoint_num_txns; // how many transactions are in the checkpoint
PAIR pending_head; // list of pairs marked with checkpoint_pending
struct rwlock pending_lock; // multiple writer threads, single checkpoint thread
struct minicron checkpointer; // the periodic checkpointing thread
@ -165,7 +165,6 @@ struct cachetable {
BOOL set_env_dir; //Can only set env_dir once
};
// Lock the cachetable
static inline void cachefiles_lock(CACHETABLE ct) {
int r = toku_pthread_mutex_lock(&ct->cachefiles_mutex); assert(r == 0);
@ -224,6 +223,7 @@ struct cachefile {
void *userdata;
int (*log_fassociate_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log all open files.
int (*log_suppress_rollback_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log which files need rollbacks suppressed
int (*close_userdata)(CACHEFILE cf, int fd, void *userdata, char **error_string, BOOL lsnvalid, LSN); // when closing the last reference to a cachefile, first call this function.
int (*begin_checkpoint_userdata)(CACHEFILE cf, int fd, LSN lsn_of_checkpoint, void *userdata); // before checkpointing cachefiles call this function.
int (*checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // when checkpointing a cachefile, call this function.
@ -1239,8 +1239,7 @@ static PAIR cachetable_insert_at(CACHETABLE ct,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
void *extraargs,
enum cachetable_dirty dirty,
LSN written_lsn) {
enum cachetable_dirty dirty) {
TAGMALLOC(PAIR, p);
assert(p);
memset(p, 0, sizeof *p);
@ -1255,8 +1254,6 @@ static PAIR cachetable_insert_at(CACHETABLE ct,
p->flush_callback = flush_callback;
p->fetch_callback = fetch_callback;
p->extraargs = extraargs;
p->modified_lsn.lsn = 0;
p->written_lsn = written_lsn;
p->fullhash = fullhash;
p->next = p->prev = 0;
rwlock_init(&p->rwlock);
@ -1321,7 +1318,7 @@ int toku_cachetable_put(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, v
}
// flushing could change the table size, but won't change the fullhash
cachetable_puts++;
PAIR p = cachetable_insert_at(ct, cachefile, key, value, CTPAIR_IDLE, fullhash, size, flush_callback, fetch_callback, extraargs, CACHETABLE_DIRTY, ZERO_LSN);
PAIR p = cachetable_insert_at(ct, cachefile, key, value, CTPAIR_IDLE, fullhash, size, flush_callback, fetch_callback, extraargs, CACHETABLE_DIRTY);
assert(p);
rwlock_read_lock(&p->rwlock, ct->mutex);
note_hash_count(count);
@ -1465,7 +1462,7 @@ int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t ful
int r;
// Note. hashit(t,key) may have changed as a result of flushing. But fullhash won't have changed.
{
p = cachetable_insert_at(ct, cachefile, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN, ZERO_LSN);
p = cachetable_insert_at(ct, cachefile, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN);
assert(p);
get_and_pin_footprint = 10;
rwlock_write_lock(&p->rwlock, ct->mutex);
@ -1619,7 +1616,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash,
// if not found then create a pair in the READING state and fetch it
if (p == 0) {
cachetable_prefetches++;
p = cachetable_insert_at(ct, cf, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN, ZERO_LSN);
p = cachetable_insert_at(ct, cf, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN);
assert(p);
rwlock_write_lock(&p->rwlock, ct->mutex);
#if DO_WORKER_THREAD
@ -1906,18 +1903,53 @@ int toku_cachetable_unpin_and_remove (CACHEFILE cachefile, CACHEKEY key) {
}
static int
log_open_txn (OMTVALUE txnv, u_int32_t UU(index), void *loggerv) {
TOKUTXN txn = txnv;
TOKULOGGER logger = loggerv;
if (toku_logger_txn_parent(txn)==NULL) { // only have to log the open root transactions
int r = toku_log_xstillopen(logger, NULL, 0,
toku_txn_get_txnid(txn),
toku_txn_get_txnid(toku_logger_txn_parent(txn)));
assert(r==0);
}
set_filenum_in_array(OMTVALUE brtv, u_int32_t index, void*arrayv) {
FILENUM *array = arrayv;
BRT brt = brtv;
array[index] = toku_cachefile_filenum(brt->cf);
return 0;
}
// OMT iteration callback (used over logger->live_txns when a checkpoint begins):
// write one xstillopen record for the transaction TXNV through that transaction's own logger.
// The record carries the txn's xid, its parent's xid, the filenum of every brt in
// txn->open_brts, and the txn's rollback-log bookkeeping fields.
// INDEX and EXTRA are unused.  Always returns 0; a logging failure trips the assert.
static int
log_open_txn (OMTVALUE txnv, u_int32_t UU(index), void *UU(extra)) {
    TOKUTXN    txn    = txnv;
    TOKULOGGER logger = txn->logger;
    FILENUMS open_filenums;
    uint32_t num_filenums = toku_omt_size(txn->open_brts);
    // A variable-length array must have a strictly positive bound (a zero bound is
    // undefined behavior), so reserve one slot even when the txn has no open brts.
    // open_filenums.num still reports the true count.
    FILENUM array[num_filenums > 0 ? num_filenums : 1];
    {
        open_filenums.num      = num_filenums;
        open_filenums.filenums = array;
        //Fill in open_filenums
        int r = toku_omt_iterate(txn->open_brts, set_filenum_in_array, array);
        assert(r==0);
    }
    int r = toku_log_xstillopen(logger, NULL, 0,
                                toku_txn_get_txnid(txn),
                                toku_txn_get_txnid(toku_logger_txn_parent(txn)),
                                txn->rollentry_raw_count,
                                open_filenums,
                                txn->force_fsync_on_commit,
                                txn->num_rollback_nodes,
                                txn->num_rollentries,
                                txn->spilled_rollback_head,
                                txn->spilled_rollback_tail,
                                txn->current_rollback);
    assert(r==0);
    return 0;
}
// OMT iteration callback: if the transaction TXNV currently has an in-progress rollback log
// node pinned, unpin it.  INDEX and EXTRA are unused.
// Returns 0 (the unpin result is asserted to be 0).
static int
unpin_rollback_log_for_checkpoint (OMTVALUE txnv, u_int32_t UU(index), void *UU(extra)) {
    TOKUTXN txn = txnv;
    if (!txn->pinned_inprogress_rollback_log) {
        return 0; // nothing pinned, nothing to do
    }
    int r = toku_rollback_log_unpin(txn, txn->pinned_inprogress_rollback_log);
    assert(r==0);
    return r;
}
// TODO: #1510 locking of cachetable is suspect
// verify correct algorithm overall
@ -1931,7 +1963,17 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) {
{
unsigned i;
if (logger) { // Unpin all 'inprogress rollback log nodes' pinned by transactions
int r = toku_omt_iterate(logger->live_txns,
unpin_rollback_log_for_checkpoint,
NULL);
assert(r==0);
}
cachetable_lock(ct);
//Initialize accountability counters
ct->checkpoint_num_files = 0;
ct->checkpoint_num_txns = 0;
//Make list of cachefiles to be included in checkpoint.
//If refcount is 0, the cachefile is closing (performing a local checkpoint)
{
@ -1960,11 +2002,6 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) {
assert(r==0);
ct->lsn_of_checkpoint_in_progress = begin_lsn;
}
// Log all the open transactions
{
int r = toku_omt_iterate(logger->live_txns, log_open_txn, logger);
assert(r==0);
}
// Log all the open files
{
//Must loop through ALL open files (even if not included in checkpoint).
@ -1973,6 +2010,26 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) {
for (cf = ct->cachefiles; cf; cf=cf->next) {
if (cf->log_fassociate_during_checkpoint) {
int r = cf->log_fassociate_during_checkpoint(cf, cf->userdata);
ct->checkpoint_num_files++;
assert(r==0);
}
}
cachefiles_unlock(ct);
}
// Log all the open transactions MUST BE AFTER OPEN FILES
{
ct->checkpoint_num_txns = toku_omt_size(logger->live_txns);
int r = toku_omt_iterate(logger->live_txns, log_open_txn, NULL);
assert(r==0);
}
// Log rollback suppression for all the open files MUST BE AFTER TXNS
{
//Must loop through ALL open files (even if not included in checkpoint).
CACHEFILE cf;
cachefiles_lock(ct);
for (cf = ct->cachefiles; cf; cf=cf->next) {
if (cf->log_suppress_rollback_during_checkpoint) {
int r = cf->log_suppress_rollback_during_checkpoint(cf, cf->userdata);
assert(r==0);
}
}
@ -2115,7 +2172,10 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger,
if (logger) {
int r = toku_log_end_checkpoint(logger, NULL,
1, // want the end_checkpoint to be fsync'd
ct->lsn_of_checkpoint_in_progress.lsn, 0);
ct->lsn_of_checkpoint_in_progress.lsn,
0,
ct->checkpoint_num_files,
ct->checkpoint_num_txns);
assert(r==0);
toku_logger_note_checkpoint(logger, ct->lsn_of_checkpoint_in_progress);
}
@ -2262,6 +2322,7 @@ void
toku_cachefile_set_userdata (CACHEFILE cf,
void *userdata,
int (*log_fassociate_during_checkpoint)(CACHEFILE, void*),
int (*log_suppress_rollback_during_checkpoint)(CACHEFILE, void*),
int (*close_userdata)(CACHEFILE, int, void*, char**, BOOL, LSN),
int (*checkpoint_userdata)(CACHEFILE, int, void*),
int (*begin_checkpoint_userdata)(CACHEFILE, int, LSN, void*),
@ -2270,6 +2331,7 @@ toku_cachefile_set_userdata (CACHEFILE cf,
int (*note_unpin_by_checkpoint)(CACHEFILE, void*)) {
cf->userdata = userdata;
cf->log_fassociate_during_checkpoint = log_fassociate_during_checkpoint;
cf->log_suppress_rollback_during_checkpoint = log_suppress_rollback_during_checkpoint;
cf->close_userdata = close_userdata;
cf->checkpoint_userdata = checkpoint_userdata;
cf->begin_checkpoint_userdata = begin_checkpoint_userdata;

View file

@ -123,6 +123,7 @@ typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int3
void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata,
int (*log_fassociate_during_checkpoint)(CACHEFILE, void*),
int (*log_suppress_rollback_during_checkpoint)(CACHEFILE, void*),
int (*close_userdata)(CACHEFILE, int, void*, char **/*error_string*/, BOOL, LSN),
int (*checkpoint_userdata)(CACHEFILE, int, void*),
int (*begin_checkpoint_userdata)(CACHEFILE, int, LSN, void*),

View file

@ -218,7 +218,6 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger,
checkpoint_footprint = 40;
time_last_checkpoint_begin = time(NULL);
r = toku_cachetable_begin_checkpoint(ct, logger);
LSN oldest_live_lsn = toku_logger_get_oldest_living_lsn(logger);
multi_operation_checkpoint_unlock();
ydb_unlock();
@ -230,7 +229,7 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger,
r = toku_cachetable_end_checkpoint(ct, logger, ydb_lock, ydb_unlock, callback2_f, extra2);
}
if (r==0 && logger) {
LSN trim_lsn = (oldest_live_lsn.lsn < logger->checkpoint_lsn.lsn) ? oldest_live_lsn : logger->checkpoint_lsn;
LSN trim_lsn = logger->last_completed_checkpoint_lsn;
r = toku_logger_maybe_trim_log(logger, trim_lsn);
}

View file

@ -85,7 +85,7 @@ struct tokulogger {
// To access these, you must have the output condition lock.
LSN written_lsn; // the last lsn written
LSN fsynced_lsn; // What is the LSN of the highest fsynced log entry (accessed only while holding the output lock, and updated only when the output lock and output permission are held)
LSN checkpoint_lsn; // What is the LSN of the most recent completed checkpoint.
LSN last_completed_checkpoint_lsn; // What is the LSN of the most recent completed checkpoint.
long long next_log_file_number;
struct logbuf outbuf; // data being written to the file
int n_in_file; // The amount of data in the current file
@ -101,6 +101,7 @@ struct tokulogger {
u_int64_t swap_ctr; // how many times have input/output log buffers been swapped
void (*remove_finalize_callback) (DICTIONARY_ID, void*); // ydb-level callback to be called when a transaction that ...
void * remove_finalize_callback_extra; // ... deletes a file is committed or when one that creates a file is aborted.
CACHEFILE rollback_cachefile;
};
int toku_logger_find_next_unused_log_file(const char *directory, long long *result);
@ -116,25 +117,36 @@ struct tokutxn {
u_int64_t txnid64; /* this happens to be the first lsn */
TOKULOGGER logger;
TOKUTXN parent;
LSN last_lsn; /* Everytime anything is logged, update the LSN. (We need to atomically record the LSN along with writing into the log.) */
LSN first_lsn; /* The first lsn in the transaction. */
struct roll_entry *oldest_logentry,*newest_logentry; /* Only logentries with rollbacks are here. There is a list going from newest to oldest. */
MEMARENA rollentry_arena;
size_t rollentry_resident_bytecount; // How many bytes for the rollentries that are stored in main memory.
char *rollentry_filename;
int rollentry_fd; // If we spill the roll_entries, we write them into this fd.
toku_off_t rollentry_filesize; // How many bytes are in the rollentry file (this is the uncompressed bytes. If the file is compressed it may actually be smaller (or even larger with header information))
u_int64_t rollentry_raw_count; // the total count of every byte in the transaction and all its children.
OMT open_brts; // a collection of the brts that we touched. Indexed by filenum.
XIDS xids; //Represents the xid list
BOOL force_fsync_on_commit; //This transaction NEEDS an fsync once (if) it commits. (commit means root txn)
BOOL has_done_work; //If this transaction has not done work, there is no need to fsync.
TXN_PROGRESS_POLL_FUNCTION progress_poll_fun;
void * progress_poll_fun_extra;
uint64_t num_rollback_nodes;
uint64_t num_rollentries;
uint64_t num_rollentries_processed;
BLOCKNUM spilled_rollback_head;
uint32_t spilled_rollback_head_hash;
BLOCKNUM spilled_rollback_tail;
uint32_t spilled_rollback_tail_hash;
BLOCKNUM current_rollback;
uint32_t current_rollback_hash;
BOOL recovered_from_checkpoint;
ROLLBACK_LOG_NODE pinned_inprogress_rollback_log;
};
// Plain-data summary of a transaction's rollback state.
// Every field except num_brts shares its name with a member of struct tokutxn.
// open_brts is a raw BRT pointer here (an OMT in struct tokutxn).
// NOTE(review): presumably open_brts points at num_brts entries -- confirm against the code that fills this in.
struct txninfo {
uint64_t rollentry_raw_count; // the total count of every byte in the transaction and all its children.
// Number of brts (presumably the length of open_brts -- confirm).
uint32_t num_brts;
// The brts associated with the transaction (presumably those it touched, as in struct tokutxn -- confirm).
BRT *open_brts;
BOOL force_fsync_on_commit; //This transaction NEEDS an fsync once (if) it commits. (commit means root txn)
// Rollback log accounting: node count and entry count.
uint64_t num_rollback_nodes;
uint64_t num_rollentries;
// Block numbers of rollback log nodes: head and tail of the spilled nodes, and the current node
// (meaning inferred from the field names -- confirm).
BLOCKNUM spilled_rollback_head;
BLOCKNUM spilled_rollback_tail;
BLOCKNUM current_rollback;
};
static inline int toku_logsizeof_u_int8_t (u_int32_t v __attribute__((__unused__))) {
@ -180,5 +192,4 @@ static inline char *fixup_fname(BYTESTRING *f) {
return fname;
}
int toku_read_rollback_backwards(BREAD, struct roll_entry **item, MEMARENA);
#endif

View file

@ -11,7 +11,6 @@
#include "../include/db.h"
#include "brttypes.h"
#include "memory.h"
#include "bread.h"
#include "x1764.h"
typedef void(*voidfp)(void);

View file

@ -41,8 +41,6 @@ struct logtype {
// In the fields, don't mention the command, the LSN, the CRC or the trailing LEN.
int logformat_version_number = 0;
const struct logtype rollbacks[] = {
//TODO: #2037 Add dname
{"fdelete", 'U', FA{{"u_int8_t", "file_was_open", 0},
@ -72,7 +70,12 @@ const struct logtype rollbacks[] = {
{"FILENUM", "filenum", 0},
{"BYTESTRING", "key", 0},
NULLFIELD}},
{"rollinclude", 'r', FA{{"BYTESTRING", "fname", 0},
{"rollinclude", 'r', FA{{"TXNID", "xid", 0},
{"u_int64_t", "num_nodes", 0},
{"BLOCKNUM", "spilled_head", 0},
{"u_int32_t", "spilled_head_hash", 0},
{"BLOCKNUM", "spilled_tail", 0},
{"u_int32_t", "spilled_tail_hash", 0},
NULLFIELD}},
{"tablelock_on_empty_table", 'L', FA{{"FILENUM", "filenum", 0},
NULLFIELD}},
@ -82,46 +85,44 @@ const struct logtype rollbacks[] = {
{"dictionary_redirect", 'R', FA{{"FILENUM", "old_filenum", 0},
{"FILENUM", "new_filenum", 0},
NULLFIELD}},
// {"fclose", 'c', FA{{"FILENUM", "filenum", 0},
// {"BYTESTRING", "fname", 0},
// NULLFIELD}},
// {"deleteatleaf", 'd', FA{{"FILENUM", "filenum", 0}, // Note a delete for rollback. The delete takes place in a leaf.
// {"BYTESTRING", "key", 0},
// {"BYTESTRING", "data", 0},
// NULLFIELD}},
// {"insertatleaf", 'i', FA{{"FILENUM", "filenum", 0}, // Note an insert for rollback. The insert takes place in a leaf.
// {"BYTESTRING", "key", 0},
// {"BYTESTRING", "data", 0},
// NULLFIELD}},
// {"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0},
// {"DISKOFFARRAY", "parents", 0},
// {"DISKOFF", "diskoff", 0},
// NULLFIELD}},
{0,0,FA{NULLFIELD}}
};
const struct logtype logtypes[] = {
// Records produced by checkpoints
{"begin_checkpoint", 'x', FA{{"u_int64_t", "timestamp", 0}, NULLFIELD}},
{"end_checkpoint", 'X', FA{{"TXNID", "txnid", 0}, {"u_int64_t", "timestamp", 0}, NULLFIELD}}, // TXNID is LSN of begin_checkpoint
{"end_checkpoint", 'X', FA{{"TXNID", "xid", 0}, // xid is LSN of begin_checkpoint
{"u_int64_t", "timestamp", 0},
{"u_int32_t", "num_fassociate_entries", 0}, // how many files were checkpointed
{"u_int32_t", "num_xstillopen_entries", 0}, // how many txns were checkpointed
NULLFIELD}},
//TODO: #2037 Add dname
{"fassociate", 'f', FA{{"FILENUM", "filenum", 0},
{"u_int32_t", "treeflags", 0},
{"BYTESTRING", "iname", 0}, // pathname of file
NULLFIELD}},
{"xstillopen", 's', FA{{"TXNID", "txnid", 0},
{"TXNID", "parent", 0},
NULLFIELD}}, // only record root transactions
//We do not use a TXNINFO struct since recovery log has
//FILENUMS and TOKUTXN has BRTs (for open_brts)
{"xstillopen", 's', FA{{"TXNID", "xid", 0},
{"TXNID", "parentxid", 0},
{"u_int64_t", "rollentry_raw_count", 0},
{"FILENUMS", "open_filenums", 0},
{"u_int8_t", "force_fsync_on_commit", 0},
{"u_int64_t", "num_rollback_nodes", 0},
{"u_int64_t", "num_rollentries", 0},
{"BLOCKNUM", "spilled_rollback_head", 0},
{"BLOCKNUM", "spilled_rollback_tail", 0},
{"BLOCKNUM", "current_rollback", 0},
NULLFIELD}}, // record all transactions
{"suppress_rollback", 'S', FA{{"FILENUM", "filenum", 0},
{"TXNID", "xid", 0},
NULLFIELD}},
// Records produced by transactions
{"commit", 'C', FA{{"TXNID", "txnid", 0},NULLFIELD}},
{"xabort", 'q', FA{{"TXNID", "txnid", 0},NULLFIELD}},
{"xbegin", 'b', FA{{"TXNID", "parenttxnid", 0},NULLFIELD}},
{"xbegin", 'b', FA{{"TXNID", "parentxid", 0},NULLFIELD}},
{"xcommit",'C', FA{{"TXNID", "xid", 0},NULLFIELD}},
{"xabort", 'q', FA{{"TXNID", "xid", 0},NULLFIELD}},
//TODO: #2037 Add dname
{"fdelete", 'U', FA{{"TXNID", "txnid", 0},
{"BYTESTRING", "iname", 0},
NULLFIELD}},
//TODO: #2037 Add dname
{"fcreate", 'F', FA{{"TXNID", "txnid", 0},
{"fcreate", 'F', FA{{"TXNID", "xid", 0},
{"FILENUM", "filenum", 0},
{"BYTESTRING", "iname", 0},
{"u_int32_t", "mode", "0%o"},
@ -137,21 +138,24 @@ const struct logtype logtypes[] = {
//TODO: #2037 Add dname
{"fclose", 'e', FA{{"BYTESTRING", "iname", 0},
{"FILENUM", "filenum", 0},
{"u_int32_t", "treeflags", 0},
NULLFIELD}},
//TODO: #2037 Add dname
{"fdelete", 'U', FA{{"TXNID", "xid", 0},
{"BYTESTRING", "iname", 0},
NULLFIELD}},
{"tablelock_on_empty_table", 'L', FA{{"FILENUM", "filenum", 0},
{"TXNID", "xid", 0},
NULLFIELD}},
{"enq_insert", 'I', FA{{"FILENUM", "filenum", 0},
{"TXNID", "xid", 0},
{"BYTESTRING", "key", 0},
{"BYTESTRING", "value", 0},
NULLFIELD}},
{"enq_insert_no_overwrite", 'i', FA{{"FILENUM", "filenum", 0},
{"TXNID", "xid", 0},
{"BYTESTRING", "key", 0},
{"BYTESTRING", "value", 0},
NULLFIELD}},
{"enq_insert", 'I', FA{{"FILENUM", "filenum", 0},
{"TXNID", "xid", 0},
{"BYTESTRING", "key", 0},
{"BYTESTRING", "value", 0},
NULLFIELD}},
{"enq_delete_both", 'D', FA{{"FILENUM", "filenum", 0},
{"TXNID", "xid", 0},
{"BYTESTRING", "key", 0},
@ -277,11 +281,10 @@ generate_log_struct (void) {
fprintf(hf, "struct roll_entry {\n");
fprintf(hf, " enum rt_cmd cmd;\n");
fprintf(hf, " struct roll_entry *prev; /* for in-memory list of log entries. Threads from newest to oldest. */\n");
fprintf(hf, " union {\n");
DO_ROLLBACKS(lt, fprintf(hf," struct rolltype_%s %s;\n", lt->name, lt->name));
fprintf(hf, " } u;\n");
fprintf(hf, " struct roll_entry *prev; /* for in-memory list of log entries. Threads from newest to oldest. */\n");
fprintf(hf, " struct roll_entry *next; /* Points to a newer logentry. Needed for flushing to disk, since we want to write the oldest one first. */\n");
fprintf(hf, "};\n");
}
@ -532,47 +535,76 @@ static void
generate_rollbacks (void) {
DO_ROLLBACKS(lt, {
fprintf2(cf, hf, "int toku_logger_save_rollback_%s (TOKUTXN txn", lt->name);
DO_FIELDS(ft, lt, fprintf2(cf, hf, ", %s %s", ft->type, ft->name));
DO_FIELDS(ft, lt, {
if ( strcmp(ft->type, "BYTESTRING") == 0 ) {
fprintf2(cf, hf, ", BYTESTRING *%s_ptr", ft->name);
}
else {
fprintf2(cf, hf, ", %s %s", ft->type, ft->name);
}
});
fprintf(hf, ");\n");
fprintf(cf, ") {\n");
fprintf(cf, " int r;\n");
fprintf(cf, " ROLLBACK_LOG_NODE log;\n");
fprintf(cf, " r = toku_get_and_pin_rollback_log_for_new_entry(txn, &log);\n");
fprintf(cf, " assert(r==0);\n");
// 'memdup' all BYTESTRINGS here
DO_FIELDS(ft, lt, {
if ( strcmp(ft->type, "BYTESTRING") == 0 ) {
fprintf(cf, " BYTESTRING %s = {\n"
" .len = %s_ptr->len,\n"
" .data = toku_memdup_in_rollback(log, %s_ptr->data, %s_ptr->len)\n"
" };\n",
ft->name, ft->name, ft->name, ft->name);
}
});
{
int count=0;
fprintf(cf, " u_int32_t rollback_fsize = toku_logger_rollback_fsize_%s(", lt->name);
DO_FIELDS(ft, lt, fprintf(cf, "%s%s", (count++>0)?", ":"", ft->name));
fprintf(cf, ");\n");
}
fprintf(cf, " struct roll_entry *v = toku_malloc_in_rollback(txn, sizeof(*v));\n");
fprintf(cf, " struct roll_entry *v;\n");
fprintf(cf, " size_t mem_needed = sizeof(v->u.%s) + __builtin_offsetof(struct roll_entry, u.%s);\n", lt->name, lt->name);
fprintf(cf, " v = toku_malloc_in_rollback(log, mem_needed);\n");
fprintf(cf, " if (v==0) return errno;\n");
fprintf(cf, " v->cmd = (enum rt_cmd)%u;\n", lt->command_and_flags&0xff);
DO_FIELDS(ft, lt, fprintf(cf, " v->u.%s.%s = %s;\n", lt->name, ft->name, ft->name));
fprintf(cf, " v->prev = txn->newest_logentry;\n");
fprintf(cf, " v->next = 0;\n");
fprintf(cf, " if (txn->oldest_logentry==0) txn->oldest_logentry=v;\n");
fprintf(cf, " else txn->newest_logentry->next = v;\n");
fprintf(cf, " txn->newest_logentry = v;\n");
fprintf(cf, " txn->rollentry_resident_bytecount += rollback_fsize;\n");
fprintf(cf, " v->prev = log->newest_logentry;\n");
fprintf(cf, " if (log->oldest_logentry==NULL) log->oldest_logentry=v;\n");
fprintf(cf, " log->newest_logentry = v;\n");
fprintf(cf, " log->rollentry_resident_bytecount += rollback_fsize;\n");
fprintf(cf, " txn->rollentry_raw_count += rollback_fsize;\n");
fprintf(cf, " txn->num_rollentries++;\n");
fprintf(cf, " return toku_maybe_spill_rollbacks(txn);\n}\n");
fprintf(cf, " log->dirty = TRUE;\n");
fprintf(cf, " return toku_maybe_spill_rollbacks(txn, log);\n}\n");
});
DO_ROLLBACKS(lt, {
fprintf2(cf, hf, "void toku_logger_rollback_wbufwrite_%s (struct wbuf *wbuf", lt->name);
fprintf2(cf, hf, "void toku_logger_rollback_wbuf_nocrc_write_%s (struct wbuf *wbuf", lt->name);
DO_FIELDS(ft, lt, fprintf2(cf, hf, ", %s %s", ft->type, ft->name));
fprintf2(cf, hf, ")");
fprintf(hf, ";\n");
fprintf(cf, " {\n");
fprintf(cf, " u_int32_t ndone_at_start = wbuf->ndone;\n");
fprintf(cf, " wbuf_char(wbuf, '%c');\n", (char)(0xff&lt->command_and_flags));
DO_FIELDS(ft, lt, fprintf(cf, " wbuf_%s(wbuf, %s);\n", ft->type, ft->name));
fprintf(cf, " wbuf_int(wbuf, 4+wbuf->ndone - ndone_at_start);\n");
{
int count=0;
fprintf(cf, " u_int32_t rollback_fsize = toku_logger_rollback_fsize_%s(", lt->name);
DO_FIELDS(ft, lt, fprintf(cf, "%s%s", (count++>0)?", ":"", ft->name));
fprintf(cf, ");\n");
fprintf(cf, " wbuf_nocrc_int(wbuf, rollback_fsize);\n");
}
fprintf(cf, " wbuf_nocrc_char(wbuf, '%c');\n", (char)(0xff&lt->command_and_flags));
DO_FIELDS(ft, lt, fprintf(cf, " wbuf_nocrc_%s(wbuf, %s);\n", ft->type, ft->name));
fprintf(cf, "}\n");
});
fprintf2(cf, hf, "void toku_logger_rollback_wbufwrite (struct wbuf *wbuf, struct roll_entry *r)");
fprintf2(cf, hf, "void toku_logger_rollback_wbuf_nocrc_write (struct wbuf *wbuf, struct roll_entry *r)");
fprintf(hf, ";\n");
fprintf(cf, " {\n switch (r->cmd) {\n");
DO_ROLLBACKS(lt, {
fprintf(cf, " case RT_%s: toku_logger_rollback_wbufwrite_%s(wbuf", lt->name, lt->name);
fprintf(cf, " case RT_%s: toku_logger_rollback_wbuf_nocrc_write_%s(wbuf", lt->name, lt->name);
DO_FIELDS(ft, lt, fprintf(cf, ", r->u.%s.%s", lt->name, ft->name));
fprintf(cf, "); return;\n");
});
@ -604,12 +636,15 @@ generate_rollbacks (void) {
fprintf2(cf, hf, "int toku_parse_rollback(unsigned char *buf, u_int32_t n_bytes, struct roll_entry **itemp, MEMARENA ma)");
fprintf(hf, ";\n");
fprintf(cf, " {\n assert(n_bytes>0);\n struct roll_entry *item = malloc_in_memarena(ma, sizeof(*item));\n item->cmd=(enum rt_cmd)(buf[0]);\n");
fprintf(cf, " {\n assert(n_bytes>0);\n struct roll_entry *item;\n enum rt_cmd cmd = (enum rt_cmd)(buf[0]);\n size_t mem_needed;\n");
fprintf(cf, " struct rbuf rc = {buf, n_bytes, 1};\n");
fprintf(cf, " switch(item->cmd) {\n");
fprintf(cf, " switch(cmd) {\n");
DO_ROLLBACKS(lt, {
fprintf(cf, " case RT_%s:\n", lt->name);
DO_FIELDS(ft, lt, fprintf(cf, " rbuf_ma_%s(&rc, ma, &item->u.%s.%s);\n", ft->type, lt->name, ft->name));
fprintf(cf, " mem_needed = sizeof(item->u.%s) + __builtin_offsetof(struct roll_entry, u.%s);\n", lt->name, lt->name);
fprintf(cf, " item = malloc_in_memarena(ma, mem_needed);\n");
fprintf(cf, " item->cmd = cmd;\n");
DO_FIELDS(ft, lt, fprintf(cf, " rbuf_ma_%s(&rc, ma, &item->u.%s.%s);\n", ft->type, lt->name, ft->name));
fprintf(cf, " *itemp = item;\n");
fprintf(cf, " return 0;\n");
});

View file

@ -55,7 +55,7 @@ int toku_logger_create (TOKULOGGER *resultp) {
result->outbuf = (struct logbuf) {0, LOGGER_MIN_BUF_SIZE, toku_xmalloc(LOGGER_MIN_BUF_SIZE), ZERO_LSN};
// written_lsn is uninitialized
// fsynced_lsn is uninitialized
result->checkpoint_lsn = ZERO_LSN;
result->last_completed_checkpoint_lsn = ZERO_LSN;
// next_log_file_number is uninitialized
// n_in_file is uninitialized
result->write_block_size = BRT_DEFAULT_NODE_SIZE; // default logging size is the same as the default brt block size
@ -68,6 +68,7 @@ int toku_logger_create (TOKULOGGER *resultp) {
result->input_lock_ctr = 0;
result->output_condition_lock_ctr = 0;
result->swap_ctr = 0;
result->rollback_cachefile = NULL;
result->output_is_available = TRUE;
return 0;
@ -136,6 +137,68 @@ int toku_logger_open (const char *directory, TOKULOGGER logger) {
return 0;
}
// Open the rollback dictionary (ROLLBACK_CACHEFILE_NAME) in `cachetable` and
// record its cachefile in logger->rollback_cachefile.
// `create` is forwarded twice to toku_brt_open (both of its creation flags).
// Requires: logger is open, not panicked, and has no rollback cachefile yet.
// Every step is checked with assert(), so with assertions enabled the
// function either aborts or returns 0.
int
toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, BOOL create) {
    assert(logger->is_open);
    assert(!logger->is_panicked);
    assert(!logger->rollback_cachefile);

    BRT rollback_brt = NULL;   // Note, there is no DB associated with this BRT.
    int r = toku_brt_create(&rollback_brt);
    assert(r==0);
    r = toku_brt_open(rollback_brt, ROLLBACK_CACHEFILE_NAME, create, create, cachetable, NULL_TXN, NULL);
    assert(r==0);
    logger->rollback_cachefile = rollback_brt->cf;

    // Under the header lock, verify the freshly opened file is healthy and
    // holds no data blocks (rollback logs or otherwise) besides the root.
    struct brt_header *h = rollback_brt->h;
    toku_brtheader_lock(h);
    assert(!h->panic);
    toku_block_verify_no_data_blocks_except_root_unlocked(h->blocktable, h->root);
    toku_brtheader_unlock(h);

    assert(toku_brt_is_empty(rollback_brt));
    return r;
}
// Close the rollback dictionary that toku_logger_open_rollback() stored in the logger.
// Requires: Rollback cachefile can only be closed immediately after a checkpoint,
//           so it will always be clean (!h->dirty) when about to be closed.
//           Rollback log can only be closed when there are no open transactions,
//           so it will always be empty (no data blocks) when about to be closed.
// If recovery_failed is set, the header is marked panicked first, which skips
// the clean/empty-blocktable verification.
// Returns 0 without doing anything when the logger is panicked or has no
// rollback cachefile; otherwise returns the result of toku_close_brt().
int
toku_logger_close_rollback(TOKULOGGER logger, BOOL recovery_failed) {
    CACHEFILE cf = logger->rollback_cachefile;  // stored in logger at rollback cachefile open
    if (logger->is_panicked || !cf) {
        return 0;
    }

    // Find the brt that belongs to this cachefile's header.
    struct brt_header *h = toku_cachefile_get_userdata(cf);
    toku_brtheader_lock(h);
    if (recovery_failed && !h->panic) {
        toku_brt_header_set_panic(h, EINVAL, "Recovery failed");
    }
    // Verify it is safe to close it.  If panicked, it is always safe to close.
    if (!h->panic) {
        assert(!h->dirty);  // Must not be dirty.
        // Must have no data blocks (rollback logs or otherwise).
        toku_block_verify_no_data_blocks_except_root_unlocked(h->blocktable, h->root);
    }
    assert(!toku_list_empty(&h->live_brts));  // there is always one brt associated with the header
    BRT brt_to_close = toku_list_struct(toku_list_head(&h->live_brts), struct brt, live_brt_link);
    assert(brt_to_close);
    toku_brtheader_unlock(h);
    assert(toku_brt_is_empty(brt_to_close));

    char *error_string_ignore = NULL;
    int r = toku_close_brt(brt_to_close, &error_string_ignore);
    // Set as dealt with already.
    logger->rollback_cachefile = NULL;
    return r;
}
// No locks held on entry
// No locks held on exit.
// No locks are needed, since you cannot legally close the log concurrently with doing anything else.
@ -183,7 +246,8 @@ int toku_logger_shutdown(TOKULOGGER logger) {
if (logger->is_open) {
if (toku_omt_size(logger->live_txns) == 0) {
BYTESTRING comment = { strlen("shutdown"), "shutdown" };
r = toku_log_comment(logger, NULL, TRUE, 0, comment);
int r2 = toku_log_comment(logger, NULL, TRUE, 0, comment);
if (!r) r = r2;
}
}
return r;
@ -787,6 +851,10 @@ int toku_fread_LSN (FILE *f, LSN *lsn, struct x1764 *checksum, u_int32_t *le
return toku_fread_u_int64_t (f, &lsn->lsn, checksum, len);
}
// Read a BLOCKNUM from f by reading a u_int64_t directly into b->b.
// checksum and len are passed through to toku_fread_u_int64_t, whose result is returned.
int toku_fread_BLOCKNUM (FILE *f, BLOCKNUM *b, struct x1764 *checksum, u_int32_t *len) {
    u_int64_t *raw = (u_int64_t*)&b->b;
    return toku_fread_u_int64_t (f, raw, checksum, len);
}
int toku_fread_FILENUM (FILE *f, FILENUM *filenum, struct x1764 *checksum, u_int32_t *len) {
return toku_fread_u_int32_t (f, &filenum->fileid, checksum, len);
}
@ -903,6 +971,11 @@ int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, stru
return 0;
}
// Print a BLOCKNUM log field; it is handled exactly like a u_int64_t field.
// Returns whatever toku_logprint_u_int64_t returns.
int toku_logprint_BLOCKNUM (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format) {
    int r = toku_logprint_u_int64_t(outf, inf, fieldname, checksum, len, format);
    return r;
}
int toku_logprint_FILENUM (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format) {
return toku_logprint_u_int32_t(outf, inf, fieldname, checksum, len, format);
@ -982,11 +1055,6 @@ TXNID toku_txn_get_txnid (TOKUTXN txn) {
else return txn->txnid64;
}
LSN toku_txn_get_last_lsn (TOKUTXN txn) {
if (txn==0) return (LSN){0};
return txn->last_lsn;
}
// Return the value of the logger's lsn field.
LSN toku_logger_last_lsn(TOKULOGGER logger) {
    LSN current = logger->lsn;
    return current;
}
@ -1083,32 +1151,20 @@ int toku_logger_log_archive (TOKULOGGER logger, char ***logs_p, int flags) {
// get them into increasing order
qsort(all_logs, all_n_logs, sizeof(all_logs[0]), logfilenamecompare);
LSN oldest_live_txn_lsn;
{
TXNID oldest_living_xid = toku_logger_get_oldest_living_xid(logger);
if (oldest_living_xid == TXNID_NONE_LIVING)
oldest_live_txn_lsn = MAX_LSN;
else
oldest_live_txn_lsn.lsn = oldest_living_xid;
}
//printf("%s:%d Oldest txn is %lld\n", __FILE__, __LINE__, (long long)oldest_live_txn_lsn.lsn);
LSN save_lsn = logger->last_completed_checkpoint_lsn;
// Now starting at the last one, look for archivable ones.
// Count the total number of bytes, because we have to return a single big array. (That's the BDB interface. Bleah...)
LSN earliest_lsn_in_logfile={(unsigned long long)(-1LL)};
r = peek_at_log(logger, all_logs[all_n_logs-1], &earliest_lsn_in_logfile); // try to find the lsn that's in the most recent log
if ((earliest_lsn_in_logfile.lsn <= logger->checkpoint_lsn.lsn)&&
(earliest_lsn_in_logfile.lsn <= oldest_live_txn_lsn.lsn)) {
if (earliest_lsn_in_logfile.lsn <= save_lsn.lsn) {
i=all_n_logs-1;
} else {
for (i=all_n_logs-2; i>=0; i--) { // start at all_n_logs-2 because we never archive the most recent log
r = peek_at_log(logger, all_logs[i], &earliest_lsn_in_logfile);
if (r!=0) continue; // In case of error, just keep going
//printf("%s:%d file=%s firstlsn=%lld checkpoint_lsns={%lld %lld}\n", __FILE__, __LINE__, all_logs[i], (long long)earliest_lsn_in_logfile.lsn, (long long)logger->checkpoint_lsns[0].lsn, (long long)logger->checkpoint_lsns[1].lsn);
if ((earliest_lsn_in_logfile.lsn <= logger->checkpoint_lsn.lsn)&&
(earliest_lsn_in_logfile.lsn <= oldest_live_txn_lsn.lsn)) {
if (earliest_lsn_in_logfile.lsn <= save_lsn.lsn) {
break;
}
}
@ -1148,7 +1204,7 @@ TOKUTXN toku_logger_txn_parent (TOKUTXN txn) {
}
void toku_logger_note_checkpoint(TOKULOGGER logger, LSN lsn) {
logger->checkpoint_lsn = lsn;
logger->last_completed_checkpoint_lsn = lsn;
}
TXNID toku_logger_get_oldest_living_xid(TOKULOGGER logger) {
@ -1158,17 +1214,6 @@ TXNID toku_logger_get_oldest_living_xid(TOKULOGGER logger) {
return rval;
}
LSN toku_logger_get_oldest_living_lsn(TOKULOGGER logger) {
LSN lsn = {0};
if (logger) {
if (logger->oldest_living_xid == TXNID_NONE_LIVING)
lsn = MAX_LSN;
else
lsn.lsn = logger->oldest_living_xid;
}
return lsn;
}
LSN
toku_logger_get_next_lsn(TOKULOGGER logger) {
return logger->lsn;

View file

@ -5,12 +5,20 @@
#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
enum { TOKU_LOG_VERSION = 1 };
// Log format version numbers.
// TOKU_LOG_NEXT_VERSION has no explicit value, so it is one more than the
// enumerator before it; TOKU_LOG_VERSION is therefore always the highest
// explicitly listed version (currently 2).  To introduce a new version, add
// its enumerator immediately before TOKU_LOG_NEXT_VERSION.
enum {
TOKU_LOG_VERSION_1 = 1,
TOKU_LOG_VERSION_2 = 2,
TOKU_LOG_NEXT_VERSION, // the version after the current version
TOKU_LOG_VERSION = TOKU_LOG_NEXT_VERSION-1, // A hack so I don't have to change this line.
};
#define ROLLBACK_CACHEFILE_NAME "tokudb.rollback"
int toku_logger_create (TOKULOGGER *resultp);
int toku_logger_open (const char *directory, TOKULOGGER logger);
int toku_logger_shutdown(TOKULOGGER logger);
int toku_logger_close(TOKULOGGER *loggerp);
int toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, BOOL create);
int toku_logger_close_rollback(TOKULOGGER logger, BOOL recovery_failed);
int toku_logger_fsync (TOKULOGGER logger);
void toku_logger_panic (TOKULOGGER logger, int err);
@ -49,6 +57,7 @@ int toku_fread_u_int32_t_nocrclen (FILE *f, u_int32_t *v);
int toku_fread_u_int32_t (FILE *f, u_int32_t *v, struct x1764 *checksum, u_int32_t *len);
int toku_fread_u_int64_t (FILE *f, u_int64_t *v, struct x1764 *checksum, u_int32_t *len);
int toku_fread_LSN (FILE *f, LSN *lsn, struct x1764 *checksum, u_int32_t *len);
int toku_fread_BLOCKNUM (FILE *f, BLOCKNUM *b, struct x1764 *checksum, u_int32_t *len); // reads one BLOCKNUM from f into *b
int toku_fread_FILENUM (FILE *f, FILENUM *filenum, struct x1764 *checksum, u_int32_t *len);
int toku_fread_TXNID (FILE *f, TXNID *txnid, struct x1764 *checksum, u_int32_t *len);
int toku_fread_BYTESTRING (FILE *f, BYTESTRING *bs, struct x1764 *checksum, u_int32_t *len);
@ -58,6 +67,7 @@ int toku_logprint_LSN (FILE *outf, FILE *inf, const char *fieldname, struct x176
int toku_logprint_TXNID (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format __attribute__((__unused__)));
int toku_logprint_u_int8_t (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format);
int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format);
int toku_logprint_BLOCKNUM (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format);
int toku_logprint_u_int64_t (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format);
void toku_print_BYTESTRING (FILE *outf, u_int32_t len, char *data);
int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format __attribute__((__unused__)));
@ -67,7 +77,6 @@ int toku_read_and_print_logmagic (FILE *f, u_int32_t *versionp);
int toku_read_logmagic (FILE *f, u_int32_t *versionp);
TXNID toku_txn_get_txnid (TOKUTXN txn);
LSN toku_txn_get_last_lsn (TOKUTXN txn);
LSN toku_logger_last_lsn(TOKULOGGER logger);
TOKULOGGER toku_txn_logger (TOKUTXN txn);
@ -81,7 +90,6 @@ TOKUTXN toku_logger_txn_parent (TOKUTXN txn);
void toku_logger_note_checkpoint(TOKULOGGER logger, LSN lsn);
TXNID toku_logger_get_oldest_living_xid(TOKULOGGER logger);
LSN toku_logger_get_oldest_living_lsn(TOKULOGGER logger);
LSN toku_logger_get_next_lsn(TOKULOGGER logger);
void toku_logger_set_remove_finalize_callback(TOKULOGGER logger, void (*funcp)(DICTIONARY_ID, void *), void * extra);
void toku_logger_call_remove_finalize_callback(TOKULOGGER logger, DICTIONARY_ID dict_id);

View file

@ -12,9 +12,9 @@ struct memarena {
int n_other_bufs;
};
MEMARENA memarena_create (void) {
MEMARENA MALLOC(result); assert(result);
result->buf_size = 1024;
MEMARENA memarena_create_presized (size_t initial_size) {
MEMARENA XMALLOC(result);
result->buf_size = initial_size;
result->buf_used = 0;
result->other_bufs = NULL;
result->size_of_other_bufs = 0;
@ -23,6 +23,10 @@ MEMARENA memarena_create (void) {
return result;
}
// Create a memarena with the default initial buffer size of 1024 bytes.
MEMARENA memarena_create (void) {
    const size_t default_initial_size = 1024;
    return memarena_create_presized(default_initial_size);
}
void memarena_clear (MEMARENA ma) {
// Free the other bufs.
int i;

View file

@ -19,10 +19,11 @@
#include <sys/types.h>
typedef struct memarena *MEMARENA;
MEMARENA memarena_create_presized (size_t initial_size);
// Effect: Create a memarena with initial size. In case of ENOMEM, aborts.
MEMARENA memarena_create (void);
// Effect: Create a memarena. In case of ENOMEM, aborts.
// Effect: Create a memarena with default initial size. In case of ENOMEM, aborts.
void memarena_clear (MEMARENA ma);
// Effect: Reset the internal state so that the allocated memory can be used again.

View file

@ -100,6 +100,18 @@ static inline BLOCKNUM rbuf_blocknum (struct rbuf *r) {
BLOCKNUM result = make_blocknum(rbuf_longlong(r));
return result;
}
// Read a BLOCKNUM from r into *blocknum.  The memarena argument is unused.
static inline void rbuf_ma_BLOCKNUM (struct rbuf *r, MEMARENA ma __attribute__((__unused__)), BLOCKNUM *blocknum) {
    BLOCKNUM parsed = rbuf_blocknum(r);
    *blocknum = parsed;
}
// Read one value with rbuf_int() from r and store it in *num.  The memarena argument is unused.
static inline void rbuf_ma_u_int32_t (struct rbuf *r, MEMARENA ma __attribute__((__unused__)), u_int32_t *num) {
    u_int32_t value = rbuf_int(r);
    *num = value;
}
// Read one value with rbuf_ulonglong() from r and store it in *num.  The memarena argument is unused.
static inline void rbuf_ma_u_int64_t (struct rbuf *r, MEMARENA ma __attribute__((__unused__)), u_int64_t *num) {
    u_int64_t value = rbuf_ulonglong(r);
    *num = value;
}
static inline void rbuf_TXNID (struct rbuf *r, TXNID *txnid) {
*txnid = rbuf_ulonglong(r);
@ -119,7 +131,7 @@ static inline void rbuf_ma_FILENUM (struct rbuf *r, MEMARENA ma __attribute__((_
static inline void rbuf_BYTESTRING (struct rbuf *r, BYTESTRING *bs) {
bs->len = rbuf_int(r);
u_int32_t newndone = r->ndone + bs->len;
assert(newndone < r->size);
assert(newndone <= r->size);
bs->data = toku_memdup(&r->buf[r->ndone], (size_t)bs->len);
assert(bs->data);
r->ndone = newndone;
@ -128,7 +140,7 @@ static inline void rbuf_BYTESTRING (struct rbuf *r, BYTESTRING *bs) {
static inline void rbuf_ma_BYTESTRING (struct rbuf *r, MEMARENA ma, BYTESTRING *bs) {
bs->len = rbuf_int(r);
u_int32_t newndone = r->ndone + bs->len;
assert(newndone < r->size);
assert(newndone <= r->size);
bs->data = memarena_memdup(ma, &r->buf[r->ndone], (size_t)bs->len);
assert(bs->data);
r->ndone = newndone;

File diff suppressed because it is too large Load diff

View file

@ -11,7 +11,6 @@
#include "../include/db.h"
#include "brttypes.h"
#include "memory.h"
#include "bread.h"
#include "x1764.h"
// Run tokudb recovery from the log
@ -29,10 +28,6 @@ int tokudb_recover (const char *env_dir, const char *log_dir,
// Returns: TRUE if we need recovery, otherwise FALSE.
int tokudb_needs_recovery(const char *logdir, BOOL ignore_empty_log);
// Delete the rolltmp files
// Returns 0 on success
int tokudb_recover_delete_rolltmp_files(const char *datadir, const char *logdir);
// Return 0 if recovery log exists, ENOENT if log is missing
int tokudb_recover_log_exists(const char * log_dir);

View file

@ -12,11 +12,11 @@
int
toku_commit_fdelete (u_int8_t file_was_open,
FILENUM filenum, // valid if file_was_open
BYTESTRING bs_fname, // cwd/iname
TOKUTXN txn,
YIELDF UU(yield),
void *UU(yield_v),
FILENUM filenum, // valid if file_was_open
BYTESTRING bs_fname, // cwd/iname
TOKUTXN txn,
YIELDF UU(yield),
void *UU(yield_v),
LSN UU(oplsn)) //oplsn is the lsn of the commit
{
//TODO: #2037 verify the file is (user) closed
@ -24,18 +24,23 @@ toku_commit_fdelete (u_int8_t file_was_open,
CACHEFILE cf;
int r;
if (file_was_open) { // file was open when toku_brt_remove_on_commit() was called
r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
assert(r == 0); // must still be open (toku_brt_remove_on_commit() incremented refcount)
{
(void)toku_cachefile_get_and_pin_fd(cf);
assert(!toku_cachefile_is_dev_null_unlocked(cf));
struct brt_header *h = toku_cachefile_get_userdata(cf);
DICTIONARY_ID dict_id = h->dict_id;
toku_logger_call_remove_finalize_callback(txn->logger, dict_id);
r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
if (r==ENOENT) { //Missing file on recovered transaction is not an error
assert(txn->recovered_from_checkpoint);
r = 0;
goto done;
}
assert(r == 0); // must still be open (toku_brt_remove_on_commit() incremented refcount)
{
(void)toku_cachefile_get_and_pin_fd(cf);
assert(!toku_cachefile_is_dev_null_unlocked(cf));
struct brt_header *h = toku_cachefile_get_userdata(cf);
DICTIONARY_ID dict_id = h->dict_id;
toku_logger_call_remove_finalize_callback(txn->logger, dict_id);
toku_cachefile_unpin_fd(cf);
}
r = toku_cachefile_redirect_nullfd(cf);
assert(r==0);
}
r = toku_cachefile_redirect_nullfd(cf);
assert(r==0);
}
char *fname_in_env = fixup_fname(&bs_fname);
char *fname_in_cwd = toku_cachetable_get_fname_in_cwd(txn->logger->ct, fname_in_env);
@ -44,16 +49,17 @@ toku_commit_fdelete (u_int8_t file_was_open,
assert(r==0 || errno==ENOENT);
toku_free(fname_in_env);
toku_free(fname_in_cwd);
done:
return 0;
}
int
toku_rollback_fdelete (u_int8_t UU(file_was_open),
FILENUM UU(filenum),
BYTESTRING UU(bs_fname),
TOKUTXN UU(txn),
YIELDF UU(yield),
void* UU(yield_v),
BYTESTRING UU(bs_fname),
TOKUTXN UU(txn),
YIELDF UU(yield),
void* UU(yield_v),
LSN UU(oplsn)) //oplsn is the lsn of the abort
{
//Rolling back an fdelete is a no-op.
@ -62,10 +68,10 @@ toku_rollback_fdelete (u_int8_t UU(file_was_open),
int
toku_commit_fcreate (FILENUM UU(filenum),
BYTESTRING UU(bs_fname),
TOKUTXN UU(txn),
YIELDF UU(yield),
void *UU(yield_v),
BYTESTRING UU(bs_fname),
TOKUTXN UU(txn),
YIELDF UU(yield),
void *UU(yield_v),
LSN UU(oplsn))
{
return 0;
@ -73,10 +79,10 @@ toku_commit_fcreate (FILENUM UU(filenum),
int
toku_rollback_fcreate (FILENUM filenum,
BYTESTRING bs_fname, // cwd/iname
TOKUTXN txn,
YIELDF UU(yield),
void* UU(yield_v),
BYTESTRING bs_fname, // cwd/iname
TOKUTXN txn,
YIELDF UU(yield),
void* UU(yield_v),
LSN UU(oplsn))
{
//TODO: #2037 verify the file is (user) closed
@ -84,13 +90,18 @@ toku_rollback_fcreate (FILENUM filenum,
//Remove reference to the fd in the cachetable
CACHEFILE cf = NULL;
int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
if (r==ENOENT) { //Missing file on recovered transaction is not an error
assert(txn->recovered_from_checkpoint);
r = 0;
goto done;
}
assert(r == 0);
{
(void)toku_cachefile_get_and_pin_fd(cf);
assert(!toku_cachefile_is_dev_null_unlocked(cf));
struct brt_header *h = toku_cachefile_get_userdata(cf);
DICTIONARY_ID dict_id = h->dict_id;
toku_logger_call_remove_finalize_callback(txn->logger, dict_id);
assert(!toku_cachefile_is_dev_null_unlocked(cf));
struct brt_header *h = toku_cachefile_get_userdata(cf);
DICTIONARY_ID dict_id = h->dict_id;
toku_logger_call_remove_finalize_callback(txn->logger, dict_id);
toku_cachefile_unpin_fd(cf);
}
r = toku_cachefile_redirect_nullfd(cf);
@ -103,6 +114,7 @@ toku_rollback_fcreate (FILENUM filenum,
assert(r==0 || errno==ENOENT);
toku_free(fname_in_env);
toku_free(fname_in_cwd);
done:
return 0;
}
@ -119,6 +131,11 @@ static int do_insertion (enum brt_msg_type type, FILENUM filenum, BYTESTRING key
CACHEFILE cf;
//printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data);
int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
if (r==ENOENT) { //Missing file on recovered transaction is not an error
assert(txn->recovered_from_checkpoint);
r = 0;
goto done;
}
assert(r==0);
(void)toku_cachefile_get_and_pin_fd(cf);
@ -146,6 +163,7 @@ static int do_insertion (enum brt_msg_type type, FILENUM filenum, BYTESTRING key
}
cleanup:
toku_cachefile_unpin_fd(cf);
done:
return r;
}
@ -166,11 +184,11 @@ int toku_commit_cmdinsert (FILENUM filenum, BYTESTRING key, TOKUTXN txn, YIELDF
int
toku_commit_cmdinsertboth (FILENUM filenum,
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
#if TOKU_DO_COMMIT_CMD_INSERT
@ -183,10 +201,10 @@ toku_commit_cmdinsertboth (FILENUM filenum,
int
toku_rollback_cmdinsert (FILENUM filenum,
BYTESTRING key,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
return do_insertion (BRT_ABORT_ANY, filenum, key, 0, txn, oplsn);
@ -194,11 +212,11 @@ toku_rollback_cmdinsert (FILENUM filenum,
int
toku_rollback_cmdinsertboth (FILENUM filenum,
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
return do_insertion (BRT_ABORT_BOTH, filenum, key, &data, txn, oplsn);
@ -206,11 +224,11 @@ toku_rollback_cmdinsertboth (FILENUM filenum,
int
toku_commit_cmddeleteboth (FILENUM filenum,
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
#if TOKU_DO_COMMIT_CMD_DELETE_BOTH
@ -223,11 +241,11 @@ toku_commit_cmddeleteboth (FILENUM filenum,
int
toku_rollback_cmddeleteboth (FILENUM filenum,
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
BYTESTRING data,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
return do_insertion (BRT_ABORT_BOTH, filenum, key, &data, txn, oplsn);
@ -235,10 +253,10 @@ toku_rollback_cmddeleteboth (FILENUM filenum,
int
toku_commit_cmddelete (FILENUM filenum,
BYTESTRING key,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
#if TOKU_DO_COMMIT_CMD_DELETE
@ -251,113 +269,114 @@ toku_commit_cmddelete (FILENUM filenum,
int
toku_rollback_cmddelete (FILENUM filenum,
BYTESTRING key,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
BYTESTRING key,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yieldv),
LSN oplsn)
{
return do_insertion (BRT_ABORT_ANY, filenum, key, 0, txn, oplsn);
}
int
toku_commit_fileentries (int fd,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
LSN oplsn)
{
BREAD f = create_bread_from_fd_initialize_at(fd);
int r=0;
MEMARENA ma = memarena_create();
static int
toku_apply_rollinclude (TXNID xid,
uint64_t num_nodes,
BLOCKNUM spilled_head,
uint32_t spilled_head_hash,
BLOCKNUM spilled_tail,
uint32_t spilled_tail_hash,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
LSN oplsn,
apply_rollback_item func) {
int r;
struct roll_entry *item;
int count=0;
while (bread_has_more(f)) {
struct roll_entry *item;
r = toku_read_rollback_backwards(f, &item, ma);
if (r!=0) goto finish;
r = toku_commit_rollback_item(txn, item, yield, yieldv, oplsn);
if (r!=0) goto finish;
memarena_clear(ma);
count++;
if (count%2==0) yield(NULL, yieldv);
BLOCKNUM next_log = spilled_tail;
uint32_t next_log_hash = spilled_tail_hash;
uint64_t last_sequence = num_nodes;
BOOL found_head = FALSE;
assert(next_log.b != ROLLBACK_NONE.b);
while (next_log.b != ROLLBACK_NONE.b) {
ROLLBACK_LOG_NODE log;
//pin log
r = toku_get_and_pin_rollback_log(txn, xid, last_sequence - 1, next_log, next_log_hash, &log);
assert(r==0);
last_sequence = log->sequence;
while ((item=log->newest_logentry)) {
log->newest_logentry = item->prev;
r = func(txn, item, yield, yieldv, oplsn);
if (r!=0) return r;
count++;
if (count%2 == 0) yield(NULL, yieldv);
}
if (next_log.b == spilled_head.b) {
assert(!found_head);
found_head = TRUE;
assert(log->sequence == 0);
}
next_log = log->older;
next_log_hash = log->older_hash;
{
//Clean up transaction structure to prevent
//toku_txn_close from double-freeing
spilled_tail = next_log;
spilled_tail_hash = next_log_hash;
if (found_head) {
assert(next_log.b == ROLLBACK_NONE.b);
spilled_head = next_log;
spilled_head_hash = next_log_hash;
}
}
//Unpins log
r = toku_delete_rollback_log(txn, log);
assert(r==0);
}
finish:
{ int r2 = close_bread_without_closing_fd(f); assert(r2==0); }
memarena_close(&ma);
return r;
}
int
toku_rollback_fileentries (int fd,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
LSN oplsn)
{
BREAD f = create_bread_from_fd_initialize_at(fd);
assert(f);
int r=0;
MEMARENA ma = memarena_create();
int count=0;
while (bread_has_more(f)) {
struct roll_entry *item;
r = toku_read_rollback_backwards(f, &item, ma);
if (r!=0) goto finish;
r = toku_abort_rollback_item(txn, item, yield, yieldv, oplsn);
if (r!=0) goto finish;
memarena_clear(ma);
count++;
if (count%2==0) yield(NULL, yieldv);
}
finish:
{ int r2 = close_bread_without_closing_fd(f); assert(r2==0); }
memarena_close(&ma);
return r;
}
int
toku_commit_rollinclude (BYTESTRING bs,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
toku_commit_rollinclude (TXNID xid,
uint64_t num_nodes,
BLOCKNUM spilled_head,
uint32_t spilled_head_hash,
BLOCKNUM spilled_tail,
uint32_t spilled_tail_hash,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
LSN oplsn) {
int r;
char *fname_in_logger = fixup_fname(&bs);
char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, fname_in_logger);
int fd = open(fname_in_cwd, O_RDONLY+O_BINARY);
assert(fd>=0);
r = toku_commit_fileentries(fd, txn, yield, yieldv, oplsn);
assert(r==0);
r = close(fd);
assert(r==0);
r = unlink(fname_in_cwd);
assert(r==0);
toku_free(fname_in_logger);
toku_free(fname_in_cwd);
return 0;
r = toku_apply_rollinclude(xid, num_nodes,
spilled_head, spilled_head_hash,
spilled_tail, spilled_tail_hash,
txn, yield, yieldv, oplsn,
toku_commit_rollback_item);
return r;
}
int
toku_rollback_rollinclude (BYTESTRING bs,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
LSN oplsn)
{
toku_rollback_rollinclude (TXNID xid,
uint64_t num_nodes,
BLOCKNUM spilled_head,
uint32_t spilled_head_hash,
BLOCKNUM spilled_tail,
uint32_t spilled_tail_hash,
TOKUTXN txn,
YIELDF yield,
void * yieldv,
LSN oplsn) {
int r;
char *fname_in_logger = fixup_fname(&bs);
char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, fname_in_logger);
int fd = open(fname_in_cwd, O_RDONLY+O_BINARY);
assert(fd>=0);
r = toku_rollback_fileentries(fd, txn, yield, yieldv, oplsn);
assert(r==0);
r = close(fd);
assert(r==0);
r = unlink(fname_in_cwd);
assert(r==0);
toku_free(fname_in_logger);
toku_free(fname_in_cwd);
return 0;
r = toku_apply_rollinclude(xid, num_nodes,
spilled_head, spilled_head_hash,
spilled_tail, spilled_tail_hash,
txn, yield, yieldv, oplsn,
toku_abort_rollback_item);
return r;
}
int
@ -365,7 +384,7 @@ toku_rollback_tablelock_on_empty_table (FILENUM filenum,
TOKUTXN txn,
YIELDF yield,
void* yield_v,
LSN UU(oplsn))
LSN oplsn)
{
//TODO: Replace truncate function with something that doesn't need to mess with checkpoints.
// on rollback we have to make the file be empty, since we locked an empty table, and then may have done things to it.
@ -373,22 +392,33 @@ toku_rollback_tablelock_on_empty_table (FILENUM filenum,
CACHEFILE cf;
//printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data);
int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
if (r==ENOENT) { //Missing file on recovered transaction is not an error
assert(txn->recovered_from_checkpoint);
r = 0;
goto done;
}
assert(r==0);
OMTVALUE brtv=NULL;
r = toku_omt_find_zero(txn->open_brts, find_brt_from_filenum, &filenum, &brtv, NULL, NULL);
if (r==0) {
// If r!=0 it could be because we grabbed a log on an empty table that doesn't even exist, and we never put anything into it.
// So, just don't do anything in this case.
BRT brt = brtv;
toku_poll_txn_progress_function(txn, FALSE, TRUE);
yield(toku_checkpoint_safe_client_lock, yield_v);
toku_poll_txn_progress_function(txn, FALSE, FALSE);
r = toku_brt_truncate(brt);
assert(r==0);
toku_checkpoint_safe_client_unlock();
assert(r==0);
BRT brt = brtv;
{ //Do NOT truncate the file if
//the file already survived the truncate and was checkpointed.
LSN treelsn = toku_brt_checkpoint_lsn(brt);
if (oplsn.lsn != 0 && oplsn.lsn <= treelsn.lsn) {
r = 0;
goto done;
}
}
toku_poll_txn_progress_function(txn, FALSE, TRUE);
yield(toku_checkpoint_safe_client_lock, yield_v);
toku_poll_txn_progress_function(txn, FALSE, FALSE);
r = toku_brt_truncate(brt);
assert(r==0);
toku_checkpoint_safe_client_unlock();
done:
return r;
}
@ -411,7 +441,7 @@ toku_commit_load (BYTESTRING old_iname,
char *fname_in_env = fixup_fname(&old_iname); //Delete old file
r = toku_cachefile_of_iname_in_env(txn->logger->ct, fname_in_env, &cf);
if (r==0) {
r = toku_cachefile_redirect_nullfd(cf);
r = toku_cachefile_redirect_nullfd(cf);
assert(r==0);
}
else {
@ -438,7 +468,7 @@ toku_rollback_load (BYTESTRING UU(old_iname),
char *fname_in_env = fixup_fname(&new_iname); //Delete new file
r = toku_cachefile_of_iname_in_env(txn->logger->ct, fname_in_env, &cf);
if (r==0) {
r = toku_cachefile_redirect_nullfd(cf);
r = toku_cachefile_redirect_nullfd(cf);
assert(r==0);
}
else {
@ -455,38 +485,44 @@ toku_rollback_load (BYTESTRING UU(old_iname),
int
toku_commit_dictionary_redirect (FILENUM UU(old_filenum),
FILENUM UU(new_filenum),
FILENUM UU(new_filenum),
TOKUTXN UU(txn),
YIELDF UU(yield),
void * UU(yield_v),
LSN UU(oplsn)) //oplsn is the lsn of the commit
{
//NO-OP
//Redirect only has meaning during normal operation (NOT during recovery).
if (!txn->recovered_from_checkpoint) {
//NO-OP
}
return 0;
}
int
toku_rollback_dictionary_redirect (FILENUM old_filenum,
FILENUM new_filenum,
FILENUM new_filenum,
TOKUTXN txn,
YIELDF UU(yield),
void * UU(yield_v),
LSN UU(oplsn)) //oplsn is the lsn of the abort
{
int r = 0;
CACHEFILE new_cf = NULL;
r = toku_cachefile_of_filenum(txn->logger->ct, new_filenum, &new_cf);
assert(r == 0);
struct brt_header *new_h = toku_cachefile_get_userdata(new_cf);
//Redirect only has meaning during normal operation (NOT during recovery).
if (!txn->recovered_from_checkpoint) {
CACHEFILE new_cf = NULL;
r = toku_cachefile_of_filenum(txn->logger->ct, new_filenum, &new_cf);
assert(r == 0);
struct brt_header *new_h = toku_cachefile_get_userdata(new_cf);
CACHEFILE old_cf = NULL;
r = toku_cachefile_of_filenum(txn->logger->ct, old_filenum, &old_cf);
assert(r == 0);
struct brt_header *old_h = toku_cachefile_get_userdata(old_cf);
CACHEFILE old_cf = NULL;
r = toku_cachefile_of_filenum(txn->logger->ct, old_filenum, &old_cf);
assert(r == 0);
struct brt_header *old_h = toku_cachefile_get_userdata(old_cf);
//Redirect back from new to old.
r = toku_dictionary_redirect_abort(old_h, new_h, txn);
assert(r==0);
//Redirect back from new to old.
r = toku_dictionary_redirect_abort(old_h, new_h, txn);
assert(r==0);
}
return r;
}

View file

@ -37,18 +37,114 @@ int toku_abort_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield
return r;
}
void toku_rollback_txn_close (TOKUTXN txn) {
memarena_close(&txn->rollentry_arena);
if (txn->rollentry_filename!=0) {
int r = close(txn->rollentry_fd);
assert(r==0);
char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, txn->rollentry_filename);
r = unlink(fname_in_cwd);
assert(r==0);
toku_free(txn->rollentry_filename);
toku_free(fname_in_cwd);
// Returns nonzero iff txn->current_rollback names a real block,
// i.e. it differs from the ROLLBACK_NONE sentinel.
static inline int
txn_has_inprogress_rollback_log(TOKUTXN txn) {
    int has_current = (txn->current_rollback.b != ROLLBACK_NONE.b);
    return has_current;
}
// Returns nonzero iff txn->spilled_rollback_tail names a real block,
// i.e. it differs from the ROLLBACK_NONE sentinel.
static inline int
txn_has_spilled_rollback_logs(TOKUTXN txn) {
    int has_spilled = (txn->spilled_rollback_tail.b != ROLLBACK_NONE.b);
    return has_spilled;
}
// Remove a rollback log node from the rollback cachefile and release its block number.
// If the node is the one cached in txn->pinned_inprogress_rollback_log, that cached
// pointer is cleared first.
// Returns the result of toku_cachetable_unpin_and_remove (asserted to be 0).
int
toku_delete_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log) {
    CACHEFILE rollback_cf = txn->logger->rollback_cachefile;
    struct brt_header *header = toku_cachefile_get_userdata(rollback_cf);
    // Copy the block number up front; the node itself is not referenced after removal.
    BLOCKNUM blocknum = log->thislogname;

    if (txn->pinned_inprogress_rollback_log == log) {
        txn->pinned_inprogress_rollback_log = NULL;
    }

    int r = toku_cachetable_unpin_and_remove (rollback_cf, blocknum);
    assert(r==0);
    toku_free_blocknum(header->blocktable, &blocknum, header);
    return r;
}
// Walk every rollback log node owned by txn, newest first, applying func to each
// entry and then deleting the node.
//  txn:   transaction whose rollback logs are consumed.
//  yield, yieldv: passed through to func; yield(NULL, yieldv) is also called after every second applied entry.
//  lsn:   passed through to func.
//  func:  per-entry callback (commit or abort); may be NULL, in which case the
//         nodes are deleted without applying any entries.
// Returns 0 on success, or the first nonzero value returned by func.
// NOTE(review): on a nonzero return from func we return immediately, before
// toku_delete_rollback_log is reached, so the node pinned for this iteration is
// neither unpinned nor deleted on that path -- confirm callers treat this as fatal.
static int
toku_apply_txn (TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn,
apply_rollback_item func) {
int r = 0;
// do the commit/abort calls and free everything
// we do the commit/abort calls in reverse order too.
struct roll_entry *item;
//printf("%s:%d abort\n", __FILE__, __LINE__);
int count=0;
BLOCKNUM next_log = ROLLBACK_NONE;
uint32_t next_log_hash = 0;
BOOL is_current = FALSE;
// Start at the in-progress log if there is one, otherwise at the spilled tail.
// If neither exists next_log stays ROLLBACK_NONE and the loop below does nothing.
if (txn_has_inprogress_rollback_log(txn)) {
next_log = txn->current_rollback;
next_log_hash = txn->current_rollback_hash;
is_current = TRUE;
}
else if (txn_has_spilled_rollback_logs(txn)) {
next_log = txn->spilled_rollback_tail;
next_log_hash = txn->spilled_rollback_tail_hash;
}
// Each node pinned below is requested with sequence last_sequence-1, so the
// sequence numbers must decrease by exactly one as we follow the 'older' links.
uint64_t last_sequence = txn->num_rollback_nodes;
BOOL found_head = FALSE;
while (next_log.b != ROLLBACK_NONE.b) {
ROLLBACK_LOG_NODE log;
//pin log
r = toku_get_and_pin_rollback_log(txn, txn->txnid64, last_sequence-1, next_log, next_log_hash, &log);
assert(r==0);
last_sequence = log->sequence;
if (func) {
// Pop entries off the node newest-to-oldest via the prev links.
while ((item=log->newest_logentry)) {
log->newest_logentry = item->prev;
r = func(txn, item, yield, yieldv, lsn);
if (r!=0) return r;
count++;
if (count%2 == 0) yield(NULL, yieldv);
}
}
// The spilled head must be seen at most once and must carry sequence 0.
if (next_log.b == txn->spilled_rollback_head.b) {
assert(!found_head);
found_head = TRUE;
assert(log->sequence == 0);
}
// Read the link to the next older node before the current node is deleted below.
next_log = log->older;
next_log_hash = log->older_hash;
{
//Clean up transaction structure to prevent
//toku_txn_close from double-freeing
if (is_current) {
txn->current_rollback = ROLLBACK_NONE;
txn->current_rollback_hash = 0;
is_current = FALSE;
}
else {
txn->spilled_rollback_tail = next_log;
txn->spilled_rollback_tail_hash = next_log_hash;
}
if (found_head) {
// The head is the oldest node, so nothing may follow it.
assert(next_log.b == ROLLBACK_NONE.b);
txn->spilled_rollback_head = next_log;
txn->spilled_rollback_head_hash = next_log_hash;
}
}
//Unpins log
r = toku_delete_rollback_log(txn, log);
assert(r==0);
}
return r;
}
void toku_rollback_txn_close (TOKUTXN txn) {
{
//Clean up all rollback logs if they exist.
//Note: This will NOT cleanup any rollback logs as in 'rollinclude'
int r = toku_apply_txn(txn, NULL, NULL, ZERO_LSN, NULL);
assert(r==0);
}
assert(txn->spilled_rollback_head.b == ROLLBACK_NONE.b);
assert(txn->spilled_rollback_tail.b == ROLLBACK_NONE.b);
assert(txn->current_rollback.b == ROLLBACK_NONE.b);
{
//Remove txn from list (omt) of live transactions
OMTVALUE txnagain;
@ -86,20 +182,16 @@ void toku_rollback_txn_close (TOKUTXN txn) {
return;
}
void* toku_malloc_in_rollback(TOKUTXN txn, size_t size) {
return malloc_in_memarena(txn->rollentry_arena, size);
void* toku_malloc_in_rollback(ROLLBACK_LOG_NODE log, size_t size) {
return malloc_in_memarena(log->rollentry_arena, size);
}
void *toku_memdup_in_rollback(TOKUTXN txn, const void *v, size_t len) {
void *r=toku_malloc_in_rollback(txn, len);
void *toku_memdup_in_rollback(ROLLBACK_LOG_NODE log, const void *v, size_t len) {
void *r=toku_malloc_in_rollback(log, len);
memcpy(r,v,len);
return r;
}
char *toku_strdup_in_rollback(TOKUTXN txn, const char *s) {
return toku_memdup_in_rollback(txn, s, strlen(s)+1);
}
static int note_brt_used_in_txns_parent(OMTVALUE brtv, u_int32_t UU(index), void*txnv) {
TOKUTXN child = txnv;
TOKUTXN parent = child->parent;
@ -107,215 +199,246 @@ static int note_brt_used_in_txns_parent(OMTVALUE brtv, u_int32_t UU(index), void
int r = toku_txn_note_brt(parent, brt);
if (r==0 &&
brt->h->txnid_that_created_or_locked_when_empty == toku_txn_get_txnid(child)) {
//Pass magic "no rolltmp needed" flag to parent.
//Pass magic "no rollback needed" flag to parent.
brt->h->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(parent);
}
return r;
}
//Commit each entry in the rollback (rolltmp) log.
//Commit each entry in the rollback log.
//If the transaction has a parent, it just promotes its information to its parent.
int toku_rollback_commit(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn) {
int r=0;
if (txn->parent!=0) {
// First we must put a rollinclude entry into the parent if we have a rollentry file.
if (txn->rollentry_filename) {
int len = strlen(txn->rollentry_filename);
// Don't have to strdup the rollentry_filename because
// we take ownership of it.
BYTESTRING fname = {len, toku_strdup_in_rollback(txn, txn->rollentry_filename)};
r = toku_logger_save_rollback_rollinclude(txn->parent, fname);
// First we must put a rollinclude entry into the parent if we spilled
if (txn_has_spilled_rollback_logs(txn)) {
uint64_t num_nodes = txn->num_rollback_nodes;
if (txn_has_inprogress_rollback_log(txn)) {
num_nodes--; //Don't count the in-progress rollback log.
}
r = toku_logger_save_rollback_rollinclude(txn->parent, txn->txnid64, num_nodes,
txn->spilled_rollback_head, txn->spilled_rollback_head_hash,
txn->spilled_rollback_tail, txn->spilled_rollback_tail_hash);
if (r!=0) return r;
r = close(txn->rollentry_fd);
if (r!=0) {
//TODO: #2249.. this is a panic/crash situation
// If the rolltmp file is necessary for a checkpoint
// we CANNOT delete it!
// For now.. delete it, but figure out how to deal with this later.
// Maybe we should just assert that the close succeeds?
// We have to do the unlink ourselves, and then
// set txn->rollentry_filename=0 so that the cleanup
// won't try to close the fd again.
char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, txn->rollentry_filename);
r = unlink(fname_in_cwd);
assert(r==0); //Can we assert this at this point?
unlink(txn->rollentry_filename);
toku_free(txn->rollentry_filename);
toku_free(fname_in_cwd);
txn->rollentry_filename = 0;
return r;
//Remove ownership from child.
txn->spilled_rollback_head = ROLLBACK_NONE;
txn->spilled_rollback_head_hash = 0;
txn->spilled_rollback_tail = ROLLBACK_NONE;
txn->spilled_rollback_tail_hash = 0;
}
if (txn_has_inprogress_rollback_log(txn)) {
ROLLBACK_LOG_NODE parent_log;
//Pin parent log
r = toku_get_and_pin_rollback_log_for_new_entry(txn->parent, &parent_log);
assert(r==0);
ROLLBACK_LOG_NODE child_log;
//Pin child log
r = toku_get_and_pin_rollback_log(txn, txn->txnid64, txn->num_rollback_nodes - 1,
txn->current_rollback, txn->current_rollback_hash,
&child_log);
assert(r==0);
// Append the list to the front of the parent.
if (child_log->oldest_logentry) {
// There are some entries, so link them in.
child_log->oldest_logentry->prev = parent_log->newest_logentry;
if (!parent_log->oldest_logentry) {
parent_log->oldest_logentry = child_log->oldest_logentry;
}
parent_log->newest_logentry = child_log->newest_logentry;
parent_log->rollentry_resident_bytecount += child_log->rollentry_resident_bytecount;
txn->parent->rollentry_raw_count += txn->rollentry_raw_count;
child_log->rollentry_resident_bytecount = 0;
}
// Stop the cleanup from closing and unlinking the file.
toku_free(txn->rollentry_filename);
txn->rollentry_filename = 0;
}
// Append the list to the front of the parent.
if (txn->oldest_logentry) {
// There are some entries, so link them in.
txn->oldest_logentry->prev = txn->parent->newest_logentry;
if (txn->parent->newest_logentry) {
txn->parent->newest_logentry->next = txn->oldest_logentry;
} else {
txn->parent->oldest_logentry = txn->oldest_logentry;
if (parent_log->oldest_logentry==NULL) {
parent_log->oldest_logentry = child_log->oldest_logentry;
}
txn->parent->newest_logentry = txn->newest_logentry;
txn->parent->rollentry_resident_bytecount += txn->rollentry_resident_bytecount;
txn->parent->rollentry_raw_count += txn->rollentry_raw_count;
txn->rollentry_resident_bytecount = 0;
}
if (txn->parent->oldest_logentry==0) {
txn->parent->oldest_logentry = txn->oldest_logentry;
}
txn->newest_logentry = txn->oldest_logentry = 0;
// Put all the memarena data into the parent.
if (memarena_total_size_in_use(txn->rollentry_arena) > 0) {
// If there are no bytes to move, then just leave things alone, and let the memory be reclaimed on txn is closed.
memarena_move_buffers(txn->parent->rollentry_arena, txn->rollentry_arena);
child_log->newest_logentry = child_log->oldest_logentry = 0;
// Put all the memarena data into the parent.
if (memarena_total_size_in_use(child_log->rollentry_arena) > 0) {
// If there are no bytes to move, then just leave things alone, and let the memory be reclaimed on txn is closed.
memarena_move_buffers(parent_log->rollentry_arena, child_log->rollentry_arena);
}
//Delete child log (unpins child_log)
r = toku_delete_rollback_log(txn, child_log);
assert(r==0);
txn->current_rollback = ROLLBACK_NONE;
txn->current_rollback_hash = 0;
r = toku_maybe_spill_rollbacks(txn->parent, parent_log); //unpins parent_log
assert(r==0);
}
// Note the open brts, the omts must be merged
r = toku_omt_iterate(txn->open_brts, note_brt_used_in_txns_parent, txn);
assert(r==0);
r = toku_maybe_spill_rollbacks(txn->parent);
assert(r==0);
//If this transaction needs an fsync (if it commits)
//save that in the parent. Since the commit really happens in the root txn.
txn->parent->force_fsync_on_commit |= txn->force_fsync_on_commit;
txn->parent->has_done_work |= txn->has_done_work;
txn->parent->num_rollentries += txn->num_rollentries;
} else {
// do the commit calls and free everything
// we do the commit calls in reverse order too.
{
struct roll_entry *item;
//printf("%s:%d abort\n", __FILE__, __LINE__);
int count=0;
while ((item=txn->newest_logentry)) {
txn->newest_logentry = item->prev;
r = toku_commit_rollback_item(txn, item, yield, yieldv, lsn);
if (r!=0) return r;
count++;
if (count%2 == 0) yield(NULL, yieldv);
}
}
// Read stuff out of the file and execute it.
if (txn->rollentry_filename) {
r = toku_commit_fileentries(txn->rollentry_fd, txn, yield, yieldv, lsn);
}
r = toku_apply_txn(txn, yield, yieldv, lsn, toku_commit_rollback_item);
assert(r==0);
}
return r;
}
int toku_rollback_abort(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn) {
struct roll_entry *item;
int count=0;
int r=0;
while ((item=txn->newest_logentry)) {
txn->newest_logentry = item->prev;
r = toku_abort_rollback_item(txn, item, yield, yieldv, lsn);
if (r!=0)
return r;
count++;
if (count%2 == 0) yield(NULL, yieldv);
}
// Read stuff out of the file and roll it back.
if (txn->rollentry_filename) {
r = toku_rollback_fileentries(txn->rollentry_fd, txn, yield, yieldv, lsn);
assert(r==0);
}
return 0;
int r;
r = toku_apply_txn(txn, yield, yieldv, lsn, toku_abort_rollback_item);
assert(r==0);
return r;
}
// Write something out. Keep trying even if partial writes occur.
// On error: Return negative with errno set.
// On success return nbytes.
// NOTE : duplicated from logger.c - FIX THIS!!!
static int write_it (int fd, const void *bufv, int nbytes) {
toku_os_full_write(fd, bufv, nbytes);
return nbytes;
static size_t
rollback_memory_size(ROLLBACK_LOG_NODE log) {
size_t size = sizeof(*log);
size += memarena_total_memory_size(log->rollentry_arena);
return size;
}
int toku_maybe_spill_rollbacks (TOKUTXN txn) {
// Previously:
// if (txn->rollentry_resident_bytecount>txn->logger->write_block_size) {
// But now we use t
if (memarena_total_memory_size(txn->rollentry_arena) > txn->logger->write_block_size) {
struct roll_entry *item;
ssize_t bufsize = txn->rollentry_resident_bytecount;
char *MALLOC_N(bufsize, buf);
if (bufsize==0) return errno;
struct wbuf w;
wbuf_init(&w, buf, bufsize);
while ((item=txn->oldest_logentry)) {
assert(item->prev==0);
u_int32_t rollback_fsize = toku_logger_rollback_fsize(item);
txn->rollentry_resident_bytecount -= rollback_fsize;
txn->oldest_logentry = item->next;
if (item->next) { item->next->prev=0; }
toku_logger_rollback_wbufwrite(&w, item);
}
assert(txn->rollentry_resident_bytecount==0);
assert((ssize_t)w.ndone==bufsize);
txn->oldest_logentry = txn->newest_logentry = 0;
if (txn->rollentry_fd<0) {
char filenamepart[sizeof("__tokudb_rolltmp.") + 16];
snprintf(filenamepart, sizeof(filenamepart), "__tokudb_rolltmp.%.16"PRIx64, txn->txnid64);
txn->rollentry_filename = toku_xstrdup(filenamepart);
char *rollentry_filename_in_cwd = toku_construct_full_name(2, txn->logger->directory, filenamepart);
txn->rollentry_fd = open(rollentry_filename_in_cwd, O_CREAT+O_RDWR+O_EXCL+O_BINARY, 0600);
int r = errno;
toku_free(rollentry_filename_in_cwd);
if (txn->rollentry_fd == -1) return r;
}
uLongf compressed_len = compressBound(w.ndone);
char *MALLOC_N(compressed_len, compressed_buf);
{
int r = compress2((Bytef*)compressed_buf, &compressed_len,
(Bytef*)buf, w.ndone,
1);
assert(r==Z_OK);
}
{
u_int32_t v = toku_htod32(compressed_len);
ssize_t r = write_it(txn->rollentry_fd, &v, sizeof(v)); assert(r==sizeof(v));
}
{
ssize_t r = write_it(txn->rollentry_fd, compressed_buf, compressed_len);
if (r<0) return r;
assert(r==(ssize_t)compressed_len);
}
{
u_int32_t v = toku_htod32(w.ndone);
ssize_t r = write_it(txn->rollentry_fd, &v, sizeof(v)); assert(r==sizeof(v));
}
{
u_int32_t v = toku_htod32(compressed_len);
ssize_t r = write_it(txn->rollentry_fd, &v, sizeof(v)); assert(r==sizeof(v));
}
toku_free(compressed_buf);
txn->rollentry_filesize+=w.ndone;
toku_free(buf);
static void
toku_rollback_log_free(ROLLBACK_LOG_NODE *log_p) {
ROLLBACK_LOG_NODE log = *log_p;
*log_p = NULL; //Sanitize
// Cleanup the rollback memory
memarena_clear(txn->rollentry_arena);
// Cleanup the rollback memory
memarena_close(&log->rollentry_arena);
toku_free(log);
}
// Cachetable flush callback for rollback log nodes.
//  extraargs must be the owning TOKUTXN (non-NULL); rollback_v is the ROLLBACK_LOG_NODE.
//  write_me: serialize the node to fd, unless the header is already in a panic state.
//            On a serialization error, records the error code and a message in the header
//            (only if no panic has been recorded in the meantime).
//  keep_me:  when false, the in-memory node is freed.
static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname,
                                          void *rollback_v, void *extraargs, long UU(size),
                                          BOOL write_me, BOOL keep_me, BOOL for_checkpoint) {
    assert(extraargs);
    TOKUTXN txn = extraargs;
    ROLLBACK_LOG_NODE log = rollback_v;
    CACHEFILE rollback_cachefile = txn->logger->rollback_cachefile;
    struct brt_header *header = toku_cachefile_get_userdata(rollback_cachefile);
    assert(log->thislogname.b==logname.b);
    assert(rollback_cachefile == cachefile);

    if (write_me && !header->panic) {
        int n_workitems, n_threads;
        toku_cachefile_get_workqueue_load(cachefile, &n_workitems, &n_threads);
        int write_err = toku_serialize_rollback_log_to(fd, log->thislogname, log, header,
                                                       n_workitems, n_threads, for_checkpoint);
        if (write_err && header->panic==0) {
            char *errstr = strerror(write_err);
            int msglen = 200 + strlen(errstr);
            char msg[msglen];
            header->panic = write_err;
            snprintf(msg, msglen-1, "While writing data to disk, error %d (%s)", write_err, errstr);
            header->panic_string = toku_strdup(msg);
        }
    }

    if (!keep_me) {
        toku_rollback_log_free(&log);
    }
}
// Cachetable fetch callback for rollback log nodes.
//  extraargs must be the owning TOKUTXN (non-NULL).
// Deserializes block logname from fd into *rollback_pv and, on success, stores the
// node's in-memory size in *sizep. Returns the deserializer's result.
static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t fullhash,
                                         void **rollback_pv, long *sizep, void *extraargs) {
    assert(extraargs);
    TOKUTXN txn = extraargs;
    CACHEFILE rollback_cachefile = txn->logger->rollback_cachefile;
    struct brt_header *header = toku_cachefile_get_userdata(rollback_cachefile);
    assert(rollback_cachefile == cachefile);

    ROLLBACK_LOG_NODE *log_p = (ROLLBACK_LOG_NODE*)rollback_pv;
    int r = toku_deserialize_rollback_log_from(fd, logname, fullhash, log_p, txn, header);
    if (r != 0) {
        return r;
    }
    *sizep = rollback_memory_size(*log_p);
    return 0;
}
// Create an empty rollback log node for txn, linked to the given older node,
// put it in the rollback cachefile, and make it txn's current (in-progress) log.
//  older, older_hash: block number and hash stored as the new node's link to the previous node.
//  result: receives the new node.
// Consumes one sequence number from txn->num_rollback_nodes. Always returns 0
// (allocation and cachetable insertion failures are asserted against).
static int toku_create_new_rollback_log (TOKUTXN txn, BLOCKNUM older, uint32_t older_hash, ROLLBACK_LOG_NODE *result) {
    TAGMALLOC(ROLLBACK_LOG_NODE, log);
    assert(log);
    CACHEFILE rollback_cf = txn->logger->rollback_cachefile;
    struct brt_header *header = toku_cachefile_get_userdata(rollback_cf);

    // Layout/version bookkeeping; a fresh node starts out dirty.
    log->layout_version                = BRT_LAYOUT_VERSION;
    log->layout_version_original       = BRT_LAYOUT_VERSION;
    log->layout_version_read_from_disk = BRT_LAYOUT_VERSION;
    log->dirty                         = TRUE;

    // Identity: owning transaction, position in the sequence, and block name/hash.
    log->txnid    = txn->txnid64;
    log->sequence = txn->num_rollback_nodes++;
    toku_allocate_blocknum(header->blocktable, &log->thislogname, header);
    log->thishash = toku_cachetable_hash(rollback_cf, log->thislogname);

    // Link to the previous node.
    log->older      = older;
    log->older_hash = older_hash;

    // No entries yet.
    log->oldest_logentry              = NULL;
    log->newest_logentry              = NULL;
    log->rollentry_arena              = memarena_create();
    log->rollentry_resident_bytecount = 0;

    *result = log;
    int r = toku_cachetable_put(rollback_cf, log->thislogname, log->thishash,
                                log, rollback_memory_size(log),
                                toku_rollback_flush_callback, toku_rollback_fetch_callback,
                                txn);
    assert(r==0);

    txn->current_rollback               = log->thislogname;
    txn->current_rollback_hash          = log->thishash;
    txn->pinned_inprogress_rollback_log = log;
    return 0;
}
int toku_read_rollback_backwards(BREAD br, struct roll_entry **item, MEMARENA ma) {
u_int32_t nbytes_n; ssize_t sr;
if ((sr=bread_backwards(br, &nbytes_n, 4))!=4) { assert(sr<0); return errno; }
u_int32_t n_bytes=toku_dtoh32(nbytes_n);
unsigned char *buf = malloc_in_memarena(ma, n_bytes);
if (buf==0) return errno;
if ((sr=bread_backwards(br, buf, n_bytes-4))!=(ssize_t)n_bytes-4) { assert(sr<0); return errno; }
int r = toku_parse_rollback(buf, n_bytes, item, ma);
if (r!=0) return r;
return 0;
// Unpin a rollback log node in the rollback cachefile, reporting its dirty flag
// and current in-memory size. If the node is the one cached in
// txn->pinned_inprogress_rollback_log, that cached pointer is cleared.
// Returns the result of toku_cachetable_unpin (asserted to be 0).
int
toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log) {
    CACHEFILE rollback_cf = txn->logger->rollback_cachefile;

    if (txn->pinned_inprogress_rollback_log == log) {
        txn->pinned_inprogress_rollback_log = NULL;
    }

    enum cachetable_dirty dirty = (enum cachetable_dirty)log->dirty;
    size_t node_size = rollback_memory_size(log);
    int r = toku_cachetable_unpin(rollback_cf, log->thislogname, log->thishash,
                                  dirty, node_size);
    assert(r==0);
    return r;
}
// Spill txn's current rollback log if its resident entry bytes exceed the
// logger's write_block_size.
//Requires: log is pinned
//          log is txn's current rollback log
//After:
//          If a spill happened: log is unpinned, it has become the spilled tail
//          (and also the spilled head if it is the first spill), and txn has no current log.
//          Otherwise nothing changes and log stays pinned.
// Returns 0 if nothing was spilled, otherwise the result of the unpin (asserted to be 0).
int toku_maybe_spill_rollbacks (TOKUTXN txn, ROLLBACK_LOG_NODE log) {
    if (log->rollentry_resident_bytecount <= txn->logger->write_block_size) {
        //Below the threshold: nothing to do.
        return 0;
    }

    assert(log->thislogname.b == txn->current_rollback.b);

    BOOL first_spill = !txn_has_spilled_rollback_logs(txn);
    if (first_spill) {
        //Nothing spilled before, so this log is also the head of the spilled chain.
        txn->spilled_rollback_head      = txn->current_rollback;
        txn->spilled_rollback_head_hash = txn->current_rollback_hash;
    }
    //The spilled log always becomes the new tail.
    txn->spilled_rollback_tail      = txn->current_rollback;
    txn->spilled_rollback_tail_hash = txn->current_rollback_hash;

    //The transaction no longer has an in-progress log.
    txn->current_rollback      = ROLLBACK_NONE;
    txn->current_rollback_hash = 0;

    int r = toku_rollback_log_unpin(txn, log);
    assert(r==0);
    return r;
}
//Heaviside function to find a TOKUTXN by TOKUTXN (used to find the index)
@ -452,7 +575,7 @@ static void note_txn_closing (TOKUTXN txn) {
}
// Return the number of bytes that went into the rollback data structure (the uncompressed count if there is compression)
int toku_logger_txn_rolltmp_raw_count(TOKUTXN txn, u_int64_t *raw_count)
int toku_logger_txn_rollback_raw_count(TOKUTXN txn, u_int64_t *raw_count)
{
*raw_count = txn->rollentry_raw_count;
return 0;
@ -466,3 +589,60 @@ int toku_txn_find_by_xid (BRT brt, TXNID xid, TOKUTXN *txnptr) {
if (r == 0) *txnptr = txnv;
return r;
}
// Obtain a pinned rollback log node by block name.
//  xid, sequence: expected owner and sequence number; asserted against the node found.
//  name, hash:    block number (must not be ROLLBACK_NONE) and its cachetable hash.
//  result:        receives the node on success.
// If name is txn's current rollback log and the txn already holds a cached pinned
// pointer for it, that pointer is reused instead of going to the cachetable; in either
// case the node is then remembered as txn's pinned in-progress log.
// Returns 0 on success, otherwise the cachetable error (asserted against).
int toku_get_and_pin_rollback_log(TOKUTXN txn, TXNID xid, uint64_t sequence, BLOCKNUM name, uint32_t hash, ROLLBACK_LOG_NODE *result) {
    assert(name.b != ROLLBACK_NONE.b);

    ROLLBACK_LOG_NODE log = NULL;
    BOOL is_inprogress = FALSE;
    if (name.b == txn->current_rollback.b) {
        assert(hash == txn->current_rollback_hash);
        is_inprogress = TRUE;
        log = txn->pinned_inprogress_rollback_log;
    }

    if (log == NULL) {
        //Not cached in the txn: pin it through the cachetable.
        void *node_v;
        int r = toku_cachetable_get_and_pin(txn->logger->rollback_cachefile, name, hash,
                                            &node_v, NULL,
                                            toku_rollback_flush_callback, toku_rollback_fetch_callback,
                                            txn);
        assert(r==0);
        if (r != 0) {
            return r;
        }
        log = (ROLLBACK_LOG_NODE)node_v;
    }

    assert(log->thislogname.b == name.b);
    assert(log->txnid == xid);
    assert(log->sequence == sequence);
    if (is_inprogress) {
        txn->pinned_inprogress_rollback_log = log;
    }
    *result = log;
    return 0;
}
// Obtain the pinned rollback log node that a new entry for txn should go into.
// Uses txn's in-progress log when there is one; otherwise creates a new node whose
// 'older' link is the spilled tail (ROLLBACK_NONE if this is the very first node).
// On success stores the node in *result and returns 0.
int toku_get_and_pin_rollback_log_for_new_entry (TOKUTXN txn, ROLLBACK_LOG_NODE *result) {
    ROLLBACK_LOG_NODE log;
    int r = txn_has_inprogress_rollback_log(txn)
        ? toku_get_and_pin_rollback_log(txn, txn->txnid64, txn->num_rollback_nodes-1,
                                        txn->current_rollback, txn->current_rollback_hash, &log)
        : toku_create_new_rollback_log(txn, txn->spilled_rollback_tail, txn->spilled_rollback_tail_hash, &log);
    assert(r==0);
    if (r != 0) {
        return r;
    }

    assert(log->txnid == txn->txnid64);
    assert(log->thislogname.b != ROLLBACK_NONE.b);
    *result = log;
    return 0;
}

View file

@ -13,19 +13,24 @@ void toku_poll_txn_progress_function(TOKUTXN txn, uint8_t is_commit, uint8_t sta
int toku_rollback_commit(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn);
int toku_rollback_abort(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn);
void toku_rollback_txn_close (TOKUTXN txn);
int toku_get_and_pin_rollback_log_for_new_entry (TOKUTXN txn, ROLLBACK_LOG_NODE *result);
int toku_get_and_pin_rollback_log(TOKUTXN txn, TXNID xid, uint64_t sequence, BLOCKNUM name, uint32_t hash, ROLLBACK_LOG_NODE *result);
int toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log);
int toku_delete_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log);
typedef int(*apply_rollback_item)(TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);
int toku_commit_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);
int toku_abort_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);
void *toku_malloc_in_rollback(TOKUTXN txn, size_t size);
void *toku_memdup_in_rollback(TOKUTXN txn, const void *v, size_t len);
char *toku_strdup_in_rollback(TOKUTXN txn, const char *s);
int toku_maybe_spill_rollbacks (TOKUTXN txn);
void *toku_malloc_in_rollback(ROLLBACK_LOG_NODE log, size_t size);
void *toku_memdup_in_rollback(ROLLBACK_LOG_NODE log, const void *v, size_t len);
int toku_maybe_spill_rollbacks (TOKUTXN txn, ROLLBACK_LOG_NODE log);
int toku_txn_note_brt (TOKUTXN txn, BRT brt);
int toku_txn_note_swap_brt (BRT live, BRT zombie);
int toku_txn_note_close_brt (BRT brt);
int toku_logger_txn_rolltmp_raw_count(TOKUTXN txn, u_int64_t *raw_count);
int toku_logger_txn_rollback_raw_count(TOKUTXN txn, u_int64_t *raw_count);
int toku_txn_find_by_xid (BRT brt, TXNID xid, TOKUTXN *txnptr);
@ -36,4 +41,23 @@ int toku_commit_fileentries (int fd, TOKUTXN txn, YIELDF yield,void *yieldv, LSN
//Heaviside function to find a TOKUTXN by TOKUTXN (used to find the index)
int find_xid (OMTVALUE v, void *txnv);
// One node (chunk) of a transaction's rollback log, stored as a block in the
// rollback cachefile. Nodes of one transaction are chained newest-to-oldest
// through the 'older' link.
struct rollback_log_node {
enum typ_tag tag;
int layout_version; // Set to BRT_LAYOUT_VERSION when the node is created.
int layout_version_original; // Set to BRT_LAYOUT_VERSION when the node is created.
int layout_version_read_from_disk; // Set to BRT_LAYOUT_VERSION when the node is created.
int dirty; // Passed to the cachetable (as enum cachetable_dirty) on unpin; TRUE for a new node.
TXNID txnid; // Which transaction made this?
uint64_t sequence; // Which rollback log in the sequence is this?
BLOCKNUM thislogname; // Which block number is this chunk of the log?
uint32_t thishash; // Cachetable hash of thislogname.
BLOCKNUM older; // Which block number is the next oldest chunk of the log?
uint32_t older_hash; // Cachetable hash that goes with 'older'.
struct roll_entry *oldest_logentry; // Oldest entry held in this node (NULL if empty).
struct roll_entry *newest_logentry; // Newest entry held in this node; entries are walked via ->prev.
MEMARENA rollentry_arena; // Arena that toku_malloc_in_rollback allocates from for this node.
size_t rollentry_resident_bytecount; // How many bytes for the rollentries that are stored in main memory.
};
#endif

View file

@ -58,15 +58,6 @@ check_ok:
test 0 = 0 $(SUMMARIZE_CMD)
ifeq ($(TOKU_SKIP_1305),1)
check_test1305:
@echo SKIPPED SLOW TEST $@
else
# Don't run 1305 under valgrind. It takes way too long.
check_test1305$(BINSUF): test1305$(BINSUF)
./$< $(VERBVERBOSE) $(SUMMARIZE_CMD)
endif
ifeq ($(TOKU_SKIP_MINICRON),1)
check_minicron-test$(BINSUF):
@echo "SKIPPED TEST $@ (Fails in win64 vm due to timing issues)"

View file

@ -1,80 +0,0 @@
/* Test bread by writing random data and then reading it using bread_backwards() to see if it gives the right answer.
* See test_1305 for another bread test (testing to see if it can read 1GB files) */
#include "test.h"
#include <toku_portability.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include "../brttypes.h"
#include "../bread.h"
#define FNAME "bread-test.data"
#define RECORDS 20
#define RECORDLEN 100
char buf[RECORDS][RECORDLEN];
int sizes[RECORDS];
int sizesn[RECORDS];
int nwrote=0;
char wrotedata[RECORDS*RECORDLEN];
// One round of the bread test, driven by the given random seed:
//  1. Write RECORDS records of random length (1..RECORDLEN) and random content to FNAME.
//     Each record is written as: compressed length, compressed bytes, uncompressed length,
//     compressed length (all lengths as 4-byte disk-order integers).
//  2. Read the file backwards in random-sized pieces with bread_backwards() and check
//     that the bytes match what was written (tracked in wrotedata/nwrote).
static void
test (int seed) {
    srandom(seed);
    unlink(FNAME);

    // Phase 1: write the records.
    int write_fd = open(FNAME, O_CREAT+O_RDWR+O_BINARY, 0777);
    assert(write_fd>=0);
    for (int rec = 0; rec < RECORDS; rec++) {
        sizes[rec] = 1+ random()%RECORDLEN;
        sizesn[rec] = toku_htod32(sizes[rec]);
        for (int k = 0; k < sizes[rec]; k++) {
            char byte = (char)random();
            wrotedata[nwrote++] = byte;
            buf[rec][k] = byte;
        }

        uLongf compressed_size = compressBound(sizes[rec]);
        Bytef compressed_buf[compressed_size];
        int zr = compress2(compressed_buf, &compressed_size, (Bytef*)(buf[rec]), sizes[rec], 1);
        assert(zr==Z_OK);

        u_int32_t compressed_size_n = toku_htod32(compressed_size);
        toku_os_full_write(write_fd, &compressed_size_n, 4);
        toku_os_full_write(write_fd, compressed_buf, compressed_size);
        toku_os_full_write(write_fd, &sizesn[rec], 4); // the uncompressed size
        toku_os_full_write(write_fd, &compressed_size_n, 4);
    }
    {
        int r = close(write_fd);
        assert(r==0);
    }

    // Phase 2: now read it all backward and compare against wrotedata.
    int read_fd = open(FNAME, O_RDONLY+O_BINARY);
    assert(read_fd>=0);
    BREAD br = create_bread_from_fd_initialize_at(read_fd);
    while (bread_has_more(br)) {
        assert(nwrote>0);
        int chunk = 1+(random()%RECORDLEN); // read from 1 to 100 (if RECORDLEN is 100)
        if (chunk > nwrote) {
            chunk = nwrote;
        }
        char rbuf[chunk];
        int nread = bread_backwards(br, rbuf, chunk);
        assert(nread==chunk);
        assert(memcmp(rbuf, &wrotedata[nwrote-chunk], chunk)==0);
        nwrote -= chunk;
    }
    assert(nwrote==0);

    {
        int r = close_bread_without_closing_fd(br);
        assert(r==0);
    }
    {
        int r = close(read_fd);
        assert(r==0);
    }
    unlink(FNAME);
}
// Test entry point: runs the round-trip test once for each seed 0 through 9.
// The command-line arguments are ignored.
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
    int seed = 0;
    while (seed < 10) {
        test(seed);
        seed++;
    }
    return 0;
}

View file

@ -102,7 +102,7 @@ static void checkpoint_pending(void) {
char fname1[] = __FILE__ "test1.dat";
r = unlink(fname1); if (r!=0) CKERR2(errno, ENOENT);
r = toku_cachetable_openf(&cf, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
toku_cachefile_set_userdata(cf, NULL, NULL, NULL, NULL, NULL, NULL,
toku_cachefile_set_userdata(cf, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
dummy_pin_unpin, dummy_pin_unpin);
// Insert items into the cachetable. All dirty.

View file

@ -60,7 +60,7 @@ static void cachetable_checkpoint_test(int n, enum cachetable_dirty dirty) {
unlink(fname1);
CACHEFILE f1;
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL,
toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
dummy_pin_unpin, dummy_pin_unpin);
// insert items into the cachetable. all should be dirty

View file

@ -46,7 +46,7 @@ static void cachetable_prefetch_checkpoint_test(int n, enum cachetable_dirty dir
unlink(fname1);
CACHEFILE f1;
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL,
toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
dummy_pin_unpin, dummy_pin_unpin);
// prefetch block n+1. this will take 10 seconds.

View file

@ -5,7 +5,7 @@
#define TESTDIR "dir." __FILE__
static const int magic_begin_end_checkpoint_sz = 77; // leave this many bytes in file
static const int magic_begin_end_checkpoint_sz = 85; // leave this many bytes in file
static int
run_test(void) {
@ -28,12 +28,12 @@ run_test(void) {
LSN beginlsn;
// all logs must contain a valid checkpoint
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0);
r = toku_log_comment(logger, NULL, TRUE, 0, world); assert(r == 0);
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0);
r = toku_log_comment(logger, NULL, TRUE, 0, there); assert(r == 0);
r = toku_logger_close(&logger); assert(r == 0);

View file

@ -21,7 +21,7 @@ run_test(void) {
// add begin checkpoint, end checkpoint
LSN beginlsn;
r = toku_log_begin_checkpoint(logger, &beginlsn, FALSE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_logger_close(&logger); assert(r == 0);
// add hello

View file

@ -21,7 +21,7 @@ run_test(void) {
r = toku_logger_create(&logger); assert(r == 0);
r = toku_logger_open(TESTDIR, logger); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_logger_close(&logger); assert(r == 0);
// run recovery

View file

@ -17,7 +17,7 @@ run_test(void) {
r = toku_logger_open(TESTDIR, logger); assert(r == 0);
LSN beginlsn;
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_logger_close(&logger); assert(r == 0);

View file

@ -20,7 +20,7 @@ run_test(void) {
LSN firstbegin = ZERO_LSN;
r = toku_log_begin_checkpoint(logger, &firstbegin, TRUE, 0); assert(r == 0);
assert(firstbegin.lsn != ZERO_LSN.lsn);
r = toku_log_end_checkpoint(logger, NULL, FALSE, firstbegin.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, FALSE, firstbegin.lsn, 0, 0, 0); assert(r == 0);
r = toku_log_begin_checkpoint(logger, NULL, TRUE, 0); assert(r == 0);
r = toku_logger_close(&logger); assert(r == 0);

View file

@ -19,7 +19,7 @@ run_test(void) {
r = toku_logger_open(TESTDIR, logger); assert(r == 0);
LSN beginlsn;
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
BYTESTRING iname = { strlen("missing_tokudb_file"), "missing_tokudb_file" };
FILENUM filenum = {42};

View file

@ -21,7 +21,7 @@ run_test(void) {
r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0);
LSN beginlsn;
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0);
BYTESTRING there = { strlen("there"), "there" };
r = toku_log_comment(logger, NULL, TRUE, 0, there); assert(r == 0);

View file

@ -30,7 +30,7 @@ run_test(void) {
LSN beginlsn;
r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0);
r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0);
r = toku_logger_close(&logger); assert(r == 0);

View file

@ -1,106 +0,0 @@
/* Test bread_backwards to make sure it can read backwards even for large files. */
#include <toku_portability.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>
#include "../brttypes.h"
#include "../bread.h"
#include "test.h"
#define FNAME "test1305.data"
// The buffer size in units of 64-bit integers.
#define N_BIGINTS (1<<20)
#define BIGINT_SIZE (sizeof(u_int64_t))
// How big is the readback buffer (in 8-bit integers)?
#define READBACK_BUFSIZE (1<<20)
// Large-file test for bread_backwards().
// Writes fsize bytes' worth of consecutive 64-bit integers (0, 1, 2, ...) to FNAME as a series of
// zlib-compressed chunks of N_BIGINTS integers each, then reads the file backward with
// bread_backwards() and checks that the integers come back in exactly descending order.
// fsize must be a multiple of N_BIGINTS*sizeof(u_int64_t).
static void
test (u_int64_t fsize) {
unlink(FNAME);
// Create a file of size fsize. Fill it with 8-byte values which are integers, in order.
assert(fsize%(N_BIGINTS*sizeof(u_int64_t)) == 0); // Make sure the fsize is a multiple of the buffer size.
u_int64_t i = 0; // next integer to write; after the write phase it equals the number of integers written
{
int fd = open(FNAME, O_CREAT+O_RDWR+O_BINARY, 0777);
assert(fd>=0);
static u_int64_t buf[N_BIGINTS]; //windows cannot handle this on the stack
// NOTE: this buffer is 2*N_BIGINTS+1000 bytes, which is smaller than sizeof(buf) (8*N_BIGINTS bytes)
// and therefore smaller than compressBound(sizeof(buf)).  If a chunk ever failed to fit,
// compress2 would not return Z_OK and the assert below would fire.
static char compressed_buf[N_BIGINTS*2 + 1000]; // NOT a compressBound-sized buffer; see note above
uLongf compressed_len;
while (i*BIGINT_SIZE < fsize) {
if (verbose>0 && i % (1<<25) == 0) {
printf(" %s:test (%"PRIu64") forwards [%"PRIu64"%%]\n", __FILE__, fsize, 100*BIGINT_SIZE*((u_int64_t)i) / fsize);
fflush(stdout);
}
int j;
for (j=0; j<N_BIGINTS; j++) {
buf[j] = i++;
}
assert(sizeof(buf) == N_BIGINTS * BIGINT_SIZE);
{
// compressed_len is in/out: capacity of compressed_buf going in, bytes produced coming out.
compressed_len = sizeof(compressed_buf);
int r = compress2((Bytef*)compressed_buf, &compressed_len, (Bytef*)buf, sizeof(buf), 1);
assert(r==Z_OK);
}
// On-disk chunk layout: compressed length, compressed bytes, uncompressed length, compressed length again.
{
u_int32_t v = toku_htod32(compressed_len);
toku_os_full_write(fd, &v, sizeof(v));
}
{
toku_os_full_write(fd, compressed_buf, compressed_len);
}
{
u_int32_t v = toku_htod32(sizeof(buf));
toku_os_full_write(fd, &v, sizeof(v));
}
{
u_int32_t v = toku_htod32(compressed_len);
toku_os_full_write(fd, &v, sizeof(v));
}
}
{ int r = close(fd); assert(r==0); }
}
assert(i*BIGINT_SIZE == fsize);
// Now read it all backward
{
int fd = open(FNAME, O_RDONLY+O_BINARY); assert(fd>=0);
BREAD br = create_bread_from_fd_initialize_at(fd);
while (bread_has_more(br)) {
if (verbose>0 && (fsize/BIGINT_SIZE - i) % (1<<25) == 0) {
printf(" %s:test (%"PRIu64") backwards [%"PRIu64"%%]\n", __FILE__, fsize, 100*BIGINT_SIZE*((u_int64_t)i) / fsize);
fflush(stdout);
}
// i counts down from the number of integers written; each 8-byte value read
// backward must be the next smaller integer.
assert(i>0);
i--;
u_int64_t storedi;
{ int r = bread_backwards(br, &storedi, sizeof(storedi)); assert(r==sizeof(storedi)); }
assert(storedi==i);
}
assert(i==0); // every integer that was written has been read back
{ int r=close_bread_without_closing_fd(br); assert(r==0); }
{ int r=close(fd); assert(r==0); }
}
//printf("Did %" PRIu64 "\n", fsize);
//system("ls -l " FNAME);
unlink(FNAME);
}
// Test entry point: parses the standard test arguments, then runs the backward-read test
// on files of 2^23, 2^30, 2^31, 2^32 and 2^33 bytes, in that order.
int
test_main (int argc, const char *argv[]) {
    static const int log2_fsize[] = { 23, 30, 31, 32, 33 };
    const unsigned int n_sizes = sizeof(log2_fsize) / sizeof(log2_fsize[0]);
    unsigned int k;

    default_parse_args(argc, argv);
    for (k = 0; k < n_sizes; k++) {
        test(1LL << log2_fsize[k]);
    }
    return 0;
}

View file

@ -153,6 +153,9 @@ int create_logfiles() {
LSN lsn = {0};
TXNID txnid = 0;
TXNID cp_txnid = 0;
u_int32_t num_fassociate = 0;
u_int32_t num_xstillopen = 0;
bs_aname.len = 4; bs_aname.data="a.db";
bs_bname.len = 4; bs_bname.data="b.db";
@ -171,31 +174,40 @@ int create_logfiles() {
//fcreate 'F': lsn=2 txnid=1 filenum=0 fname={len=4 data="a.db"} mode=0777 treeflags=0 crc=18a3d525 len=49
r = toku_log_fcreate(logger, &lsn, NO_FSYNC, txnid, fn_aname, bs_aname, 0x0777, 0, 0, bs_empty); assert(r==0);
//commit 'C': lsn=3 txnid=1 crc=00001f1e len=29
r = toku_log_commit(logger, &lsn, FSYNC, txnid); assert(r==0);
r = toku_log_xcommit(logger, &lsn, FSYNC, txnid); assert(r==0);
//xbegin 'b': lsn=4 parenttxnid=0 crc=00000a1f len=29
r = toku_log_xbegin(logger, &lsn, NO_FSYNC, 0); assert(r==0); txnid = lsn.lsn;
//fcreate 'F': lsn=5 txnid=4 filenum=1 fname={len=4 data="b.db"} mode=0777 treeflags=0 crc=14a47925 len=49
r = toku_log_fcreate(logger, &lsn, NO_FSYNC, txnid, fn_bname, bs_bname, 0x0777, 0, 0, bs_empty); assert(r==0);
//commit 'C': lsn=6 txnid=4 crc=0000c11e len=29
r = toku_log_commit(logger, &lsn, FSYNC, txnid); assert(r==0);
r = toku_log_xcommit(logger, &lsn, FSYNC, txnid); assert(r==0);
//xbegin 'b': lsn=7 parenttxnid=0 crc=0000f91f len=29
r = toku_log_xbegin(logger, &lsn, NO_FSYNC, 0); assert(r==0); txnid = lsn.lsn;
//enq_insert 'I': lsn=8 filenum=0 xid=7 key={len=2 data="a\000"} value={len=2 data="b\000"} crc=40b863e4 len=45
r = toku_log_enq_insert(logger, &lsn, NO_FSYNC, fn_aname, txnid, bs_a, bs_b); assert(r==0);
//begin_checkpoint 'x': lsn=9 timestamp=1251309957584197 crc=cd067878 len=29
r = toku_log_begin_checkpoint(logger, &lsn, NO_FSYNC, 1251309957584197); assert(r==0); cp_txnid = lsn.lsn;
//xstillopen 's': lsn=10 txnid=7 parent=0 crc=00061816 len=37
r = toku_log_xstillopen(logger, &lsn, NO_FSYNC, txnid, 0); assert(r==0);
//fassociate 'f': lsn=11 filenum=1 fname={len=4 data="b.db"} crc=a7126035 len=33
r = toku_log_fassociate(logger, &lsn, NO_FSYNC, fn_bname, 0, bs_bname); assert(r==0);
num_fassociate++;
//fassociate 'f': lsn=12 filenum=0 fname={len=4 data="a.db"} crc=a70c5f35 len=33
r = toku_log_fassociate(logger, &lsn, NO_FSYNC, fn_aname, 0, bs_aname); assert(r==0);
num_fassociate++;
//xstillopen 's': lsn=10 txnid=7 parent=0 crc=00061816 len=37 <- obsolete
{
FILENUMS filenums = {0, NULL};
r = toku_log_xstillopen(logger, &lsn, NO_FSYNC, txnid, 0,
0, filenums, 0, 0, 0,
ROLLBACK_NONE, ROLLBACK_NONE, ROLLBACK_NONE);
assert(r==0);
}
num_xstillopen++;
//end_checkpoint 'X': lsn=13 txnid=9 timestamp=1251309957586872 crc=cd285c30 len=37
r = toku_log_end_checkpoint(logger, &lsn, FSYNC, cp_txnid, 1251309957586872); assert(r==0);
r = toku_log_end_checkpoint(logger, &lsn, FSYNC, cp_txnid, 1251309957586872, num_fassociate, num_xstillopen); assert(r==0);
//enq_insert 'I': lsn=14 filenum=1 xid=7 key={len=2 data="b\000"} value={len=2 data="a\000"} crc=40388be4 len=45
r = toku_log_enq_insert(logger, &lsn, NO_FSYNC, fn_bname, txnid, bs_b, bs_a); assert(r==0);
//commit 'C': lsn=15 txnid=7 crc=00016d1e len=29
r = toku_log_commit(logger, &lsn, FSYNC, txnid); assert(r==0);
r = toku_log_xcommit(logger, &lsn, FSYNC, txnid); assert(r==0);
// close logger
r = toku_logger_close(&logger); assert(r==0);

View file

@ -12,18 +12,20 @@ int toku_txn_begin_txn (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER log
int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER logger, TXNID xid) {
if (logger->is_panicked) return EINVAL;
assert(logger->rollback_cachefile);
TAGMALLOC(TOKUTXN, result);
if (result==0)
return errno;
int r;
LSN first_lsn;
if (xid == 0) {
r = toku_log_xbegin(logger, &result->first_lsn, 0, parent_tokutxn ? parent_tokutxn->txnid64 : 0);
r = toku_log_xbegin(logger, &first_lsn, 0, parent_tokutxn ? parent_tokutxn->txnid64 : 0);
if (r!=0) goto died;
} else
result->first_lsn.lsn = xid;
first_lsn.lsn = xid;
r = toku_omt_create(&result->open_brts);
if (r!=0) goto died;
result->txnid64 = result->first_lsn.lsn;
result->txnid64 = first_lsn.lsn;
XIDS parent_xids;
if (parent_tokutxn==NULL)
parent_xids = xids_get_root_xids();
@ -33,13 +35,19 @@ int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGE
goto died;
result->logger = logger;
result->parent = parent_tokutxn;
result->oldest_logentry = result->newest_logentry = 0;
result->rollentry_arena = memarena_create();
result->num_rollentries = 0;
result->num_rollentries_processed = 0;
result->progress_poll_fun = NULL;
result->progress_poll_fun_extra = NULL;
result->spilled_rollback_head = ROLLBACK_NONE;
result->spilled_rollback_tail = ROLLBACK_NONE;
result->spilled_rollback_head_hash = 0;
result->spilled_rollback_tail_hash = 0;
result->current_rollback = ROLLBACK_NONE;
result->current_rollback_hash = 0;
result->num_rollback_nodes = 0;
result->pinned_inprogress_rollback_log = NULL;
if (toku_omt_size(logger->live_txns) == 0) {
assert(logger->oldest_living_xid == TXNID_NONE_LIVING);
@ -59,13 +67,9 @@ int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGE
assert(idx > 0);
}
result->rollentry_resident_bytecount=0;
result->rollentry_raw_count = 0;
result->rollentry_filename = 0;
result->rollentry_fd = -1;
result->rollentry_filesize = 0;
result->force_fsync_on_commit = FALSE;
result->has_done_work = FALSE;
result->recovered_from_checkpoint = FALSE;
*tokutxn = result;
return 0;
@ -75,6 +79,36 @@ died:
return r;
}
// Used on recovery to recover a transaction.
// Copies the saved counters and rollback-log block numbers from info into txn, notes every
// BRT listed in info->open_brts on txn, recomputes the hash of each rollback block number
// against the logger's rollback cachefile, and marks txn as recovered from a checkpoint.
// Always returns 0; a failure of toku_txn_note_brt trips an assert instead.
int
toku_txn_load_txninfo (TOKUTXN txn, TXNINFO info) {
    txn->rollentry_raw_count = info->rollentry_raw_count;

    uint32_t which;
    for (which = 0; which < info->num_brts; which++) {
        BRT open_brt = info->open_brts[which];
        int note_result = toku_txn_note_brt(txn, open_brt);
        assert(note_result==0);
    }

    txn->force_fsync_on_commit = info->force_fsync_on_commit;
    txn->num_rollback_nodes = info->num_rollback_nodes;
    txn->num_rollentries = info->num_rollentries;

    // Block numbers come from info; the matching hashes are not stored there and are recomputed here.
    CACHEFILE cf = txn->logger->rollback_cachefile;

    txn->spilled_rollback_head = info->spilled_rollback_head;
    txn->spilled_rollback_head_hash = toku_cachetable_hash(cf, txn->spilled_rollback_head);

    txn->spilled_rollback_tail = info->spilled_rollback_tail;
    txn->spilled_rollback_tail_hash = toku_cachetable_hash(cf, txn->spilled_rollback_tail);

    txn->current_rollback = info->current_rollback;
    txn->current_rollback_hash = toku_cachetable_hash(cf, txn->current_rollback);

    txn->recovered_from_checkpoint = TRUE;
    return 0;
}
// Doesn't close the txn, just performs the commit operations.
int toku_txn_commit_txn(TOKUTXN txn, int nosync, YIELDF yield, void *yieldv,
@ -92,13 +126,13 @@ int toku_txn_commit_with_lsn(TOKUTXN txn, int nosync, YIELDF yield, void *yieldv
// panic handled in log_commit
//Child transactions do not actually 'commit'. They promote their changes to parent, so no need to fsync if this txn has a parent.
int do_fsync = !txn->parent && (txn->force_fsync_on_commit || (!nosync && txn->has_done_work));
int do_fsync = !txn->parent && (txn->force_fsync_on_commit || (!nosync && txn->num_rollentries>0));
txn->progress_poll_fun = poll;
txn->progress_poll_fun_extra = poll_extra;
if (release_locks) release_locks(locks_thunk);
r = toku_log_commit(txn->logger, (LSN*)0, do_fsync, txn->txnid64); // exits holding neither of the tokulogger locks.
r = toku_log_xcommit(txn->logger, (LSN*)0, do_fsync, txn->txnid64); // exits holding neither of the tokulogger locks.
if (reacquire_locks) reacquire_locks(locks_thunk);
if (r!=0)
return r;

View file

@ -7,6 +7,7 @@
int toku_txn_begin_txn (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER logger);
int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER logger, TXNID xid);
int toku_txn_load_txninfo (TOKUTXN txn, TXNINFO info);
int toku_txn_commit_txn (TOKUTXN txn, int nosync, YIELDF yield, void *yieldv,
TXN_PROGRESS_POLL_FUNCTION poll, void *poll_extra,

View file

@ -170,7 +170,6 @@ static inline void wbuf_DISKOFF (struct wbuf *w, DISKOFF off) {
// Append block number b to write buffer w by passing its b field to wbuf_ulonglong().
static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
wbuf_ulonglong(w, b.b);
}
// Append block number b to write buffer w by passing its b field to wbuf_nocrc_ulonglong().
// NOTE(review): presumably the "nocrc" variant skips updating the buffer's running checksum -- confirm.
static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
wbuf_nocrc_ulonglong(w, b.b);
}

View file

@ -60,7 +60,7 @@ int singlex_child = 0; // Do a single transaction, but do all work with a child
int singlex = 0; // Do a single transaction
int singlex_create = 0; // Create the db using the single transaction (only valid if singlex)
int insert1first = 0; // insert 1 before doing the rest
int check_small_rolltmp = 0; // verify that the rollback logs are small (only valid if singlex)
int check_small_rollback = 0; // verify that the rollback logs are small (only valid if singlex)
int do_transactions = 0;
int if_transactions_do_logging = DB_INIT_LOG; // set this to zero if we want no logging when transactions are used
int do_abort = 0;
@ -210,14 +210,14 @@ static void benchmark_shutdown (void) {
#endif
if (do_transactions && singlex && !insert1first && (singlex_create || prelock)) {
#if defined(TOKUDB)
//There should be a single 'truncate' in the rolltmp instead of many 'insert' entries.
//There should be a single 'truncate' in the rollback instead of many 'insert' entries.
struct txn_stat *s;
r = tid->txn_stat(tid, &s);
assert(r==0);
//TODO: #1125 Always do the test after performance testing is done.
if (singlex_child) fprintf(stderr, "SKIPPED 'small rolltmp' test for child txn\n");
if (singlex_child) fprintf(stderr, "SKIPPED 'small rollback' test for child txn\n");
else
assert(s->rolltmp_raw_count < 100); // gross test, not worth investigating details
assert(s->rollback_raw_count < 100); // gross test, not worth investigating details
free(s);
//system("ls -l bench.tokudb");
#endif
@ -374,7 +374,7 @@ static int print_usage (const char *argv0) {
fprintf(stderr, " --singlex-child (implies -x) Run the whole job as a single transaction, do all work a child of that transaction.\n");
fprintf(stderr, " --finish-child-first Commit/abort child before doing so to parent (no effect if no child).\n");
fprintf(stderr, " --singlex-create (implies --singlex) Create the file using the single transaction (Default is to use a different transaction to create.)\n");
fprintf(stderr, " --check_small_rolltmp (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n");
fprintf(stderr, " --check_small_rollback (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n");
fprintf(stderr, " --prelock Prelock the database.\n");
fprintf(stderr, " --prelockflag Prelock the database and send the DB_PRELOCKED_WRITE flag.\n");
fprintf(stderr, " --abort Abort the singlex after the transaction is over. (Requires --singlex.)\n");
@ -463,8 +463,8 @@ int main (int argc, const char *argv[]) {
singlex = 1;
} else if (strcmp(arg, "--insert1first") == 0) {
insert1first = 1;
} else if (strcmp(arg, "--check_small_rolltmp") == 0) {
check_small_rolltmp = 1;
} else if (strcmp(arg, "--check_small_rollback") == 0) {
check_small_rollback = 1;
} else if (strcmp(arg, "--xcount") == 0) {
if (i+1 >= argc) return print_usage(argv[0]);
items_per_transaction = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0);
@ -560,13 +560,13 @@ int main (int argc, const char *argv[]) {
printf("insertions of %d per batch%s\n", items_per_iteration, do_transactions ? " (with transactions)" : "");
}
#if !defined TOKUDB
if (check_small_rolltmp) {
fprintf(stderr, "--check_small_rolltmp only works on the TokuDB (not BDB)\n");
if (check_small_rollback) {
fprintf(stderr, "--check_small_rollback only works on the TokuDB (not BDB)\n");
return print_usage(argv[0]);
}
#endif
if (check_small_rolltmp && !singlex) {
fprintf(stderr, "--check_small_rolltmp requires --singlex\n");
if (check_small_rollback && !singlex) {
fprintf(stderr, "--check_small_rollback requires --singlex\n");
return print_usage(argv[0]);
}
benchmark_setup();

View file

@ -332,6 +332,7 @@ endif
mkdir dir.$*.c.tdb.recover && \
cp dir.$*.c.tdb/tokudb.directory dir.$*.c.tdb.recover/ && \
cp dir.$*.c.tdb/tokudb.environment dir.$*.c.tdb.recover/ && \
cp dir.$*.c.tdb/tokudb.rollback dir.$*.c.tdb.recover/ && \
cp dir.$*.c.tdb/*.tokulog dir.$*.c.tdb.recover/ && \
echo doing recovery &&\
$(VGRIND) ../../newbrt/tdb-recover dir.$*.c.tdb.recover dir.$*.c.tdb.recover && \

View file

@ -60,13 +60,13 @@ static void do_1381_maybe_lock (int do_table_lock, u_int64_t *raw_count) {
}
r = txn->txn_stat(txn, &s2); CKERR(r);
//printf("Raw counts = %" PRId64 ", %" PRId64 "\n", s1->rolltmp_raw_count, s2->rolltmp_raw_count);
//printf("Raw counts = %" PRId64 ", %" PRId64 "\n", s1->rollback_raw_count, s2->rollback_raw_count);
*raw_count = s2->rolltmp_raw_count - s1->rolltmp_raw_count;
*raw_count = s2->rollback_raw_count - s1->rollback_raw_count;
if (do_table_lock) {
assert(s1->rolltmp_raw_count == s2->rolltmp_raw_count);
assert(s1->rollback_raw_count == s2->rollback_raw_count);
} else {
assert(s1->rolltmp_raw_count < s2->rolltmp_raw_count);
assert(s1->rollback_raw_count < s2->rollback_raw_count);
}
toku_free(s1); toku_free(s2);

View file

@ -91,7 +91,7 @@ do_db_work(void) {
}
if (did_fail) goto shutdown2;
// Put an extra item in so that the rolltmp file will be created.
// Put an extra item in
r=env->txn_begin(env, 0, &tid, 0); assert(r==0);
r=db->put(db, tid, dbt_init(&key, "a", 2), dbt_init(&data, "b", 2), 0); DOERR(r);
if (did_fail) {

View file

@ -21,7 +21,7 @@ test_stat64 (unsigned int N) {
DB_TXN *txn;
r = db_env_create(&env, 0); CKERR(r);
r = env->set_cachesize(env, 0, 10*1000000, 1);
r = env->set_cachesize(env, 0, 20*1000000, 1);
r = env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r = db_create(&db, env, 0); CKERR(r);
@ -38,6 +38,10 @@ test_stat64 (unsigned int N) {
unsigned int i;
u_int64_t dsize=0;
for (i=0; i<N; i++) {
if (verbose>1 && i % (1<<14) == 0) {
printf("%s(total=%u) inserted %u so far\n", __FILE__, N, i);
fflush(stdout);
}
char hello[30], there[30];
snprintf(hello, sizeof(hello), "hello%8d", i);
snprintf(there, sizeof(there), "there%d", i);

View file

@ -1,85 +0,0 @@
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#include "test.h"
/* Test for #1324. Make sure rolltmp files are removed. */
#include <db.h>
#include <fcntl.h>
#ifndef USE_TDB
#error This test only works for TokuDB.
#endif
// Open fname for writing (creating it if needed, requesting S_IRWXU|S_IRWXG|S_IRWXO),
// write the six bytes "hello\n" to it, and close it.
// A failed open is reported with perror() and then trips an assert; a failed close also asserts.
static void mkfile (const char *fname) {
    const mode_t all_perms = S_IRWXU|S_IRWXG|S_IRWXO;
    const int open_flags = O_WRONLY | O_CREAT | O_BINARY;

    int fd = open(fname, open_flags, all_perms);
    if (fd < 0) {
        perror("opening");
    }
    assert(fd>=0);

    toku_os_full_write(fd, "hello\n", 6);

    int close_result = close(fd);
    assert(close_result==0);
}
// Check that opening an environment with DB_RECOVER removes a stale rolltmp file but leaves
// unrelated files alone.
//   moreflags: extra flags OR'd into the env open flags (the callers pass DB_INIT_LOG or 0).
// Steps:
//   1. Recreate ENVDIR containing a fake rolltmp file and an unrelated "keepme" file.
//   2. If logging is requested, open and close the env once without DB_RECOVER to create the log.
//   3. Open the env with DB_RECOVER; the rolltmp file must be gone and keepme must still exist.
//   4. Open and close the env once more to make sure it is still usable.
static void
do_1324 (int moreflags)
{
    const char fname[] = ENVDIR "/__tokudb_rolltmp.12345";
    const char fnamekeep[] = ENVDIR "/keepme";
    system("rm -rf " ENVDIR);
    toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
    mkfile(fname);
    mkfile(fnamekeep);
    const int envflags = DB_CREATE|DB_INIT_MPOOL|DB_INIT_TXN|DB_INIT_LOCK |DB_THREAD |DB_PRIVATE | DB_RECOVER | moreflags;

    {
        DB_ENV *env;
        int r;
        if (moreflags & DB_INIT_LOG) {
            // create the log
            r = db_env_create(&env, 0); CKERR(r);
            r = env->open(env, ENVDIR, envflags & ~DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
            r = env->close(env, 0); CKERR(r);
        }
        r = db_env_create(&env, 0); CKERR(r);
        r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
        {
            // The stale rolltmp file must no longer exist: stat is expected to fail.
            toku_struct_stat sbuf;
            r = toku_stat(fname, &sbuf);
            if (r==0) {
                fprintf(stderr, "The rolltmp file %s should have been deleted, but was not.\n", fname);
            }
            assert(r!=0);
        }
        {
            // The unrelated file must still exist: stat is expected to succeed.
            toku_struct_stat sbuf;
            r = toku_stat(fnamekeep, &sbuf);
            if (r!=0) {
                // stat failed, i.e. the file is gone: it WAS deleted.
                fprintf(stderr, "The keepme file %s should NOT have been deleted, but was.\n", fnamekeep);
            }
            assert(r==0);
        }
        r = env->close(env, 0); CKERR(r);
    }
    system("ls -l " ENVDIR);
    // make sure we can open the env again.
    {
        DB_ENV *env;
        int r;
        r = db_env_create(&env, 0); CKERR(r);
        r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
        r = env->close(env, 0); CKERR(r);
    }
}
// Test entry point: parses the standard test arguments, then runs the #1324 check twice,
// first with DB_INIT_LOG and then with no extra flags.
int
test_main (int argc, char *const argv[])
{
    const int extra_flags[2] = { DB_INIT_LOG, 0 };
    int variant;

    parse_args(argc, argv);
    for (variant = 0; variant < 2; variant++) {
        do_1324(extra_flags[variant]);
    }
    return 0;
}

View file

@ -484,13 +484,6 @@ env_setup_real_log_dir(DB_ENV *env) {
}
}
// Thin wrapper: hands the environment's real data and log directories to
// tokudb_recover_delete_rolltmp_files() and returns its result unchanged.
// Both directories must already be set on the environment (asserted).
static int delete_rolltmp_files(DB_ENV *env) {
    assert(env->i->real_data_dir);
    assert(env->i->real_log_dir);
    return tokudb_recover_delete_rolltmp_files(env->i->real_data_dir, env->i->real_log_dir);
}
static int
ydb_do_recovery (DB_ENV *env) {
assert(env->i->real_log_dir);
@ -600,9 +593,9 @@ ydb_recover_log_exists(DB_ENV *env) {
// Set *valid_newenv if creating a new environment (all files missing).
// (Note, if special dictionaries exist, then they were created transactionally and log should exist.)
static int
validate_env(DB_ENV * env, BOOL * valid_newenv) {
validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
int r;
BOOL expect_newenv; // set true if we expect to create a new env
BOOL expect_newenv = FALSE; // set true if we expect to create a new env
toku_struct_stat buf;
char* path = NULL;
@ -610,11 +603,12 @@ validate_env(DB_ENV * env, BOOL * valid_newenv) {
path = toku_construct_full_name(2, env->i->dir, environmentdictionary);
assert(path);
r = toku_stat(path, &buf);
int stat_errno = errno;
toku_free(path);
if (r == 0) {
expect_newenv = FALSE; // persistent info exists
}
else if (errno == ENOENT) {
else if (stat_errno == ENOENT) {
expect_newenv = TRUE;
r = 0;
}
@ -623,17 +617,41 @@ validate_env(DB_ENV * env, BOOL * valid_newenv) {
assert(r);
}
// Test for rollback cachefile.
// Only checked when the caller needs one and no earlier check has failed.  The rollback
// cachefile must be present exactly when the persistent environment is present:
//   exists  + new env expected -> error (persistent environment is missing)
//   missing + existing env     -> error (rollback cachefile is missing)
//   missing + new env expected -> ok
// Any stat failure other than ENOENT is reported with the errno captured right after toku_stat.
if (r == 0 && need_rollback_cachefile) {
    path = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
    assert(path);
    r = toku_stat(path, &buf);
    stat_errno = errno;   // capture before toku_free() gets a chance to overwrite errno
    toku_free(path);
    if (r == 0) {
        if (expect_newenv)  // rollback cachefile exists, but persistent env is missing
            r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
    }
    else if (stat_errno == ENOENT) {
        if (!expect_newenv)  // rollback cachefile is missing but persistent env exists
            r = toku_ydb_do_error(env, ENOENT, "Rollback cachefile is missing\n");
        else
            r = 0;           // both rollback cachefile and persistent env are missing
    }
    else {
        // Report the saved stat error, not the live errno, which may have changed since the stat call.
        r = toku_ydb_do_error(env, stat_errno, "Unable to access rollback cachefile\n");
        assert(r);
    }
}
// Test for fileops directory
if (r == 0) {
path = toku_construct_full_name(2, env->i->dir, fileopsdirectory);
assert(path);
r = toku_stat(path, &buf);
stat_errno = errno;
toku_free(path);
if (r == 0) {
if (expect_newenv) // fileops directory exists, but persistent env is missing
r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n");
}
else if (errno == ENOENT) {
else if (stat_errno == ENOENT) {
if (!expect_newenv) // fileops directory is missing but persistent env exists
r = toku_ydb_do_error(env, ENOENT, "Fileops directory is missing\n");
else
@ -739,16 +757,16 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
env_setup_real_data_dir(env);
env_setup_real_log_dir(env);
r = validate_env(env, &newenv); // make sure that environment is either new or complete
BOOL need_rollback_cachefile = FALSE;
if (flags & (DB_INIT_TXN | DB_INIT_LOG)) {
need_rollback_cachefile = TRUE;
}
r = validate_env(env, &newenv, need_rollback_cachefile); // make sure that environment is either new or complete
if (r != 0) return r;
unused_flags &= ~DB_INIT_TXN & ~DB_INIT_LOG;
if (flags & DB_INIT_TXN) {
r = delete_rolltmp_files(env);
if (r != 0) return r;
}
// do recovery only if there exists a log and recovery is requested
// otherwise, a log is created when the logger is opened later
if (!newenv) {
@ -805,6 +823,8 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
assert (using_txns);
toku_logger_set_cachetable(env->i->logger, env->i->cachetable);
toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm);
r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, newenv);
assert(r==0);
}
DB_TXN *txn=NULL;
@ -894,7 +914,6 @@ static int toku_env_close(DB_ENV * env, u_int32_t flags) {
}
}
}
if (env->i->cachetable) {
toku_ydb_unlock(); // ydb lock must not be held when shutting down minicron
toku_cachetable_minicron_shutdown(env->i->cachetable);
@ -907,6 +926,17 @@ static int toku_env_close(DB_ENV * env, u_int32_t flags) {
toku_ydb_do_error(env, r, "Cannot close environment (error during checkpoint)\n");
goto panic_and_quit_early;
}
r = toku_logger_close_rollback(env->i->logger, FALSE);
if (r) {
toku_ydb_do_error(env, r, "Cannot close environment (error during closing rollback cachefile)\n");
goto panic_and_quit_early;
}
//Do a second checkpoint now that the rollback cachefile is closed.
r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL);
if (r) {
toku_ydb_do_error(env, r, "Cannot close environment (error during checkpoint)\n");
goto panic_and_quit_early;
}
r = toku_logger_shutdown(env->i->logger);
if (r) {
toku_ydb_do_error(env, r, "Cannot close environment (error during logger shutdown)\n");
@ -1954,7 +1984,7 @@ static u_int32_t locked_txn_id(DB_TXN *txn) {
static int toku_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
XMALLOC(*txn_stat);
return toku_logger_txn_rolltmp_raw_count(db_txn_struct_i(txn)->tokutxn, &(*txn_stat)->rolltmp_raw_count);
return toku_logger_txn_rollback_raw_count(db_txn_struct_i(txn)->tokutxn, &(*txn_stat)->rollback_raw_count);
}
static int locked_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) {
@ -5018,7 +5048,7 @@ int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
toku_lt_neg_infinity, toku_lt_neg_infinity,
toku_lt_infinity, toku_lt_infinity);
if (r==0) {
r = toku_brt_note_table_lock(db->i->brt, db_txn_struct_i(txn)->tokutxn); // tell the BRT layer that the table is locked (so that it can reduce the amount of rollback (rolltmp) data.
r = toku_brt_note_table_lock(db->i->brt, db_txn_struct_i(txn)->tokutxn, FALSE); // tell the BRT layer that the table is locked (so that it can reduce the amount of rollback data.
}
return r;

View file

@ -21,7 +21,8 @@ enum typ_tag { TYP_BRTNODE = 0xdead0001,
TYP_GPMA,
TYP_TOKULOGGER,
TYP_TOKUTXN,
TYP_LEAFENTRY
TYP_LEAFENTRY,
TYP_ROLLBACK_LOG_NODE
};
/* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */

View file

@ -78,7 +78,7 @@ static inline void toku_list_move(struct toku_list *newhead, struct toku_list *o
// Note: Need the extra level of parens in these macros so that
// toku_list_struct(h, foo, b)->zot
// will work right. Otherwise the type cast will try to include ->zot, and it will be all messed up.
#if defined(__GNUC__) && __GNUC__ >= 4
#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(__builtin_offsetof)
#define toku_list_struct(p, t, f) ((t*)((char*)(p) - __builtin_offsetof(t, f)))
#else
#define toku_list_struct(p, t, f) ((t*)((char*)(p) - ((char*)&((t*)0)->f)))

View file

@ -7,6 +7,7 @@ extern "C" {
#include "toku_os.h"
#include <sys/stat.h>
#include <stddef.h>
//These are functions that really exist in windows but are named
//something else.
@ -82,6 +83,9 @@ int mkstemp(char * ttemplate);
toku_off_t ftello(FILE *stream);
#define __builtin_offsetof(type, member) offsetof(type, member)
#if defined(__cplusplus)
};
#endif