From 1bf7a7a40313f247557ba7f47dd203595d33d257 Mon Sep 17 00:00:00 2001 From: Yoni Fogel Date: Tue, 16 Apr 2013 23:59:05 -0400 Subject: [PATCH] closes[t:2449] [t:2484] Merge #2449 changes to main. Rollback logs are now checkpointed. There are no rolltmp files. git-svn-id: file:///svn/toku/tokudb@19167 c7de825b-a66e-492c-adef-691d508d4ae1 --- buildheader/db.h_4_1 | 2 +- buildheader/db.h_4_3 | 2 +- buildheader/db.h_4_4 | 2 +- buildheader/db.h_4_5 | 2 +- buildheader/db.h_4_6 | 2 +- buildheader/make_db_h.c | 2 +- buildheader/tdb.h | 2 +- db-benchmark-test/Makefile | 6 +- db-benchmark-test/db-benchmark-test.c | 22 +- include/db.h | 2 +- newbrt/Makefile | 1 - newbrt/block_table.c | 13 + newbrt/block_table.h | 1 + newbrt/bread.c | 80 -- newbrt/bread.h | 30 - newbrt/brt-internal.h | 4 + newbrt/brt-serialize.c | 381 +++++- newbrt/brt.c | 128 +- newbrt/brt.h | 6 +- newbrt/brttypes.h | 5 + newbrt/cachetable.c | 112 +- newbrt/cachetable.h | 1 + newbrt/checkpoint.c | 3 +- newbrt/log-internal.h | 35 +- newbrt/log.h | 1 - newbrt/logformat.c | 149 ++- newbrt/logger.c | 113 +- newbrt/logger.h | 14 +- newbrt/memarena.c | 10 +- newbrt/memarena.h | 5 +- newbrt/rbuf.h | 16 +- newbrt/recover.c | 1075 ++++++++--------- newbrt/recover.h | 5 - newbrt/roll.c | 390 +++--- newbrt/rollback.c | 554 ++++++--- newbrt/rollback.h | 34 +- newbrt/tests/Makefile | 9 - newbrt/tests/bread-test.c | 80 -- newbrt/tests/cachetable-checkpoint-pending.c | 2 +- newbrt/tests/cachetable-checkpoint-test.c | 2 +- .../cachetable-prefetch-checkpoint-test.c | 2 +- newbrt/tests/recovery-bad-last-entry.c | 6 +- newbrt/tests/recovery-cbegin-cend-hello.c | 2 +- newbrt/tests/recovery-cbegin-cend.c | 2 +- newbrt/tests/recovery-cbegin.c | 2 +- newbrt/tests/recovery-cend-cbegin.c | 2 +- newbrt/tests/recovery-fopen-missing-file.c | 2 +- newbrt/tests/recovery-hello.c | 2 +- .../recovery-lsn-error-during-forward-scan.c | 2 +- newbrt/tests/test1305.c | 106 -- newbrt/tests/test_logcursor.c | 24 +- newbrt/txn.c | 58 +- newbrt/txn.h 
| 1 + newbrt/wbuf.h | 1 - release/examples/db-insert.c | 22 +- src/tests/Makefile | 1 + src/tests/bug1381.c | 8 +- src/tests/diskfull.c | 2 +- src/tests/stat64.c | 6 +- src/tests/test1324.c | 85 -- src/ydb.c | 70 +- toku_include/memory.h | 3 +- toku_include/toku_list.h | 2 +- windows/misc.h | 4 + 64 files changed, 2018 insertions(+), 1700 deletions(-) delete mode 100644 newbrt/bread.c delete mode 100644 newbrt/bread.h delete mode 100644 newbrt/tests/bread-test.c delete mode 100644 newbrt/tests/test1305.c delete mode 100644 src/tests/test1324.c diff --git a/buildheader/db.h_4_1 b/buildheader/db.h_4_1 index b0de7073dd3..e6fa21a459a 100644 --- a/buildheader/db.h_4_1 +++ b/buildheader/db.h_4_1 @@ -379,7 +379,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ diff --git a/buildheader/db.h_4_3 b/buildheader/db.h_4_3 index e520e860f1a..8faf2f2f27f 100644 --- a/buildheader/db.h_4_3 +++ b/buildheader/db.h_4_3 @@ -395,7 +395,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ diff --git a/buildheader/db.h_4_4 b/buildheader/db.h_4_4 index 694b0aa042c..2d0645e4a9d 100644 --- a/buildheader/db.h_4_4 +++ b/buildheader/db.h_4_4 @@ -403,7 +403,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + 
u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ diff --git a/buildheader/db.h_4_5 b/buildheader/db.h_4_5 index 90d8d6f0df8..36931c6017a 100644 --- a/buildheader/db.h_4_5 +++ b/buildheader/db.h_4_5 @@ -403,7 +403,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ diff --git a/buildheader/db.h_4_6 b/buildheader/db.h_4_6 index 9e8415f5144..be9276c7eb7 100644 --- a/buildheader/db.h_4_6 +++ b/buildheader/db.h_4_6 @@ -407,7 +407,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ diff --git a/buildheader/make_db_h.c b/buildheader/make_db_h.c index 03764fbe8a4..fba63fe1e54 100644 --- a/buildheader/make_db_h.c +++ b/buildheader/make_db_h.c @@ -585,7 +585,7 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__ printf("} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S;\n"); printf("typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*);\n"); - printf("struct txn_stat {\n u_int64_t rolltmp_raw_count;\n};\n"); + printf("struct txn_stat {\n u_int64_t rollback_raw_count;\n};\n"); const char *extra[] = { "int (*txn_stat)(DB_TXN *, struct txn_stat **)", "struct { void *next, *prev; } open_txns", diff --git a/buildheader/tdb.h b/buildheader/tdb.h index 89e0c74afd4..17ae7c6d3fc 100644 --- 
a/buildheader/tdb.h +++ b/buildheader/tdb.h @@ -354,7 +354,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; diff --git a/db-benchmark-test/Makefile b/db-benchmark-test/Makefile index d0d668134a1..4c2481c064c 100644 --- a/db-benchmark-test/Makefile +++ b/db-benchmark-test/Makefile @@ -64,7 +64,7 @@ build: build.tdb build.bdb build.bdb: $(TARGET_BDB) $(SCANSCAN_BDB) $(WINDOWS_BDB_LIB_NAME) build.tdb: $(TARGET_TDB) $(SCANSCAN_TDB) -check: check-default check-rowsize-dup check-rowsize check-xfast check-x check-no-rolltmp check-4G +check: check-default check-rowsize-dup check-rowsize check-xfast check-x check-no-rollback check-4G child.benchmark.dir SUPPORT_KEYSIZE=$$((3*1024)) # at least 3KiB SUPPORT_ROWSIZE=$$((80*1024)) # at least 80KiB @@ -96,8 +96,8 @@ check-xfast: $(TARGET_TDB) ./$(TARGET_TDB) $(VERBVERBOSE) --noserial -x --valsize 1000 --cachesize 8000000 --xcount 1000 --periter 20000 --env xfast.dir 1 $(SUMMARIZE_CMD) # A relatively fast test that detects #853 (don't log changes to a dictionary created in the same txn) -check-no-rolltmp: $(TARGET_TDB) - ./$(TARGET_TDB) $(VERBVERBOSE) --env no-rolltmp.dir --singlex --nolog --check_small_rolltmp $(SUMMARIZE_CMD) +check-no-rollback: $(TARGET_TDB) + ./$(TARGET_TDB) $(VERBVERBOSE) --env no-rollback.dir --singlex --nolog --check_small_rollback $(SUMMARIZE_CMD) # Check to make sure that if we make a file that's bigger than 4GB that we can read the file back out and get all the rows. 
ifeq ($(TOKU_SKIP_4G),1) diff --git a/db-benchmark-test/db-benchmark-test.c b/db-benchmark-test/db-benchmark-test.c index a697f6cf4a5..86e33b467c3 100644 --- a/db-benchmark-test/db-benchmark-test.c +++ b/db-benchmark-test/db-benchmark-test.c @@ -53,7 +53,7 @@ int singlex_child = 0; // Do a single transaction, but do all work with a child int singlex = 0; // Do a single transaction int singlex_create = 0; // Create the db using the single transaction (only valid if singlex) int insert1first = 0; // insert 1 before doing the rest -int check_small_rolltmp = 0; // verify that the rollback logs are small (only valid if singlex) +int check_small_rollback = 0; // verify that the rollback logs are small (only valid if singlex) int do_transactions = 0; int if_transactions_do_logging = DB_INIT_LOG; // set this to zero if we want no logging when transactions are used int do_abort = 0; @@ -294,14 +294,14 @@ static void benchmark_shutdown (void) { #endif if (do_transactions && singlex && !insert1first && (singlex_create || prelock)) { #if defined(TOKUDB) - //There should be a single 'truncate' in the rolltmp instead of many 'insert' entries. + //There should be a single 'truncate' in the rollback instead of many 'insert' entries. struct txn_stat *s; r = tid->txn_stat(tid, &s); assert(r==0); //TODO: #1125 Always do the test after performance testing is done. 
- if (singlex_child) fprintf(stderr, "SKIPPED 'small rolltmp' test for child txn\n"); + if (singlex_child) fprintf(stderr, "SKIPPED 'small rollback' test for child txn\n"); else - assert(s->rolltmp_raw_count < 100); // gross test, not worth investigating details + assert(s->rollback_raw_count < 100); // gross test, not worth investigating details os_free(s); //system("ls -l bench.tokudb"); #endif @@ -487,7 +487,7 @@ static int print_usage (const char *argv0) { fprintf(stderr, " --singlex-child (implies -x) Run the whole job as a single transaction, do all work a child of that transaction.\n"); fprintf(stderr, " --finish-child-first Commit/abort child before doing so to parent (no effect if no child).\n"); fprintf(stderr, " --singlex-create (implies --singlex) Create the file using the single transaction (Default is to use a different transaction to create.)\n"); - fprintf(stderr, " --check_small_rolltmp (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n"); + fprintf(stderr, " --check_small_rollback (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n"); fprintf(stderr, " --prelock Prelock the database.\n"); fprintf(stderr, " --prelockflag Prelock the database and send the DB_PRELOCKED_WRITE flag.\n"); fprintf(stderr, " --abort Abort the singlex after the transaction is over. 
(Requires --singlex.)\n"); @@ -589,8 +589,8 @@ int main (int argc, const char *const argv[]) { singlex = 1; } else if (strcmp(arg, "--insert1first") == 0) { insert1first = 1; - } else if (strcmp(arg, "--check_small_rolltmp") == 0) { - check_small_rolltmp = 1; + } else if (strcmp(arg, "--check_small_rollback") == 0) { + check_small_rollback = 1; } else if (strcmp(arg, "--xcount") == 0) { if (i+1 >= argc) return print_usage(argv[0]); items_per_transaction = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0); @@ -685,8 +685,8 @@ int main (int argc, const char *const argv[]) { fprintf(stderr, "--insert_multiple only works on the TokuDB (not BDB)\n"); return print_usage(argv[0]); } - if (check_small_rolltmp) { - fprintf(stderr, "--check_small_rolltmp only works on the TokuDB (not BDB)\n"); + if (check_small_rollback) { + fprintf(stderr, "--check_small_rollback only works on the TokuDB (not BDB)\n"); return print_usage(argv[0]); } #endif @@ -697,8 +697,8 @@ int main (int argc, const char *const argv[]) { put_flagss[i] = put_flags; } } - if (check_small_rolltmp && !singlex) { - fprintf(stderr, "--check_small_rolltmp requires --singlex\n"); + if (check_small_rollback && !singlex) { + fprintf(stderr, "--check_small_rollback requires --singlex\n"); return print_usage(argv[0]); } if (!do_transactions && insert_multiple) { diff --git a/include/db.h b/include/db.h index 89e0c74afd4..17ae7c6d3fc 100644 --- a/include/db.h +++ b/include/db.h @@ -354,7 +354,7 @@ typedef struct __toku_txn_progress { } *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); struct txn_stat { - u_int64_t rolltmp_raw_count; + u_int64_t rollback_raw_count; }; struct __toku_db_txn { DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; diff --git a/newbrt/Makefile b/newbrt/Makefile index c8f7043b29d..b1035735ca8 100644 --- a/newbrt/Makefile +++ b/newbrt/Makefile @@ -41,7 +41,6 @@ local: bins libs $(TEST_NEWBRT); BRT_SOURCES = \ 
block_allocator \ block_table \ - bread \ brt-serialize \ brt-verify \ brt \ diff --git a/newbrt/block_table.c b/newbrt/block_table.c index 649f3f59c2c..7b1f855d56a 100644 --- a/newbrt/block_table.c +++ b/newbrt/block_table.c @@ -596,6 +596,19 @@ toku_block_verify_no_free_blocknums(BLOCK_TABLE bt) { assert(bt->current.blocknum_freelist_head.b == freelist_null.b); } +//Verify there are no data blocks except root. +void +toku_block_verify_no_data_blocks_except_root_unlocked(BLOCK_TABLE bt, BLOCKNUM root) { + //Relies on checkpoint having used optimize_translation + assert(root.b >= RESERVED_BLOCKNUMS); + assert(bt->current.smallest_never_used_blocknum.b == root.b + 1); + int64_t i; + for (i=RESERVED_BLOCKNUMS; i < root.b; i++) { + BLOCKNUM b = make_blocknum(i); + assert(bt->current.block_translation[b.b].size == size_is_free); + } +} + //Verify a blocknum is currently allocated. void toku_verify_blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b) { diff --git a/newbrt/block_table.h b/newbrt/block_table.h index c294fbaf555..e83500e9b19 100644 --- a/newbrt/block_table.h +++ b/newbrt/block_table.h @@ -35,6 +35,7 @@ void toku_allocate_blocknum(BLOCK_TABLE bt, BLOCKNUM *res, struct brt_header * h void toku_allocate_blocknum_unlocked(BLOCK_TABLE bt, BLOCKNUM *res, struct brt_header * h); void toku_free_blocknum(BLOCK_TABLE bt, BLOCKNUM *b, struct brt_header * h); void toku_verify_blocknum_allocated(BLOCK_TABLE bt, BLOCKNUM b); +void toku_block_verify_no_data_blocks_except_root_unlocked(BLOCK_TABLE bt, BLOCKNUM root); void toku_block_verify_no_free_blocknums(BLOCK_TABLE bt); void toku_realloc_descriptor_on_disk(BLOCK_TABLE bt, DISKOFF size, DISKOFF *offset, struct brt_header * h); void toku_get_descriptor_offset_size(BLOCK_TABLE bt, DISKOFF *offset, DISKOFF *size); diff --git a/newbrt/bread.c b/newbrt/bread.c deleted file mode 100644 index 5663008b312..00000000000 --- a/newbrt/bread.c +++ /dev/null @@ -1,80 +0,0 @@ -/* Buffered read. 
*/ - -#ident "$Id$" -#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved." -#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." - -#include "includes.h" - -struct bread { - int64_t fileoff; // The byte before this offset is the next byte we will read (since we are reading backward) - int fd; - int bufoff; // The current offset in the buf. The next byte we will read is buf[bufoff-1] (assuming that bufoff>0). - char *buf; // A buffer with at least bufoff bytes in it. -}; - -BREAD create_bread_from_fd_initialize_at(int fd) { - BREAD XMALLOC(result); - int r = toku_os_get_file_size(fd, &result->fileoff); - assert(r==0); - result->fd=fd; - result->bufoff=0; - result->buf = 0; - return result; -} - -int close_bread_without_closing_fd(BREAD br) { - toku_free(br->buf); - toku_free(br); - return 0; -} - - -ssize_t bread_backwards(BREAD br, void *vbuf, size_t nbytes) { - char *buf=vbuf; - ssize_t result=0; - const int i4 = sizeof(u_int32_t); - while (nbytes > 0) { - // read whatever we can out of the buffer. - if (br->bufoff>0) { - size_t to_copy = ((size_t)br->bufoff >= nbytes) ? nbytes : (size_t)br->bufoff; - memcpy(buf+nbytes-to_copy, &br->buf[br->bufoff-to_copy], to_copy); - nbytes -= to_copy; - result += to_copy; - br->bufoff -= to_copy; - } - if (nbytes>0) { - assert(br->bufoff==0); - u_int32_t compressed_length_n, uncompressed_length_n; - assert(br->fileoff>=i4); // there better be the three lengths plus the compressed data. 
- { ssize_t r = pread(br->fd, &compressed_length_n, i4, br->fileoff- i4); assert(r==i4); } - u_int32_t compressed_length = toku_dtoh32(compressed_length_n); - assert(br->fileoff >= compressed_length + 3*i4); - { ssize_t r = pread(br->fd, &uncompressed_length_n, i4, br->fileoff-2*i4); assert(r==i4); } - u_int32_t uncompressed_length = toku_dtoh32(uncompressed_length_n); - char *XMALLOC_N(compressed_length, zbuf); - { - ssize_t r = pread(br->fd, zbuf, compressed_length, br->fileoff- compressed_length -2*i4); - assert(r==(ssize_t)compressed_length); - } - { - u_int32_t compressed_length_n_again; - ssize_t r = pread(br->fd, &compressed_length_n_again, i4, br->fileoff-compressed_length-3*i4); assert(r==i4); - assert(compressed_length_n_again == compressed_length_n); - } - uLongf destlen = uncompressed_length; - XREALLOC_N(uncompressed_length, br->buf); - uncompress((Bytef*)br->buf, &destlen, (Bytef*)zbuf, compressed_length); - assert(destlen==uncompressed_length); - toku_free(zbuf); - - br->bufoff = uncompressed_length; - br->fileoff -= (compressed_length + 3*i4); - } - } - return result; -} - -int bread_has_more(BREAD br) { - return (br->fileoff>0) || (br->bufoff>0); -} diff --git a/newbrt/bread.h b/newbrt/bread.h deleted file mode 100644 index 01fc52d54c2..00000000000 --- a/newbrt/bread.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef BREAD_H -#define BREAD_H -#ident "$Id$" -#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved." -#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." - -// A BREAD reads a file backwards using buffered I/O. BREAD stands for Buffered Read or Backwards Read. -// Conceivably, we could read forward too. 
-// The buffered I/O is buffered using a large buffer (e.g., something like a megabyte). -// Furthermore, data is compressed into blocks. Each block is a 4-byte compressed length (in network order), followed by compressed data, followed by a 4-byte uncompressed-length (in network order), followed by a 4-byte compressed length -// The compressed-length appears twice so that the file can be read backward or forward. -// If not for the large-buffer requirement, as well as compression, as well as reading backward, we could have used a FILE. - -#include -typedef struct bread *BREAD; - -BREAD create_bread_from_fd_initialize_at(int fd); -// Effect: Given a file descriptor, fd, create a BREAD. -// Requires: The fd must be an open fd. - -int close_bread_without_closing_fd(BREAD); -// Effect: Close the BREAD, but don't close the underlying fd. - -ssize_t bread_backwards(BREAD, void *buf, size_t nbytes); -// Read nbytes into buf, reading backwards. - -int bread_has_more(BREAD); -// Is there more to read? 
- -#endif diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index cf5b289bbf1..af391218669 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -217,6 +217,10 @@ int toku_serialize_brtnode_to_memory (BRTNODE node, int n_workitems, int n_threa /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write); int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint); +int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log, + struct brt_header *h, int n_workitems, int n_threads, + BOOL for_checkpoint); +int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, TOKUTXN txn, struct brt_header *h); int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brt_header *h); unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */ int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); diff --git a/newbrt/brt-serialize.c b/newbrt/brt-serialize.c index 2206f8f2335..0fc4b78c8a5 100644 --- a/newbrt/brt-serialize.c +++ b/newbrt/brt-serialize.c @@ -168,7 +168,7 @@ toku_full_pwrite_extend (int fd, const void *buf, size_t count, toku_off_t offse // Overhead calculated in same order fields are written to wbuf enum { - node_header_overhead = (8+ // magic "tokunode" or "tokuleaf" + node_header_overhead = (8+ // magic "tokunode" or "tokuleaf" or "tokuroll" 4+ // layout_version 4), // layout_version_original @@ -430,48 +430,27 @@ serialize_node(BRTNODE node, char *buf, size_t calculated_size, int n_sub_blocks assert(calculated_size==wb.ndone); } -int -toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_threads), /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write) { - // get the size of the serialized node - unsigned int calculated_size = 
toku_serialize_brtnode_size(node); - - // choose sub block parameters - int n_sub_blocks = 0, sub_block_size = 0; - size_t data_size = calculated_size - node_header_overhead; - choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &n_sub_blocks); - assert(0 < n_sub_blocks && n_sub_blocks <= max_sub_blocks); - assert(sub_block_size > 0); - - // set the initial sub block size for all of the sub blocks - struct sub_block sub_block[n_sub_blocks]; - for (int i = 0; i < n_sub_blocks; i++) - sub_block_init(&sub_block[i]); - set_all_sub_block_sizes(data_size, sub_block_size, n_sub_blocks, sub_block); - - // alloocate space for the serialized node - char *MALLOC_N(calculated_size, buf); - //toku_verify_counts(node); - //assert(size>0); - //printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]); - - // serialize the node into buf - serialize_node(node, buf, calculated_size, n_sub_blocks, sub_block); - - // allocate space for the compressed buf +static void +serialize_uncompressed_block_to_memory(char * uncompressed_buf, + int n_sub_blocks, + struct sub_block sub_block[n_sub_blocks], + /*out*/ size_t *n_bytes_to_write, + /*out*/ char **bytes_to_write) { + // allocate space for the compressed uncompressed_buf size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block); size_t sub_block_header_len = sub_block_header_size(n_sub_blocks); size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum char *MALLOC_N(header_len + compressed_len, compressed_buf); // copy the header - memcpy(compressed_buf, buf, node_header_overhead); + memcpy(compressed_buf, uncompressed_buf, node_header_overhead); if (0) printf("First 4 bytes before compressing data are %02x%02x%02x%02x\n", - buf[node_header_overhead], buf[node_header_overhead+1], - buf[node_header_overhead+2], buf[node_header_overhead+3]); + uncompressed_buf[node_header_overhead], 
uncompressed_buf[node_header_overhead+1], + uncompressed_buf[node_header_overhead+2], uncompressed_buf[node_header_overhead+3]); // compress all of the sub blocks - char *uncompressed_ptr = buf + node_header_overhead; + char *uncompressed_ptr = uncompressed_buf + node_header_overhead; char *compressed_ptr = compressed_buf + header_len; compressed_len = compress_all_sub_blocks(n_sub_blocks, sub_block, uncompressed_ptr, compressed_ptr, num_cores); @@ -494,9 +473,40 @@ toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_th *n_bytes_to_write = header_len + compressed_len; *bytes_to_write = compressed_buf; +} +int +toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_threads), /*out*/ size_t *n_bytes_to_write, /*out*/ char **bytes_to_write) { + + // get the size of the serialized node + size_t calculated_size = toku_serialize_brtnode_size(node); + + // choose sub block parameters + int n_sub_blocks = 0, sub_block_size = 0; + size_t data_size = calculated_size - node_header_overhead; + choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &n_sub_blocks); + assert(0 < n_sub_blocks && n_sub_blocks <= max_sub_blocks); + assert(sub_block_size > 0); + + // set the initial sub block size for all of the sub blocks + struct sub_block sub_block[n_sub_blocks]; + for (int i = 0; i < n_sub_blocks; i++) + sub_block_init(&sub_block[i]); + set_all_sub_block_sizes(data_size, sub_block_size, n_sub_blocks, sub_block); + + // allocate space for the serialized node + char *MALLOC_N(calculated_size, buf); + //toku_verify_counts(node); + //assert(size>0); + //printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]); + + // serialize the node into buf + serialize_node(node, buf, calculated_size, n_sub_blocks, sub_block); + + //Compress and malloc buffer to write + serialize_uncompressed_block_to_memory(buf, n_sub_blocks, sub_block, + n_bytes_to_write, bytes_to_write); 
toku_free(buf); - return 0; } @@ -522,9 +532,8 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h //printf("%s:%d bt=%p\n", __FILE__, __LINE__, h->block_translation); DISKOFF offset; - //h will be dirtied toku_blocknum_realloc_on_disk(h->blocktable, blocknum, n_to_write, &offset, - h, for_checkpoint); + h, for_checkpoint); //dirties h lock_for_pwrite(); toku_full_pwrite_extend(fd, compressed_buf, n_to_write, offset); unlock_for_pwrite(); @@ -852,7 +861,7 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b } static int -decompress_brtnode_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { +decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { toku_trace("decompress"); int r; @@ -914,14 +923,14 @@ decompress_brtnode_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb } static int -decompress_brtnode_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { +decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { int r; switch (version) { case BRT_LAYOUT_VERSION_10: r = decompress_brtnode_from_raw_block_into_rbuf_10(raw_block, rb, blocknum); break; case BRT_LAYOUT_VERSION: - r = decompress_brtnode_from_raw_block_into_rbuf(raw_block, rb, blocknum); + r = decompress_from_raw_block_into_rbuf(raw_block, rb, blocknum); break; default: assert(FALSE); @@ -959,19 +968,16 @@ deserialize_brtnode_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u return r; } - -// Read brt node from file into struct. Perform version upgrade if necessary. 
-int -toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h) { - toku_trace("deserial start"); - +static int +read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, + struct brt_header *h, + struct rbuf *rb, + /* out */ int *layout_version_p) { int r; - struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; - if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b); if (h->panic) return h->panic; - toku_trace("deserial start"); + toku_trace("deserial start nopanic"); // get the file offset and block size for the block DISKOFF offset, size; @@ -986,7 +992,9 @@ toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BR int layout_version; { u_int8_t *magic = raw_block + uncompressed_magic_offset; - if (memcmp(magic, "tokuleaf", 8)!=0 && memcmp(magic, "tokunode", 8)!=0) { + if (memcmp(magic, "tokuleaf", 8)!=0 && + memcmp(magic, "tokunode", 8)!=0 && + memcmp(magic, "tokuroll", 8)!=0) { r = toku_db_badformat(); goto cleanup; } @@ -1006,16 +1014,47 @@ toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BR u_int32_t stored_xsum = toku_dtoh32(*(u_int32_t *)(raw_block + header_length)); assert(xsum == stored_xsum); - r = decompress_brtnode_from_raw_block_into_rbuf_versioned(layout_version, raw_block, &rb, blocknum); + r = decompress_from_raw_block_into_rbuf_versioned(layout_version, raw_block, rb, blocknum); if (r!=0) goto cleanup; + *layout_version_p = layout_version; +cleanup: + if (r!=0) { + if (rb->buf) toku_free(rb->buf); + rb->buf = NULL; + } + if (raw_block) toku_free(raw_block); + return r; +} + +// Read brt node from file into struct. Perform version upgrade if necessary. 
+int +toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, + BRTNODE *brtnode, struct brt_header *h) { + toku_trace("deserial start"); + + int r; + struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; + + int layout_version; + r = read_and_decompress_block_from_fd_into_rbuf(fd, blocknum, h, &rb, &layout_version); + if (r!=0) goto cleanup; + + { + u_int8_t *magic = rb.buf + uncompressed_magic_offset; + if (memcmp(magic, "tokuleaf", 8)!=0 && + memcmp(magic, "tokunode", 8)!=0) { + r = toku_db_badformat(); + goto cleanup; + } + } + r = deserialize_brtnode_from_rbuf_versioned(layout_version, blocknum, fullhash, brtnode, h, &rb); toku_trace("deserial done"); cleanup: if (rb.buf) toku_free(rb.buf); - if (raw_block) toku_free(raw_block); return r; } @@ -1603,5 +1642,245 @@ toku_db_badformat(void) { return DB_BADFORMAT; } +static size_t +serialize_rollback_log_size(ROLLBACK_LOG_NODE log) { + size_t size = node_header_overhead //8 "tokuroll", 4 version, 4 version_original + +8 //TXNID + +8 //sequence + +8 //thislogname + +8 //older (blocknum) + +8 //resident_bytecount + +8 //memarena_size_needed_to_load + +log->rollentry_resident_bytecount; + return size; +} + +static void +serialize_rollback_log_node_to_buf(ROLLBACK_LOG_NODE log, char *buf, size_t calculated_size, int UU(n_sub_blocks), struct sub_block UU(sub_block[])) { + struct wbuf wb; + wbuf_init(&wb, buf, calculated_size); + { //Serialize rollback log to local wbuf + wbuf_nocrc_literal_bytes(&wb, "tokuroll", 8); + assert(log->layout_version == BRT_LAYOUT_VERSION); + wbuf_nocrc_int(&wb, log->layout_version); + wbuf_nocrc_int(&wb, log->layout_version_original); + wbuf_nocrc_TXNID(&wb, log->txnid); + wbuf_nocrc_ulonglong(&wb, log->sequence); + wbuf_nocrc_BLOCKNUM(&wb, log->thislogname); + wbuf_nocrc_BLOCKNUM(&wb, log->older); + wbuf_nocrc_ulonglong(&wb, log->rollentry_resident_bytecount); + //Write down memarena size needed to restore + wbuf_nocrc_ulonglong(&wb, 
memarena_total_size_in_use(log->rollentry_arena)); + + { + //Store rollback logs + struct roll_entry *item; + size_t done_before = wb.ndone; + for (item = log->newest_logentry; item; item = item->prev) { + toku_logger_rollback_wbuf_nocrc_write(&wb, item); + } + assert(done_before + log->rollentry_resident_bytecount == wb.ndone); + } + } + assert(wb.ndone == wb.size); + assert(calculated_size==wb.ndone); +} + +static int +toku_serialize_rollback_log_to_memory (ROLLBACK_LOG_NODE log, + int UU(n_workitems), int UU(n_threads), + /*out*/ size_t *n_bytes_to_write, + /*out*/ char **bytes_to_write) { + // get the size of the serialized node + size_t calculated_size = serialize_rollback_log_size(log); + + // choose sub block parameters + int n_sub_blocks = 0, sub_block_size = 0; + size_t data_size = calculated_size - node_header_overhead; + choose_sub_block_size(data_size, max_sub_blocks, &sub_block_size, &n_sub_blocks); + assert(0 < n_sub_blocks && n_sub_blocks <= max_sub_blocks); + assert(sub_block_size > 0); + + // set the initial sub block size for all of the sub blocks + struct sub_block sub_block[n_sub_blocks]; + for (int i = 0; i < n_sub_blocks; i++) + sub_block_init(&sub_block[i]); + set_all_sub_block_sizes(data_size, sub_block_size, n_sub_blocks, sub_block); + + // allocate space for the serialized node + char *XMALLOC_N(calculated_size, buf); + // serialize the node into buf + serialize_rollback_log_node_to_buf(log, buf, calculated_size, n_sub_blocks, sub_block); + + //Compress and malloc buffer to write + serialize_uncompressed_block_to_memory(buf, n_sub_blocks, sub_block, + n_bytes_to_write, bytes_to_write); + toku_free(buf); + return 0; +} + +int +toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log, + struct brt_header *h, int n_workitems, int n_threads, + BOOL for_checkpoint) { + size_t n_to_write; + char *compressed_buf; + { + int r = toku_serialize_rollback_log_to_memory(log, n_workitems, n_threads, &n_to_write, &compressed_buf); 
+ if (r!=0) return r; + } + + { + assert(blocknum.b>=0); + DISKOFF offset; + toku_blocknum_realloc_on_disk(h->blocktable, blocknum, n_to_write, &offset, + h, for_checkpoint); //dirties h + lock_for_pwrite(); + toku_full_pwrite_extend(fd, compressed_buf, n_to_write, offset); + unlock_for_pwrite(); + } + toku_free(compressed_buf); + log->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. + return 0; +} + +static int +deserialize_rollback_log_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *log_p, + TOKUTXN txn, struct brt_header *h, struct rbuf *rb) { + TAGMALLOC(ROLLBACK_LOG_NODE, result); + int r; + if (result==NULL) { + r=errno; + if (0) { died0: toku_free(result); } + return r; + } + + //printf("Deserializing %lld datasize=%d\n", off, datasize); + bytevec magic; + rbuf_literal_bytes(rb, &magic, 8); + assert(!memcmp(magic, "tokuroll", 8)); + + result->layout_version = rbuf_int(rb); + assert(result->layout_version == BRT_LAYOUT_VERSION); + result->layout_version_original = rbuf_int(rb); + result->layout_version_read_from_disk = result->layout_version; + result->dirty = FALSE; + //TODO: Maybe add descriptor (or just descriptor version) here eventually? + //TODO: This is hard.. everything is shared in a single dictionary. 
+ rbuf_TXNID(rb, &result->txnid); + result->sequence = rbuf_ulonglong(rb); + if (result->txnid == txn->txnid64 && result->sequence > txn->num_rollback_nodes) { + r = toku_db_badformat(); + goto died0; + } + result->thislogname = rbuf_blocknum(rb); + if (result->thislogname.b != blocknum.b) { + r = toku_db_badformat(); + goto died0; + } + result->thishash = toku_cachetable_hash(h->cf, result->thislogname); + if (result->thishash != fullhash) { + r = toku_db_badformat(); + goto died0; + } + result->older = rbuf_blocknum(rb); + result->older_hash = toku_cachetable_hash(h->cf, result->older); + result->rollentry_resident_bytecount = rbuf_ulonglong(rb); + + size_t arena_initial_size = rbuf_ulonglong(rb); + result->rollentry_arena = memarena_create_presized(arena_initial_size); + if (0) { died1: memarena_close(&result->rollentry_arena); goto died0; } + + //Load rollback entries + assert(rb->size > 4); + //Start with empty list + result->oldest_logentry = result->newest_logentry = NULL; + while (rb->ndone < rb->size) { + struct roll_entry *item; + uint32_t rollback_fsize = rbuf_int(rb); //Already read 4. 
Rest is 4 smaller + bytevec item_vec; + rbuf_literal_bytes(rb, &item_vec, rollback_fsize-4); + unsigned char* item_buf = (unsigned char*)item_vec; + r = toku_parse_rollback(item_buf, rollback_fsize-4, &item, result->rollentry_arena); + if (r!=0) { + r = toku_db_badformat(); + goto died1; + } + //Add to head of list + if (result->oldest_logentry) { + result->oldest_logentry->prev = item; + result->oldest_logentry = item; + item->prev = NULL; + } + else { + result->oldest_logentry = result->newest_logentry = item; + item->prev = NULL; + } + } + + toku_free(rb->buf); + rb->buf = NULL; + *log_p = result; + return 0; +} + +static int +deserialize_rollback_log_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash, + ROLLBACK_LOG_NODE *log, + TOKUTXN txn, struct brt_header *h, struct rbuf *rb) { + int r = 0; + ROLLBACK_LOG_NODE rollback_log_node = NULL; + + int upgrade = 0; + switch (version) { + case BRT_LAYOUT_VERSION: + if (!upgrade) + r = deserialize_rollback_log_from_rbuf(blocknum, fullhash, &rollback_log_node, txn, h, rb); + if (r==0) { + assert(rollback_log_node); + *log = rollback_log_node; + } + if (upgrade && r == 0) (*log)->dirty = 1; + break; // this is the only break + default: + assert(FALSE); + } + return r; +} + +// Read rollback log node from file into struct. Perform version upgrade if necessary. 
+int +toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, + ROLLBACK_LOG_NODE *logp, TOKUTXN txn, struct brt_header *h) { + toku_trace("deserial start"); + + int r; + struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; + + int layout_version; + r = read_and_decompress_block_from_fd_into_rbuf(fd, blocknum, h, &rb, &layout_version); + if (r!=0) goto cleanup; + + { + u_int8_t *magic = rb.buf + uncompressed_magic_offset; + if (memcmp(magic, "tokuroll", 8)!=0) { + r = toku_db_badformat(); + goto cleanup; + } + } + + r = deserialize_rollback_log_from_rbuf_versioned(layout_version, blocknum, fullhash, logp, txn, h, &rb); + + toku_trace("deserial done"); + +cleanup: + if (rb.buf) toku_free(rb.buf); + return r; +} + + + + // NOTE: Backwards compatibility functions are in the included .c file(s): #include "backwards_10.c" + diff --git a/newbrt/brt.c b/newbrt/brt.c index 1e80483fb2a..5a4317b11bd 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -2650,13 +2650,6 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) { return toku_brt_maybe_insert(brt, key, val, txn, FALSE, ZERO_LSN, TRUE, BRT_INSERT); } -static void -txn_note_doing_work(TOKUTXN txn) { - if (txn) - txn->has_done_work = TRUE; -} - - int toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_iname, int do_fsync, int do_log, LSN *load_lsn) { int r = 0; @@ -2665,12 +2658,9 @@ toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_ina //before the (old) file is actually unlinked TOKULOGGER logger = toku_txn_logger(txn); - BYTESTRING old_iname_bs = {.len=strlen(old_iname), - .data=toku_memdup_in_rollback(txn, old_iname, strlen(old_iname))}; - BYTESTRING new_iname_bs = {.len=strlen(new_iname), - .data=toku_memdup_in_rollback(txn, new_iname, strlen(new_iname))}; - - r = toku_logger_save_rollback_load(txn, old_iname_bs, new_iname_bs); + BYTESTRING old_iname_bs = {.len=strlen(old_iname), .data=(char*)old_iname}; + BYTESTRING 
new_iname_bs = {.len=strlen(new_iname), .data=(char*)new_iname}; + r = toku_logger_save_rollback_load(txn, &old_iname_bs, &new_iname_bs); if (r==0 && do_log && logger) { TXNID xid = toku_txn_get_txnid(txn); r = toku_log_load(logger, load_lsn, do_fsync, xid, old_iname_bs, new_iname_bs); @@ -2715,15 +2705,14 @@ int toku_brt_maybe_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn, BOOL oplsn_ int r = 0; XIDS message_xids; TXNID xid = toku_txn_get_txnid(txn); - txn_note_doing_work(txn); if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) { - BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)}; + BYTESTRING keybs = {key->size, key->data}; int need_data = (brt->flags&TOKU_DB_DUPSORT)!=0; // dupsorts don't need the data part if (need_data) { - BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)}; - r = toku_logger_save_rollback_cmdinsertboth(txn, toku_cachefile_filenum(brt->cf), keybs, databs); + BYTESTRING databs = {val->size, val->data}; + r = toku_logger_save_rollback_cmdinsertboth(txn, toku_cachefile_filenum(brt->cf), &keybs, &databs); } else { - r = toku_logger_save_rollback_cmdinsert (txn, toku_cachefile_filenum(brt->cf), keybs); + r = toku_logger_save_rollback_cmdinsert (txn, toku_cachefile_filenum(brt->cf), &keybs); } if (r!=0) return r; r = toku_txn_note_brt(txn, brt); @@ -2788,10 +2777,9 @@ int toku_brt_maybe_delete(BRT brt, DBT *key, TOKUTXN txn, BOOL oplsn_valid, LSN int r; XIDS message_xids; TXNID xid = toku_txn_get_txnid(txn); - txn_note_doing_work(txn); if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) { - BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)}; - r = toku_logger_save_rollback_cmddelete(txn, toku_cachefile_filenum(brt->cf), keybs); + BYTESTRING keybs = {key->size, key->data}; + r = toku_logger_save_rollback_cmddelete(txn, toku_cachefile_filenum(brt->cf), &keybs); if (r!=0) return r; r = toku_txn_note_brt(txn, brt); if 
(r!=0) return r; @@ -2975,6 +2963,20 @@ brtheader_log_fassociate_during_checkpoint (CACHEFILE cf, void *header_v) { return r; } +static int +brtheader_log_suppress_rollback_during_checkpoint (CACHEFILE cf, void *header_v) { + int r = 0; + struct brt_header *h = header_v; + TXNID xid = h->txnid_that_created_or_locked_when_empty; + if (xid != TXNID_NONE) { + //Only log if useful. + TOKULOGGER logger = toku_cachefile_logger(cf); + FILENUM filenum = toku_cachefile_filenum (cf); + r = toku_log_suppress_rollback(logger, NULL, 0, filenum, xid); + } + return r; +} + static int brtheader_note_pin_by_checkpoint (CACHEFILE cachefile, void *header_v); static int brtheader_note_unpin_by_checkpoint (CACHEFILE cachefile, void *header_v); @@ -2997,6 +2999,7 @@ brt_init_header_partial (BRT t) { toku_cachefile_set_userdata(t->cf, t->h, brtheader_log_fassociate_during_checkpoint, + brtheader_log_suppress_rollback_during_checkpoint, toku_brtheader_close, toku_brtheader_checkpoint, toku_brtheader_begin_checkpoint, @@ -3074,6 +3077,7 @@ int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header toku_cachefile_set_userdata(cf, (void*)h, brtheader_log_fassociate_during_checkpoint, + brtheader_log_suppress_rollback_during_checkpoint, toku_brtheader_close, toku_brtheader_checkpoint, toku_brtheader_begin_checkpoint, @@ -3129,7 +3133,7 @@ verify_builtin_comparisons_consistent(BRT t, u_int32_t flags) { // This is the actual open, used for various purposes, such as normal use, recovery, and redirect. 
// fname_in_env is the iname, relative to the env_dir (data_dir is already in iname as prefix) static int -brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, int recovery_force_fcreate, FILENUM use_filenum, DICTIONARY_ID use_dictionary_id) { +brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, FILENUM use_filenum, DICTIONARY_ID use_dictionary_id) { int r; BOOL txn_created = FALSE; @@ -3147,11 +3151,11 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET assert(is_create || !only_create); t->db = db; BOOL log_fopen = FALSE; // set true if we're opening a pre-existing file + BOOL did_create = FALSE; + FILENUM reserved_filenum = use_filenum; { int fd = -1; - BOOL did_create = FALSE; r = brt_open_file(fname_in_cwd, &fd); - FILENUM reserved_filenum = use_filenum; int use_reserved_filenum = reserved_filenum.fileid != FILENUM_NONE.fileid; if (r==ENOENT && is_create) { toku_cachetable_reserve_filenum(cachetable, &reserved_filenum, use_reserved_filenum, reserved_filenum); @@ -3164,6 +3168,12 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET if (use_reserved_filenum) assert(reserved_filenum.fileid == use_filenum.fileid); did_create = TRUE; mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO; + if (txn) { + BYTESTRING bs = { .len=strlen(fname_in_env), .data = (char*)fname_in_env }; + r = toku_logger_save_rollback_fcreate(txn, reserved_filenum, &bs); // bs is a copy of the fname relative to the environment + if (r != 0) goto died1; + } + txn_created = (BOOL)(txn!=NULL); r = toku_logger_log_fcreate(txn, fname_in_env, reserved_filenum, mode, t->flags, &(t->temp_descriptor)); if (r!=0) goto died1; r = brt_create_file(t, fname_in_cwd, &fd); @@ -3176,14 +3186,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET fname_in_env, use_reserved_filenum||did_create, 
reserved_filenum, did_create); if (r != 0) goto died1; - if (did_create || recovery_force_fcreate) { - if (txn) { - BYTESTRING bs = { .len=strlen(fname_in_env), .data = toku_strdup_in_rollback(txn, fname_in_env) }; - r = toku_logger_save_rollback_fcreate(txn, toku_cachefile_filenum(t->cf), bs); // bs is a copy of the fname relative to the environment - if (r != 0) goto died_after_open; - } - txn_created = (BOOL)(txn!=NULL); - } else + if (!did_create) log_fopen = TRUE; //Log of fopen must be delayed till flags are available } if (r!=0) { @@ -3294,7 +3297,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET if (t->db) t->db->descriptor = &t->h->descriptor.dbt; if (txn_created) { assert(txn); - assert(t->h->txnid_that_created_or_locked_when_empty == 0); // Uses 0 for no transaction. + assert(t->h->txnid_that_created_or_locked_when_empty == TXNID_NONE); t->h->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(txn); r = toku_txn_note_brt(txn, t); assert(r==0); @@ -3312,11 +3315,11 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET // Open a brt for the purpose of recovery, which requires that the brt be open to a pre-determined FILENUM. (dict_id is assigned by the brt_open() function.) 
int -toku_brt_open_recovery(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, int recovery_force_fcreate, FILENUM use_filenum) { +toku_brt_open_recovery(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db, FILENUM use_filenum) { int r; assert(use_filenum.fileid != FILENUM_NONE.fileid); r = brt_open(t, fname_in_env, is_create, only_create, cachetable, - txn, db, recovery_force_fcreate, use_filenum, DICTIONARY_ID_NONE); + txn, db, use_filenum, DICTIONARY_ID_NONE); return r; } @@ -3324,7 +3327,7 @@ toku_brt_open_recovery(BRT t, const char *fname_in_env, int is_create, int only_ int toku_brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHETABLE cachetable, TOKUTXN txn, DB *db) { int r; - r = brt_open(t, fname_in_env, is_create, only_create, cachetable, txn, db, FALSE, FILENUM_NONE, DICTIONARY_ID_NONE); + r = brt_open(t, fname_in_env, is_create, only_create, cachetable, txn, db, FILENUM_NONE, DICTIONARY_ID_NONE); return r; } @@ -3359,7 +3362,7 @@ brt_open_for_redirect(BRT *new_brtp, const char *fname_in_env, TOKUTXN txn, BRT assert(r==0); } CACHETABLE ct = toku_cachefile_get_cachetable(old_brt->cf); - r = brt_open(t, fname_in_env, 0, 0, ct, txn, old_brt->db, FALSE, FILENUM_NONE, old_h->dict_id); + r = brt_open(t, fname_in_env, 0, 0, ct, txn, old_brt->db, FILENUM_NONE, old_h->dict_id); assert(r==0); if (old_h->descriptor.version==0) { assert(t->h->descriptor.version == 0); @@ -3400,7 +3403,7 @@ brt_redirect_db (BRT brt_to, BRT brt_from) { } static int -redirect_brt_close_delayed(DB *db, u_int32_t UU(flags)) { +fake_db_brt_close_delayed(DB *db, u_int32_t UU(flags)) { BRT brt_to_close = db->api_internal; char *error_string = NULL; int r = toku_close_brt(brt_to_close, &error_string); @@ -3435,7 +3438,7 @@ toku_brt_header_close_redirected_brts(struct brt_header * h) { assert(which == num_brts); for (which = 0; which < num_brts; 
which++) { int r; - r = toku_brt_db_delay_closed(brts[which], dbs[which], redirect_brt_close_delayed, 0); + r = toku_brt_db_delay_closed(brts[which], dbs[which], fake_db_brt_close_delayed, 0); assert(r==0); } return 0; @@ -3590,7 +3593,6 @@ toku_dictionary_redirect (const char *dst_fname_in_env, BRT old_brt, TOKUTXN txn } if (txn) { - txn_note_doing_work(txn); r = toku_txn_note_brt(txn, old_brt); // mark old brt as touched by this txn assert(r==0); } @@ -3618,6 +3620,8 @@ toku_dictionary_redirect (const char *dst_fname_in_env, BRT old_brt, TOKUTXN txn assert(new_h->txnid_that_created_or_locked_when_empty == TXNID_NONE); TXNID xid = toku_txn_get_txnid(txn); new_h->txnid_that_created_or_locked_when_empty = xid; + r = toku_log_suppress_rollback(txn->logger, NULL, 0, new_filenum, xid); + assert(r==0); } cleanup: @@ -3856,6 +3860,8 @@ toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **malloc if (h->panic) { r = h->panic; } else if (h->dictionary_opened) { //Otherwise header has never fully been created. + assert(h->cf == cachefile); + TOKULOGGER logger = toku_cachefile_logger(cachefile); LSN lsn = ZERO_LSN; //Get LSN if (oplsn_valid) { @@ -3868,17 +3874,19 @@ toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **malloc else { //Get LSN from logger lsn = ZERO_LSN; // if there is no logger, we use zero for the lsn - TOKULOGGER logger = toku_cachefile_logger(cachefile); if (logger) { - //NEED NAME char* fname_in_env = toku_cachefile_fname_in_env(cachefile); assert(fname_in_env); BYTESTRING bs = {.len=strlen(fname_in_env), .data=fname_in_env}; - r = toku_log_fclose(logger, &lsn, h->dirty, bs, toku_cachefile_filenum(cachefile), h->flags); // flush the log on close (if new header is being written), otherwise it might not make it out. + r = toku_log_fclose(logger, &lsn, h->dirty, bs, toku_cachefile_filenum(cachefile)); // flush the log on close (if new header is being written), otherwise it might not make it out. 
if (r!=0) return r; } } if (h->dirty) { // this is the only place this bit is tested (in currentheader) + if (logger) { //Rollback cachefile MUST NOT BE CLOSED DIRTY + //It can be checkpointed only via 'checkpoint' + assert(logger->rollback_cachefile != cachefile); + } int r2; //assert(lsn.lsn!=0); r2 = toku_brtheader_begin_checkpoint(cachefile, fd, lsn, header_v); @@ -5315,11 +5323,10 @@ int toku_brt_maybe_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn, BOOL op int r; XIDS message_xids; TXNID xid = toku_txn_get_txnid(txn); - txn_note_doing_work(txn); if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) { - BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)}; - BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)}; - r = toku_logger_save_rollback_cmddeleteboth(txn, toku_cachefile_filenum(brt->cf), keybs, databs); + BYTESTRING keybs = {key->size, key->data}; + BYTESTRING databs = {val->size, val->data}; + r = toku_logger_save_rollback_cmddeleteboth(txn, toku_cachefile_filenum(brt->cf), &keybs, &databs); if (r!=0) return r; r = toku_txn_note_brt(txn, brt); if (r!=0) return r; @@ -5671,8 +5678,8 @@ int toku_brt_destroy(void) { } //Return TRUE if empty, FALSE if not empty. 
-static BOOL -brt_is_empty (BRT brt) { +BOOL +toku_brt_is_empty (BRT brt) { BRT_CURSOR cursor; int r, r2; BOOL is_empty; @@ -5687,12 +5694,12 @@ brt_is_empty (BRT brt) { } int -toku_brt_note_table_lock (BRT brt, TOKUTXN txn) -{ +toku_brt_note_table_lock (BRT brt, TOKUTXN txn, BOOL ignore_not_empty) { int r = 0; if (brt->h->txnid_that_created_or_locked_when_empty != toku_txn_get_txnid(txn) && - brt_is_empty(brt) && - brt->h->txnid_that_created_or_locked_when_empty == 0) { + (ignore_not_empty || toku_brt_is_empty(brt)) && + brt->h->txnid_that_created_or_locked_when_empty == TXNID_NONE) + { brt->h->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(txn); r = toku_txn_note_brt(txn, brt); assert(r==0); @@ -5711,7 +5718,7 @@ LSN toku_brt_checkpoint_lsn(BRT brt) { return brt->h->checkpoint_lsn; } -static int toku_brt_header_set_panic(struct brt_header *h, int panic, char *panic_string) { +int toku_brt_header_set_panic(struct brt_header *h, int panic, char *panic_string) { if (h->panic == 0) { h->panic = panic; if (h->panic_string) @@ -5743,7 +5750,7 @@ int toku_logger_log_fdelete (TOKUTXN txn, const char *fname, FILENUM filenum, u_ // Prepare to remove a dictionary from the database when this transaction is committed: // - if cachetable has file open, mark it as in use so that cf remains valid until we're done // - mark transaction as NEED fsync on commit -// - make entry in rolltmp log +// - make entry in rollback log // - make fdelete entry in recovery log int toku_brt_remove_on_commit(TOKUTXN txn, DBT* iname_in_env_dbt_p) { assert(txn); @@ -5779,12 +5786,9 @@ int toku_brt_remove_on_commit(TOKUTXN txn, DBT* iname_in_env_dbt_p) { toku_txn_force_fsync_on_commit(txn); //If the txn commits, the commit MUST be in the log //before the file is actually unlinked { - BYTESTRING iname_in_env_bs = { - .len=strlen(iname_in_env), - .data = toku_strdup_in_rollback(txn, iname_in_env) - }; - // make entry in rolltmp log - r = toku_logger_save_rollback_fdelete(txn, was_open, 
filenum, iname_in_env_bs); + BYTESTRING iname_in_env_bs = { .len=strlen(iname_in_env), .data = (char*)iname_in_env }; + // make entry in rollback log + r = toku_logger_save_rollback_fdelete(txn, was_open, filenum, &iname_in_env_bs); assert(r==0); //On error we would need to remove the CF reference, which is complicated. } if (r==0) @@ -5794,7 +5798,7 @@ int toku_brt_remove_on_commit(TOKUTXN txn, DBT* iname_in_env_dbt_p) { } -// +// Non-transaction version of fdelete int toku_brt_remove_now(CACHETABLE ct, DBT* iname_in_env_dbt_p) { int r; const char *iname_in_env = iname_in_env_dbt_p->data; diff --git a/newbrt/brt.h b/newbrt/brt.h index be9aab3899e..25277a428d7 100644 --- a/newbrt/brt.h +++ b/newbrt/brt.h @@ -52,7 +52,7 @@ int brt_set_cachetable(BRT, CACHETABLE); int toku_brt_open(BRT, const char *fname_in_env, int is_create, int only_create, CACHETABLE ct, TOKUTXN txn, DB *db); int toku_brt_open_recovery(BRT, const char *fname_in_env, - int is_create, int only_create, CACHETABLE ct, TOKUTXN txn, DB *db, int recovery_force_fcreate, FILENUM use_filenum); + int is_create, int only_create, CACHETABLE ct, TOKUTXN txn, DB *db, FILENUM use_filenum); int toku_brt_remove_subdb(BRT brt, const char *dbname, u_int32_t flags); @@ -206,12 +206,14 @@ void toku_maybe_truncate_cachefile (CACHEFILE cf, int fd, u_int64_t size_used); int maybe_preallocate_in_file (int fd, u_int64_t size); // Effect: If file size is less than SIZE, make it bigger by either doubling it or growing by 16MB whichever is less. -int toku_brt_note_table_lock (BRT brt, TOKUTXN txn); +int toku_brt_note_table_lock (BRT brt, TOKUTXN txn, BOOL ignore_not_empty); // Effect: Record the fact that the BRT has a table lock (and thus no other txn will modify it until this txn completes. As a result, we can limit the amount of information in the rollback data structure. 
int toku_brt_zombie_needed (BRT brt); int toku_brt_get_fragmentation(BRT brt, TOKU_DB_FRAGMENTATION report); +int toku_brt_header_set_panic(struct brt_header *h, int panic, char *panic_string); +BOOL toku_brt_is_empty (BRT brt); double get_tdiff(void) __attribute__((__visibility__("default"))); diff --git a/newbrt/brttypes.h b/newbrt/brttypes.h index cf5930f7ec4..dae5ce0716b 100644 --- a/newbrt/brttypes.h +++ b/newbrt/brttypes.h @@ -33,6 +33,7 @@ typedef u_int64_t TXNID; #define TXNID_NONE ((TXNID)0) typedef struct s_blocknum { int64_t b; } BLOCKNUM; // make a struct so that we will notice type problems. +#define ROLLBACK_NONE ((BLOCKNUM){0}) static inline BLOCKNUM make_blocknum(int64_t b) { BLOCKNUM result={b}; return result; } @@ -70,6 +71,7 @@ typedef enum __toku_bool { FALSE=0, TRUE=1} BOOL; typedef struct tokulogger *TOKULOGGER; #define NULL_LOGGER ((TOKULOGGER)0) typedef struct tokutxn *TOKUTXN; +typedef struct txninfo *TXNINFO; #define NULL_TXN ((TOKUTXN)0) struct logged_btt_pair { @@ -121,5 +123,8 @@ typedef int (*generate_row_for_del_func)(DB *dest_db, DB *src_db, DBT *dest_val, #define UU(x) x __attribute__((__unused__)) +typedef struct memarena *MEMARENA; +typedef struct rollback_log_node *ROLLBACK_LOG_NODE; + #endif diff --git a/newbrt/cachetable.c b/newbrt/cachetable.c index 4f2131f4bb7..fc31227c664 100644 --- a/newbrt/cachetable.c +++ b/newbrt/cachetable.c @@ -100,8 +100,6 @@ struct ctpair { PAIR next,prev; // In LRU list. PAIR hash_chain; - LSN modified_lsn; // What was the LSN when modified (undefined if not dirty) - LSN written_lsn; // What was the LSN when written (we need to get this information when we fetch) BOOL checkpoint_pending; // If this is on, then we have got to write the pair out to disk before modifying it. 
PAIR pending_next; @@ -155,6 +153,8 @@ struct cachetable { struct workqueue wq; // async work queue THREADPOOL threadpool; // pool of worker threads LSN lsn_of_checkpoint_in_progress; + u_int32_t checkpoint_num_files; // how many cachefiles are in the checkpoint + u_int32_t checkpoint_num_txns; // how many transactions are in the checkpoint PAIR pending_head; // list of pairs marked with checkpoint_pending struct rwlock pending_lock; // multiple writer threads, single checkpoint thread struct minicron checkpointer; // the periodic checkpointing thread @@ -165,7 +165,6 @@ struct cachetable { BOOL set_env_dir; //Can only set env_dir once }; - // Lock the cachetable static inline void cachefiles_lock(CACHETABLE ct) { int r = toku_pthread_mutex_lock(&ct->cachefiles_mutex); assert(r == 0); @@ -224,6 +223,7 @@ struct cachefile { void *userdata; int (*log_fassociate_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log all open files. + int (*log_suppress_rollback_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log which files need rollbacks suppressed int (*close_userdata)(CACHEFILE cf, int fd, void *userdata, char **error_string, BOOL lsnvalid, LSN); // when closing the last reference to a cachefile, first call this function. int (*begin_checkpoint_userdata)(CACHEFILE cf, int fd, LSN lsn_of_checkpoint, void *userdata); // before checkpointing cachefiles call this function. int (*checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // when checkpointing a cachefile, call this function. 
@@ -1239,8 +1239,7 @@ static PAIR cachetable_insert_at(CACHETABLE ct, CACHETABLE_FLUSH_CALLBACK flush_callback, CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs, - enum cachetable_dirty dirty, - LSN written_lsn) { + enum cachetable_dirty dirty) { TAGMALLOC(PAIR, p); assert(p); memset(p, 0, sizeof *p); @@ -1255,8 +1254,6 @@ static PAIR cachetable_insert_at(CACHETABLE ct, p->flush_callback = flush_callback; p->fetch_callback = fetch_callback; p->extraargs = extraargs; - p->modified_lsn.lsn = 0; - p->written_lsn = written_lsn; p->fullhash = fullhash; p->next = p->prev = 0; rwlock_init(&p->rwlock); @@ -1321,7 +1318,7 @@ int toku_cachetable_put(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, v } // flushing could change the table size, but wont' change the fullhash cachetable_puts++; - PAIR p = cachetable_insert_at(ct, cachefile, key, value, CTPAIR_IDLE, fullhash, size, flush_callback, fetch_callback, extraargs, CACHETABLE_DIRTY, ZERO_LSN); + PAIR p = cachetable_insert_at(ct, cachefile, key, value, CTPAIR_IDLE, fullhash, size, flush_callback, fetch_callback, extraargs, CACHETABLE_DIRTY); assert(p); rwlock_read_lock(&p->rwlock, ct->mutex); note_hash_count(count); @@ -1465,7 +1462,7 @@ int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t ful int r; // Note. hashit(t,key) may have changed as a result of flushing. But fullhash won't have changed. 
{ - p = cachetable_insert_at(ct, cachefile, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN, ZERO_LSN); + p = cachetable_insert_at(ct, cachefile, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN); assert(p); get_and_pin_footprint = 10; rwlock_write_lock(&p->rwlock, ct->mutex); @@ -1619,7 +1616,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, // if not found then create a pair in the READING state and fetch it if (p == 0) { cachetable_prefetches++; - p = cachetable_insert_at(ct, cf, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN, ZERO_LSN); + p = cachetable_insert_at(ct, cf, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN); assert(p); rwlock_write_lock(&p->rwlock, ct->mutex); #if DO_WORKER_THREAD @@ -1906,18 +1903,53 @@ int toku_cachetable_unpin_and_remove (CACHEFILE cachefile, CACHEKEY key) { } static int -log_open_txn (OMTVALUE txnv, u_int32_t UU(index), void *loggerv) { - TOKUTXN txn = txnv; - TOKULOGGER logger = loggerv; - if (toku_logger_txn_parent(txn)==NULL) { // only have to log the open root transactions - int r = toku_log_xstillopen(logger, NULL, 0, - toku_txn_get_txnid(txn), - toku_txn_get_txnid(toku_logger_txn_parent(txn))); - assert(r==0); - } +set_filenum_in_array(OMTVALUE brtv, u_int32_t index, void*arrayv) { + FILENUM *array = arrayv; + BRT brt = brtv; + array[index] = toku_cachefile_filenum(brt->cf); return 0; } +static int +log_open_txn (OMTVALUE txnv, u_int32_t UU(index), void *UU(extra)) { + TOKUTXN txn = txnv; + TOKULOGGER logger = txn->logger; + FILENUMS open_filenums; + uint32_t num_filenums = toku_omt_size(txn->open_brts); + FILENUM array[num_filenums]; + { + open_filenums.num = num_filenums; + open_filenums.filenums = array; + //Fill in open_filenums 
+ int r = toku_omt_iterate(txn->open_brts, set_filenum_in_array, array); + assert(r==0); + } + int r = toku_log_xstillopen(logger, NULL, 0, + toku_txn_get_txnid(txn), + toku_txn_get_txnid(toku_logger_txn_parent(txn)), + txn->rollentry_raw_count, + open_filenums, + txn->force_fsync_on_commit, + txn->num_rollback_nodes, + txn->num_rollentries, + txn->spilled_rollback_head, + txn->spilled_rollback_tail, + txn->current_rollback); + assert(r==0); + return 0; +} + +static int +unpin_rollback_log_for_checkpoint (OMTVALUE txnv, u_int32_t UU(index), void *UU(extra)) { + int r = 0; + TOKUTXN txn = txnv; + if (txn->pinned_inprogress_rollback_log) { + r = toku_rollback_log_unpin(txn, txn->pinned_inprogress_rollback_log); + assert(r==0); + } + return r; +} + // TODO: #1510 locking of cachetable is suspect // verify correct algorithm overall @@ -1931,7 +1963,17 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { { unsigned i; + if (logger) { // Unpin all 'inprogress rollback log nodes' pinned by transactions + int r = toku_omt_iterate(logger->live_txns, + unpin_rollback_log_for_checkpoint, + NULL); + assert(r==0); + } cachetable_lock(ct); + //Initialize accountability counters + ct->checkpoint_num_files = 0; + ct->checkpoint_num_txns = 0; + //Make list of cachefiles to be included in checkpoint. //If refcount is 0, the cachefile is closing (performing a local checkpoint) { @@ -1960,11 +2002,6 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { assert(r==0); ct->lsn_of_checkpoint_in_progress = begin_lsn; } - // Log all the open transactions - { - int r = toku_omt_iterate(logger->live_txns, log_open_txn, logger); - assert(r==0); - } // Log all the open files { //Must loop through ALL open files (even if not included in checkpoint). 
@@ -1973,6 +2010,26 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) { for (cf = ct->cachefiles; cf; cf=cf->next) { if (cf->log_fassociate_during_checkpoint) { int r = cf->log_fassociate_during_checkpoint(cf, cf->userdata); + ct->checkpoint_num_files++; + assert(r==0); + } + } + cachefiles_unlock(ct); + } + // Log all the open transactions MUST BE AFTER OPEN FILES + { + ct->checkpoint_num_txns = toku_omt_size(logger->live_txns); + int r = toku_omt_iterate(logger->live_txns, log_open_txn, NULL); + assert(r==0); + } + // Log rollback suppression for all the open files MUST BE AFTER TXNS + { + //Must loop through ALL open files (even if not included in checkpoint). + CACHEFILE cf; + cachefiles_lock(ct); + for (cf = ct->cachefiles; cf; cf=cf->next) { + if (cf->log_suppress_rollback_during_checkpoint) { + int r = cf->log_suppress_rollback_during_checkpoint(cf, cf->userdata); assert(r==0); } } @@ -2115,7 +2172,10 @@ toku_cachetable_end_checkpoint(CACHETABLE ct, TOKULOGGER logger, if (logger) { int r = toku_log_end_checkpoint(logger, NULL, 1, // want the end_checkpoint to be fsync'd - ct->lsn_of_checkpoint_in_progress.lsn, 0); + ct->lsn_of_checkpoint_in_progress.lsn, + 0, + ct->checkpoint_num_files, + ct->checkpoint_num_txns); assert(r==0); toku_logger_note_checkpoint(logger, ct->lsn_of_checkpoint_in_progress); } @@ -2262,6 +2322,7 @@ void toku_cachefile_set_userdata (CACHEFILE cf, void *userdata, int (*log_fassociate_during_checkpoint)(CACHEFILE, void*), + int (*log_suppress_rollback_during_checkpoint)(CACHEFILE, void*), int (*close_userdata)(CACHEFILE, int, void*, char**, BOOL, LSN), int (*checkpoint_userdata)(CACHEFILE, int, void*), int (*begin_checkpoint_userdata)(CACHEFILE, int, LSN, void*), @@ -2270,6 +2331,7 @@ toku_cachefile_set_userdata (CACHEFILE cf, int (*note_unpin_by_checkpoint)(CACHEFILE, void*)) { cf->userdata = userdata; cf->log_fassociate_during_checkpoint = log_fassociate_during_checkpoint; + 
cf->log_suppress_rollback_during_checkpoint = log_suppress_rollback_during_checkpoint; cf->close_userdata = close_userdata; cf->checkpoint_userdata = checkpoint_userdata; cf->begin_checkpoint_userdata = begin_checkpoint_userdata; diff --git a/newbrt/cachetable.h b/newbrt/cachetable.h index cb880950bbb..311b6bb40b5 100644 --- a/newbrt/cachetable.h +++ b/newbrt/cachetable.h @@ -123,6 +123,7 @@ typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int3 void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata, int (*log_fassociate_during_checkpoint)(CACHEFILE, void*), + int (*log_suppress_rollback_during_checkpoint)(CACHEFILE, void*), int (*close_userdata)(CACHEFILE, int, void*, char **/*error_string*/, BOOL, LSN), int (*checkpoint_userdata)(CACHEFILE, int, void*), int (*begin_checkpoint_userdata)(CACHEFILE, int, LSN, void*), diff --git a/newbrt/checkpoint.c b/newbrt/checkpoint.c index 3e2297ae69e..00226e9dcb0 100644 --- a/newbrt/checkpoint.c +++ b/newbrt/checkpoint.c @@ -218,7 +218,6 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger, checkpoint_footprint = 40; time_last_checkpoint_begin = time(NULL); r = toku_cachetable_begin_checkpoint(ct, logger); - LSN oldest_live_lsn = toku_logger_get_oldest_living_lsn(logger); multi_operation_checkpoint_unlock(); ydb_unlock(); @@ -230,7 +229,7 @@ toku_checkpoint(CACHETABLE ct, TOKULOGGER logger, r = toku_cachetable_end_checkpoint(ct, logger, ydb_lock, ydb_unlock, callback2_f, extra2); } if (r==0 && logger) { - LSN trim_lsn = (oldest_live_lsn.lsn < logger->checkpoint_lsn.lsn) ? oldest_live_lsn : logger->checkpoint_lsn; + LSN trim_lsn = logger->last_completed_checkpoint_lsn; r = toku_logger_maybe_trim_log(logger, trim_lsn); } diff --git a/newbrt/log-internal.h b/newbrt/log-internal.h index a1a4236a0f6..035d9217693 100644 --- a/newbrt/log-internal.h +++ b/newbrt/log-internal.h @@ -85,7 +85,7 @@ struct tokulogger { // To access these, you must have the output condition lock. 
LSN written_lsn; // the last lsn written LSN fsynced_lsn; // What is the LSN of the highest fsynced log entry (accessed only while holding the output lock, and updated only when the output lock and output permission are held) - LSN checkpoint_lsn; // What is the LSN of the most recent completed checkpoint. + LSN last_completed_checkpoint_lsn; // What is the LSN of the most recent completed checkpoint. long long next_log_file_number; struct logbuf outbuf; // data being written to the file int n_in_file; // The amount of data in the current file @@ -101,6 +101,7 @@ struct tokulogger { u_int64_t swap_ctr; // how many times have input/output log buffers been swapped void (*remove_finalize_callback) (DICTIONARY_ID, void*); // ydb-level callback to be called when a transaction that ... void * remove_finalize_callback_extra; // ... deletes a file is committed or when one that creates a file is aborted. + CACHEFILE rollback_cachefile; }; int toku_logger_find_next_unused_log_file(const char *directory, long long *result); @@ -116,25 +117,36 @@ struct tokutxn { u_int64_t txnid64; /* this happens to be the first lsn */ TOKULOGGER logger; TOKUTXN parent; - LSN last_lsn; /* Everytime anything is logged, update the LSN. (We need to atomically record the LSN along with writing into the log.) */ - LSN first_lsn; /* The first lsn in the transaction. */ - struct roll_entry *oldest_logentry,*newest_logentry; /* Only logentries with rollbacks are here. There is a list going from newest to oldest. */ - MEMARENA rollentry_arena; - - size_t rollentry_resident_bytecount; // How many bytes for the rollentries that are stored in main memory. - char *rollentry_filename; - int rollentry_fd; // If we spill the roll_entries, we write them into this fd. - toku_off_t rollentry_filesize; // How many bytes are in the rollentry file (this is the uncompressed bytes. 
If the file is compressed it may actually be smaller (or even larger with header information)) u_int64_t rollentry_raw_count; // the total count of every byte in the transaction and all its children. OMT open_brts; // a collection of the brts that we touched. Indexed by filenum. XIDS xids; //Represents the xid list BOOL force_fsync_on_commit; //This transaction NEEDS an fsync once (if) it commits. (commit means root txn) - BOOL has_done_work; //If this transaction has not done work, there is no need to fsync. TXN_PROGRESS_POLL_FUNCTION progress_poll_fun; void * progress_poll_fun_extra; + uint64_t num_rollback_nodes; uint64_t num_rollentries; uint64_t num_rollentries_processed; + BLOCKNUM spilled_rollback_head; + uint32_t spilled_rollback_head_hash; + BLOCKNUM spilled_rollback_tail; + uint32_t spilled_rollback_tail_hash; + BLOCKNUM current_rollback; + uint32_t current_rollback_hash; + BOOL recovered_from_checkpoint; + ROLLBACK_LOG_NODE pinned_inprogress_rollback_log; +}; + +struct txninfo { + uint64_t rollentry_raw_count; // the total count of every byte in the transaction and all its children. + uint32_t num_brts; + BRT *open_brts; + BOOL force_fsync_on_commit; //This transaction NEEDS an fsync once (if) it commits. 
(commit means root txn) + uint64_t num_rollback_nodes; + uint64_t num_rollentries; + BLOCKNUM spilled_rollback_head; + BLOCKNUM spilled_rollback_tail; + BLOCKNUM current_rollback; }; static inline int toku_logsizeof_u_int8_t (u_int32_t v __attribute__((__unused__))) { @@ -180,5 +192,4 @@ static inline char *fixup_fname(BYTESTRING *f) { return fname; } -int toku_read_rollback_backwards(BREAD, struct roll_entry **item, MEMARENA); #endif diff --git a/newbrt/log.h b/newbrt/log.h index 2d0569af250..3a845723b12 100644 --- a/newbrt/log.h +++ b/newbrt/log.h @@ -11,7 +11,6 @@ #include "../include/db.h" #include "brttypes.h" #include "memory.h" -#include "bread.h" #include "x1764.h" typedef void(*voidfp)(void); diff --git a/newbrt/logformat.c b/newbrt/logformat.c index 30213b8595c..9532c4ee079 100644 --- a/newbrt/logformat.c +++ b/newbrt/logformat.c @@ -41,8 +41,6 @@ struct logtype { // In the fields, don't mention the command, the LSN, the CRC or the trailing LEN. -int logformat_version_number = 0; - const struct logtype rollbacks[] = { //TODO: #2037 Add dname {"fdelete", 'U', FA{{"u_int8_t", "file_was_open", 0}, @@ -72,7 +70,12 @@ const struct logtype rollbacks[] = { {"FILENUM", "filenum", 0}, {"BYTESTRING", "key", 0}, NULLFIELD}}, - {"rollinclude", 'r', FA{{"BYTESTRING", "fname", 0}, + {"rollinclude", 'r', FA{{"TXNID", "xid", 0}, + {"u_int64_t", "num_nodes", 0}, + {"BLOCKNUM", "spilled_head", 0}, + {"u_int32_t", "spilled_head_hash", 0}, + {"BLOCKNUM", "spilled_tail", 0}, + {"u_int32_t", "spilled_tail_hash", 0}, NULLFIELD}}, {"tablelock_on_empty_table", 'L', FA{{"FILENUM", "filenum", 0}, NULLFIELD}}, @@ -82,46 +85,44 @@ const struct logtype rollbacks[] = { {"dictionary_redirect", 'R', FA{{"FILENUM", "old_filenum", 0}, {"FILENUM", "new_filenum", 0}, NULLFIELD}}, -// {"fclose", 'c', FA{{"FILENUM", "filenum", 0}, -// {"BYTESTRING", "fname", 0}, -// NULLFIELD}}, -// {"deleteatleaf", 'd', FA{{"FILENUM", "filenum", 0}, // Note a delete for rollback. 
The delete takes place in a leaf. -// {"BYTESTRING", "key", 0}, -// {"BYTESTRING", "data", 0}, -// NULLFIELD}}, -// {"insertatleaf", 'i', FA{{"FILENUM", "filenum", 0}, // Note an insert for rollback. The insert takes place in a leaf. -// {"BYTESTRING", "key", 0}, -// {"BYTESTRING", "data", 0}, -// NULLFIELD}}, -// {"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0}, -// {"DISKOFFARRAY", "parents", 0}, -// {"DISKOFF", "diskoff", 0}, -// NULLFIELD}}, {0,0,FA{NULLFIELD}} }; const struct logtype logtypes[] = { // Records produced by checkpoints {"begin_checkpoint", 'x', FA{{"u_int64_t", "timestamp", 0}, NULLFIELD}}, - {"end_checkpoint", 'X', FA{{"TXNID", "txnid", 0}, {"u_int64_t", "timestamp", 0}, NULLFIELD}}, // TXNID is LSN of begin_checkpoint + {"end_checkpoint", 'X', FA{{"TXNID", "xid", 0}, // xid is LSN of begin_checkpoint + {"u_int64_t", "timestamp", 0}, + {"u_int32_t", "num_fassociate_entries", 0}, // how many files were checkpointed + {"u_int32_t", "num_xstillopen_entries", 0}, // how many txns were checkpointed + NULLFIELD}}, //TODO: #2037 Add dname {"fassociate", 'f', FA{{"FILENUM", "filenum", 0}, {"u_int32_t", "treeflags", 0}, {"BYTESTRING", "iname", 0}, // pathname of file NULLFIELD}}, - {"xstillopen", 's', FA{{"TXNID", "txnid", 0}, - {"TXNID", "parent", 0}, - NULLFIELD}}, // only record root transactions + //We do not use a TXNINFO struct since recovery log has + //FILENUMS and TOKUTXN has BRTs (for open_brts) + {"xstillopen", 's', FA{{"TXNID", "xid", 0}, + {"TXNID", "parentxid", 0}, + {"u_int64_t", "rollentry_raw_count", 0}, + {"FILENUMS", "open_filenums", 0}, + {"u_int8_t", "force_fsync_on_commit", 0}, + {"u_int64_t", "num_rollback_nodes", 0}, + {"u_int64_t", "num_rollentries", 0}, + {"BLOCKNUM", "spilled_rollback_head", 0}, + {"BLOCKNUM", "spilled_rollback_tail", 0}, + {"BLOCKNUM", "current_rollback", 0}, + NULLFIELD}}, // record all transactions + {"suppress_rollback", 'S', FA{{"FILENUM", "filenum", 0}, + {"TXNID", "xid", 0}, + NULLFIELD}}, // 
Records produced by transactions - {"commit", 'C', FA{{"TXNID", "txnid", 0},NULLFIELD}}, - {"xabort", 'q', FA{{"TXNID", "txnid", 0},NULLFIELD}}, - {"xbegin", 'b', FA{{"TXNID", "parenttxnid", 0},NULLFIELD}}, + {"xbegin", 'b', FA{{"TXNID", "parentxid", 0},NULLFIELD}}, + {"xcommit",'C', FA{{"TXNID", "xid", 0},NULLFIELD}}, + {"xabort", 'q', FA{{"TXNID", "xid", 0},NULLFIELD}}, //TODO: #2037 Add dname - {"fdelete", 'U', FA{{"TXNID", "txnid", 0}, - {"BYTESTRING", "iname", 0}, - NULLFIELD}}, - //TODO: #2037 Add dname - {"fcreate", 'F', FA{{"TXNID", "txnid", 0}, + {"fcreate", 'F', FA{{"TXNID", "xid", 0}, {"FILENUM", "filenum", 0}, {"BYTESTRING", "iname", 0}, {"u_int32_t", "mode", "0%o"}, @@ -137,21 +138,24 @@ const struct logtype logtypes[] = { //TODO: #2037 Add dname {"fclose", 'e', FA{{"BYTESTRING", "iname", 0}, {"FILENUM", "filenum", 0}, - {"u_int32_t", "treeflags", 0}, NULLFIELD}}, + //TODO: #2037 Add dname + {"fdelete", 'U', FA{{"TXNID", "xid", 0}, + {"BYTESTRING", "iname", 0}, + NULLFIELD}}, {"tablelock_on_empty_table", 'L', FA{{"FILENUM", "filenum", 0}, {"TXNID", "xid", 0}, NULLFIELD}}, + {"enq_insert", 'I', FA{{"FILENUM", "filenum", 0}, + {"TXNID", "xid", 0}, + {"BYTESTRING", "key", 0}, + {"BYTESTRING", "value", 0}, + NULLFIELD}}, {"enq_insert_no_overwrite", 'i', FA{{"FILENUM", "filenum", 0}, {"TXNID", "xid", 0}, {"BYTESTRING", "key", 0}, {"BYTESTRING", "value", 0}, NULLFIELD}}, - {"enq_insert", 'I', FA{{"FILENUM", "filenum", 0}, - {"TXNID", "xid", 0}, - {"BYTESTRING", "key", 0}, - {"BYTESTRING", "value", 0}, - NULLFIELD}}, {"enq_delete_both", 'D', FA{{"FILENUM", "filenum", 0}, {"TXNID", "xid", 0}, {"BYTESTRING", "key", 0}, @@ -277,11 +281,10 @@ generate_log_struct (void) { fprintf(hf, "struct roll_entry {\n"); fprintf(hf, " enum rt_cmd cmd;\n"); + fprintf(hf, " struct roll_entry *prev; /* for in-memory list of log entries. Threads from newest to oldest. 
*/\n"); fprintf(hf, " union {\n"); DO_ROLLBACKS(lt, fprintf(hf," struct rolltype_%s %s;\n", lt->name, lt->name)); fprintf(hf, " } u;\n"); - fprintf(hf, " struct roll_entry *prev; /* for in-memory list of log entries. Threads from newest to oldest. */\n"); - fprintf(hf, " struct roll_entry *next; /* Points to a newer logentry. Needed for flushing to disk, since we want to write the oldest one first. */\n"); fprintf(hf, "};\n"); } @@ -532,47 +535,76 @@ static void generate_rollbacks (void) { DO_ROLLBACKS(lt, { fprintf2(cf, hf, "int toku_logger_save_rollback_%s (TOKUTXN txn", lt->name); - DO_FIELDS(ft, lt, fprintf2(cf, hf, ", %s %s", ft->type, ft->name)); + DO_FIELDS(ft, lt, { + if ( strcmp(ft->type, "BYTESTRING") == 0 ) { + fprintf2(cf, hf, ", BYTESTRING *%s_ptr", ft->name); + } + else { + fprintf2(cf, hf, ", %s %s", ft->type, ft->name); + } + }); + fprintf(hf, ");\n"); fprintf(cf, ") {\n"); + fprintf(cf, " int r;\n"); + fprintf(cf, " ROLLBACK_LOG_NODE log;\n"); + fprintf(cf, " r = toku_get_and_pin_rollback_log_for_new_entry(txn, &log);\n"); + fprintf(cf, " assert(r==0);\n"); + // 'memdup' all BYTESTRINGS here + DO_FIELDS(ft, lt, { + if ( strcmp(ft->type, "BYTESTRING") == 0 ) { + fprintf(cf, " BYTESTRING %s = {\n" + " .len = %s_ptr->len,\n" + " .data = toku_memdup_in_rollback(log, %s_ptr->data, %s_ptr->len)\n" + " };\n", + ft->name, ft->name, ft->name, ft->name); + } + }); { int count=0; fprintf(cf, " u_int32_t rollback_fsize = toku_logger_rollback_fsize_%s(", lt->name); DO_FIELDS(ft, lt, fprintf(cf, "%s%s", (count++>0)?", ":"", ft->name)); fprintf(cf, ");\n"); } - fprintf(cf, " struct roll_entry *v = toku_malloc_in_rollback(txn, sizeof(*v));\n"); + fprintf(cf, " struct roll_entry *v;\n"); + fprintf(cf, " size_t mem_needed = sizeof(v->u.%s) + __builtin_offsetof(struct roll_entry, u.%s);\n", lt->name, lt->name); + fprintf(cf, " v = toku_malloc_in_rollback(log, mem_needed);\n"); fprintf(cf, " if (v==0) return errno;\n"); fprintf(cf, " v->cmd = (enum rt_cmd)%u;\n", 
lt->command_and_flags&0xff); DO_FIELDS(ft, lt, fprintf(cf, " v->u.%s.%s = %s;\n", lt->name, ft->name, ft->name)); - fprintf(cf, " v->prev = txn->newest_logentry;\n"); - fprintf(cf, " v->next = 0;\n"); - fprintf(cf, " if (txn->oldest_logentry==0) txn->oldest_logentry=v;\n"); - fprintf(cf, " else txn->newest_logentry->next = v;\n"); - fprintf(cf, " txn->newest_logentry = v;\n"); - fprintf(cf, " txn->rollentry_resident_bytecount += rollback_fsize;\n"); + fprintf(cf, " v->prev = log->newest_logentry;\n"); + fprintf(cf, " if (log->oldest_logentry==NULL) log->oldest_logentry=v;\n"); + fprintf(cf, " log->newest_logentry = v;\n"); + fprintf(cf, " log->rollentry_resident_bytecount += rollback_fsize;\n"); fprintf(cf, " txn->rollentry_raw_count += rollback_fsize;\n"); fprintf(cf, " txn->num_rollentries++;\n"); - fprintf(cf, " return toku_maybe_spill_rollbacks(txn);\n}\n"); + fprintf(cf, " log->dirty = TRUE;\n"); + fprintf(cf, " return toku_maybe_spill_rollbacks(txn, log);\n}\n"); }); DO_ROLLBACKS(lt, { - fprintf2(cf, hf, "void toku_logger_rollback_wbufwrite_%s (struct wbuf *wbuf", lt->name); + fprintf2(cf, hf, "void toku_logger_rollback_wbuf_nocrc_write_%s (struct wbuf *wbuf", lt->name); DO_FIELDS(ft, lt, fprintf2(cf, hf, ", %s %s", ft->type, ft->name)); fprintf2(cf, hf, ")"); fprintf(hf, ";\n"); fprintf(cf, " {\n"); - fprintf(cf, " u_int32_t ndone_at_start = wbuf->ndone;\n"); - fprintf(cf, " wbuf_char(wbuf, '%c');\n", (char)(0xff&lt->command_and_flags)); - DO_FIELDS(ft, lt, fprintf(cf, " wbuf_%s(wbuf, %s);\n", ft->type, ft->name)); - fprintf(cf, " wbuf_int(wbuf, 4+wbuf->ndone - ndone_at_start);\n"); + + { + int count=0; + fprintf(cf, " u_int32_t rollback_fsize = toku_logger_rollback_fsize_%s(", lt->name); + DO_FIELDS(ft, lt, fprintf(cf, "%s%s", (count++>0)?", ":"", ft->name)); + fprintf(cf, ");\n"); + fprintf(cf, " wbuf_nocrc_int(wbuf, rollback_fsize);\n"); + } + fprintf(cf, " wbuf_nocrc_char(wbuf, '%c');\n", (char)(0xff&lt->command_and_flags)); + DO_FIELDS(ft, lt, fprintf(cf, " 
wbuf_nocrc_%s(wbuf, %s);\n", ft->type, ft->name)); fprintf(cf, "}\n"); }); - fprintf2(cf, hf, "void toku_logger_rollback_wbufwrite (struct wbuf *wbuf, struct roll_entry *r)"); + fprintf2(cf, hf, "void toku_logger_rollback_wbuf_nocrc_write (struct wbuf *wbuf, struct roll_entry *r)"); fprintf(hf, ";\n"); fprintf(cf, " {\n switch (r->cmd) {\n"); DO_ROLLBACKS(lt, { - fprintf(cf, " case RT_%s: toku_logger_rollback_wbufwrite_%s(wbuf", lt->name, lt->name); + fprintf(cf, " case RT_%s: toku_logger_rollback_wbuf_nocrc_write_%s(wbuf", lt->name, lt->name); DO_FIELDS(ft, lt, fprintf(cf, ", r->u.%s.%s", lt->name, ft->name)); fprintf(cf, "); return;\n"); }); @@ -604,12 +636,15 @@ generate_rollbacks (void) { fprintf2(cf, hf, "int toku_parse_rollback(unsigned char *buf, u_int32_t n_bytes, struct roll_entry **itemp, MEMARENA ma)"); fprintf(hf, ";\n"); - fprintf(cf, " {\n assert(n_bytes>0);\n struct roll_entry *item = malloc_in_memarena(ma, sizeof(*item));\n item->cmd=(enum rt_cmd)(buf[0]);\n"); + fprintf(cf, " {\n assert(n_bytes>0);\n struct roll_entry *item;\n enum rt_cmd cmd = (enum rt_cmd)(buf[0]);\n size_t mem_needed;\n"); fprintf(cf, " struct rbuf rc = {buf, n_bytes, 1};\n"); - fprintf(cf, " switch(item->cmd) {\n"); + fprintf(cf, " switch(cmd) {\n"); DO_ROLLBACKS(lt, { fprintf(cf, " case RT_%s:\n", lt->name); - DO_FIELDS(ft, lt, fprintf(cf, " rbuf_ma_%s(&rc, ma, &item->u.%s.%s);\n", ft->type, lt->name, ft->name)); + fprintf(cf, " mem_needed = sizeof(item->u.%s) + __builtin_offsetof(struct roll_entry, u.%s);\n", lt->name, lt->name); + fprintf(cf, " item = malloc_in_memarena(ma, mem_needed);\n"); + fprintf(cf, " item->cmd = cmd;\n"); + DO_FIELDS(ft, lt, fprintf(cf, " rbuf_ma_%s(&rc, ma, &item->u.%s.%s);\n", ft->type, lt->name, ft->name)); fprintf(cf, " *itemp = item;\n"); fprintf(cf, " return 0;\n"); }); diff --git a/newbrt/logger.c b/newbrt/logger.c index 0f4f3c33d54..86bfcc72f25 100644 --- a/newbrt/logger.c +++ b/newbrt/logger.c @@ -55,7 +55,7 @@ int toku_logger_create 
(TOKULOGGER *resultp) { result->outbuf = (struct logbuf) {0, LOGGER_MIN_BUF_SIZE, toku_xmalloc(LOGGER_MIN_BUF_SIZE), ZERO_LSN}; // written_lsn is uninitialized // fsynced_lsn is uninitialized - result->checkpoint_lsn = ZERO_LSN; + result->last_completed_checkpoint_lsn = ZERO_LSN; // next_log_file_number is uninitialized // n_in_file is uninitialized result->write_block_size = BRT_DEFAULT_NODE_SIZE; // default logging size is the same as the default brt block size @@ -68,6 +68,7 @@ int toku_logger_create (TOKULOGGER *resultp) { result->input_lock_ctr = 0; result->output_condition_lock_ctr = 0; result->swap_ctr = 0; + result->rollback_cachefile = NULL; result->output_is_available = TRUE; return 0; @@ -136,6 +137,68 @@ int toku_logger_open (const char *directory, TOKULOGGER logger) { return 0; } +int +toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, BOOL create) { + assert(logger->is_open); + assert(!logger->is_panicked); + assert(!logger->rollback_cachefile); + + int r; + BRT t = NULL; // Note, there is no DB associated with this BRT. + + r = toku_brt_create(&t); + assert(r==0); + r = toku_brt_open(t, ROLLBACK_CACHEFILE_NAME, create, create, cachetable, NULL_TXN, NULL); + assert(r==0); + logger->rollback_cachefile = t->cf; + toku_brtheader_lock(t->h); + //Verify it is empty + assert(!t->h->panic); + //Must have no data blocks (rollback logs or otherwise). + toku_block_verify_no_data_blocks_except_root_unlocked(t->h->blocktable, t->h->root); + toku_brtheader_unlock(t->h); + assert(toku_brt_is_empty(t)); + return r; +} + + +// Requires: Rollback cachefile can only be closed immediately after a checkpoint, +// so it will always be clean (!h->dirty) when about to be closed. +// Rollback log can only be closed when there are no open transactions, +// so it will always be empty (no data blocks) when about to be closed. 
+int +toku_logger_close_rollback(TOKULOGGER logger, BOOL recovery_failed) { + int r = 0; + CACHEFILE cf = logger->rollback_cachefile; // stored in logger at rollback cachefile open + if (!logger->is_panicked && cf) { + BRT brt_to_close; + { //Find "brt" + struct brt_header *h = toku_cachefile_get_userdata(cf); + toku_brtheader_lock(h); + if (!h->panic && recovery_failed) { + toku_brt_header_set_panic(h, EINVAL, "Recovery failed"); + } + //Verify it is safe to close it. + if (!h->panic) { //If paniced, it is safe to close. + assert(!h->dirty); //Must not be dirty. + //Must have no data blocks (rollback logs or otherwise). + toku_block_verify_no_data_blocks_except_root_unlocked(h->blocktable, h->root); + } + assert(!toku_list_empty(&h->live_brts)); // there is always one brt associated with the header + brt_to_close = toku_list_struct(toku_list_head(&h->live_brts), struct brt, live_brt_link); + assert(brt_to_close); + toku_brtheader_unlock(h); + assert(toku_brt_is_empty(brt_to_close)); + } + + char *error_string_ignore = NULL; + r = toku_close_brt(brt_to_close, &error_string_ignore); + //Set as dealt with already. + logger->rollback_cachefile = NULL; + } + return r; +} + // No locks held on entry // No locks held on exit. // No locks are needed, since you cannot legally close the log concurrently with doing anything else. 
@@ -183,7 +246,8 @@ int toku_logger_shutdown(TOKULOGGER logger) { if (logger->is_open) { if (toku_omt_size(logger->live_txns) == 0) { BYTESTRING comment = { strlen("shutdown"), "shutdown" }; - r = toku_log_comment(logger, NULL, TRUE, 0, comment); + int r2 = toku_log_comment(logger, NULL, TRUE, 0, comment); + if (!r) r = r2; } } return r; @@ -787,6 +851,10 @@ int toku_fread_LSN (FILE *f, LSN *lsn, struct x1764 *checksum, u_int32_t *le return toku_fread_u_int64_t (f, &lsn->lsn, checksum, len); } +int toku_fread_BLOCKNUM (FILE *f, BLOCKNUM *b, struct x1764 *checksum, u_int32_t *len) { + return toku_fread_u_int64_t (f, (u_int64_t*)&b->b, checksum, len); +} + int toku_fread_FILENUM (FILE *f, FILENUM *filenum, struct x1764 *checksum, u_int32_t *len) { return toku_fread_u_int32_t (f, &filenum->fileid, checksum, len); } @@ -903,6 +971,11 @@ int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, stru return 0; } +int toku_logprint_BLOCKNUM (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format) { + return toku_logprint_u_int64_t(outf, inf, fieldname, checksum, len, format); + +} + int toku_logprint_FILENUM (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format) { return toku_logprint_u_int32_t(outf, inf, fieldname, checksum, len, format); @@ -982,11 +1055,6 @@ TXNID toku_txn_get_txnid (TOKUTXN txn) { else return txn->txnid64; } -LSN toku_txn_get_last_lsn (TOKUTXN txn) { - if (txn==0) return (LSN){0}; - return txn->last_lsn; -} - LSN toku_logger_last_lsn(TOKULOGGER logger) { return logger->lsn; } @@ -1083,32 +1151,20 @@ int toku_logger_log_archive (TOKULOGGER logger, char ***logs_p, int flags) { // get them into increasing order qsort(all_logs, all_n_logs, sizeof(all_logs[0]), logfilenamecompare); - LSN oldest_live_txn_lsn; - { - TXNID oldest_living_xid = toku_logger_get_oldest_living_xid(logger); - if (oldest_living_xid == TXNID_NONE_LIVING) - 
oldest_live_txn_lsn = MAX_LSN; - else - oldest_live_txn_lsn.lsn = oldest_living_xid; - } - - //printf("%s:%d Oldest txn is %lld\n", __FILE__, __LINE__, (long long)oldest_live_txn_lsn.lsn); + LSN save_lsn = logger->last_completed_checkpoint_lsn; // Now starting at the last one, look for archivable ones. // Count the total number of bytes, because we have to return a single big array. (That's the BDB interface. Bleah...) LSN earliest_lsn_in_logfile={(unsigned long long)(-1LL)}; r = peek_at_log(logger, all_logs[all_n_logs-1], &earliest_lsn_in_logfile); // try to find the lsn that's in the most recent log - if ((earliest_lsn_in_logfile.lsn <= logger->checkpoint_lsn.lsn)&& - (earliest_lsn_in_logfile.lsn <= oldest_live_txn_lsn.lsn)) { + if (earliest_lsn_in_logfile.lsn <= save_lsn.lsn) { i=all_n_logs-1; } else { for (i=all_n_logs-2; i>=0; i--) { // start at all_n_logs-2 because we never archive the most recent log r = peek_at_log(logger, all_logs[i], &earliest_lsn_in_logfile); if (r!=0) continue; // In case of error, just keep going - //printf("%s:%d file=%s firstlsn=%lld checkpoint_lsns={%lld %lld}\n", __FILE__, __LINE__, all_logs[i], (long long)earliest_lsn_in_logfile.lsn, (long long)logger->checkpoint_lsns[0].lsn, (long long)logger->checkpoint_lsns[1].lsn); - if ((earliest_lsn_in_logfile.lsn <= logger->checkpoint_lsn.lsn)&& - (earliest_lsn_in_logfile.lsn <= oldest_live_txn_lsn.lsn)) { + if (earliest_lsn_in_logfile.lsn <= save_lsn.lsn) { break; } } @@ -1148,7 +1204,7 @@ TOKUTXN toku_logger_txn_parent (TOKUTXN txn) { } void toku_logger_note_checkpoint(TOKULOGGER logger, LSN lsn) { - logger->checkpoint_lsn = lsn; + logger->last_completed_checkpoint_lsn = lsn; } TXNID toku_logger_get_oldest_living_xid(TOKULOGGER logger) { @@ -1158,17 +1214,6 @@ TXNID toku_logger_get_oldest_living_xid(TOKULOGGER logger) { return rval; } -LSN toku_logger_get_oldest_living_lsn(TOKULOGGER logger) { - LSN lsn = {0}; - if (logger) { - if (logger->oldest_living_xid == TXNID_NONE_LIVING) - lsn = 
MAX_LSN; - else - lsn.lsn = logger->oldest_living_xid; - } - return lsn; -} - LSN toku_logger_get_next_lsn(TOKULOGGER logger) { return logger->lsn; diff --git a/newbrt/logger.h b/newbrt/logger.h index 40326d0e356..7c8ac247959 100644 --- a/newbrt/logger.h +++ b/newbrt/logger.h @@ -5,12 +5,20 @@ #ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." -enum { TOKU_LOG_VERSION = 1 }; +enum { + TOKU_LOG_VERSION_1 = 1, + TOKU_LOG_VERSION_2 = 2, + TOKU_LOG_NEXT_VERSION, // the version after the current version + TOKU_LOG_VERSION = TOKU_LOG_NEXT_VERSION-1, // A hack so I don't have to change this line. +}; +#define ROLLBACK_CACHEFILE_NAME "tokudb.rollback" int toku_logger_create (TOKULOGGER *resultp); int toku_logger_open (const char *directory, TOKULOGGER logger); int toku_logger_shutdown(TOKULOGGER logger); int toku_logger_close(TOKULOGGER *loggerp); +int toku_logger_open_rollback(TOKULOGGER logger, CACHETABLE cachetable, BOOL create); +int toku_logger_close_rollback(TOKULOGGER logger, BOOL recovery_failed); int toku_logger_fsync (TOKULOGGER logger); void toku_logger_panic (TOKULOGGER logger, int err); @@ -49,6 +57,7 @@ int toku_fread_u_int32_t_nocrclen (FILE *f, u_int32_t *v); int toku_fread_u_int32_t (FILE *f, u_int32_t *v, struct x1764 *checksum, u_int32_t *len); int toku_fread_u_int64_t (FILE *f, u_int64_t *v, struct x1764 *checksum, u_int32_t *len); int toku_fread_LSN (FILE *f, LSN *lsn, struct x1764 *checksum, u_int32_t *len); +int toku_fread_BLOCKNUM (FILE *f, BLOCKNUM *lsn, struct x1764 *checksum, u_int32_t *len); int toku_fread_FILENUM (FILE *f, FILENUM *filenum, struct x1764 *checksum, u_int32_t *len); int toku_fread_TXNID (FILE *f, 
TXNID *txnid, struct x1764 *checksum, u_int32_t *len); int toku_fread_BYTESTRING (FILE *f, BYTESTRING *bs, struct x1764 *checksum, u_int32_t *len); @@ -58,6 +67,7 @@ int toku_logprint_LSN (FILE *outf, FILE *inf, const char *fieldname, struct x176 int toku_logprint_TXNID (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format __attribute__((__unused__))); int toku_logprint_u_int8_t (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format); int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format); +int toku_logprint_BLOCKNUM (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format); int toku_logprint_u_int64_t (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format); void toku_print_BYTESTRING (FILE *outf, u_int32_t len, char *data); int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format __attribute__((__unused__))); @@ -67,7 +77,6 @@ int toku_read_and_print_logmagic (FILE *f, u_int32_t *versionp); int toku_read_logmagic (FILE *f, u_int32_t *versionp); TXNID toku_txn_get_txnid (TOKUTXN txn); -LSN toku_txn_get_last_lsn (TOKUTXN txn); LSN toku_logger_last_lsn(TOKULOGGER logger); TOKULOGGER toku_txn_logger (TOKUTXN txn); @@ -81,7 +90,6 @@ TOKUTXN toku_logger_txn_parent (TOKUTXN txn); void toku_logger_note_checkpoint(TOKULOGGER logger, LSN lsn); TXNID toku_logger_get_oldest_living_xid(TOKULOGGER logger); -LSN toku_logger_get_oldest_living_lsn(TOKULOGGER logger); LSN toku_logger_get_next_lsn(TOKULOGGER logger); void toku_logger_set_remove_finalize_callback(TOKULOGGER logger, void (*funcp)(DICTIONARY_ID, void *), void * extra); void toku_logger_call_remove_finalize_callback(TOKULOGGER logger, DICTIONARY_ID dict_id); diff 
--git a/newbrt/memarena.c b/newbrt/memarena.c index e41a4e1a812..d2b46f9edd3 100644 --- a/newbrt/memarena.c +++ b/newbrt/memarena.c @@ -12,9 +12,9 @@ struct memarena { int n_other_bufs; }; -MEMARENA memarena_create (void) { - MEMARENA MALLOC(result); assert(result); - result->buf_size = 1024; +MEMARENA memarena_create_presized (size_t initial_size) { + MEMARENA XMALLOC(result); + result->buf_size = initial_size; result->buf_used = 0; result->other_bufs = NULL; result->size_of_other_bufs = 0; @@ -23,6 +23,10 @@ MEMARENA memarena_create (void) { return result; } +MEMARENA memarena_create (void) { + return memarena_create_presized(1024); +} + void memarena_clear (MEMARENA ma) { // Free the other bufs. int i; diff --git a/newbrt/memarena.h b/newbrt/memarena.h index 0ba9d3caa17..2928432a708 100644 --- a/newbrt/memarena.h +++ b/newbrt/memarena.h @@ -19,10 +19,11 @@ #include -typedef struct memarena *MEMARENA; +MEMARENA memarena_create_presized (size_t initial_size); +// Effect: Create a memarena with initial size. In case of ENOMEM, aborts. MEMARENA memarena_create (void); -// Effect: Create a memarena. In case of ENOMEM, aborts. +// Effect: Create a memarena with default initial size. In case of ENOMEM, aborts. void memarena_clear (MEMARENA ma); // Effect: Reset the internal state so that the allocated memory can be used again. 
diff --git a/newbrt/rbuf.h b/newbrt/rbuf.h index b704aa59bd5..63b473717bc 100644 --- a/newbrt/rbuf.h +++ b/newbrt/rbuf.h @@ -100,6 +100,18 @@ static inline BLOCKNUM rbuf_blocknum (struct rbuf *r) { BLOCKNUM result = make_blocknum(rbuf_longlong(r)); return result; } +static inline void rbuf_ma_BLOCKNUM (struct rbuf *r, MEMARENA ma __attribute__((__unused__)), BLOCKNUM *blocknum) { + *blocknum = rbuf_blocknum(r); +} + +static inline void rbuf_ma_u_int32_t (struct rbuf *r, MEMARENA ma __attribute__((__unused__)), u_int32_t *num) { + *num = rbuf_int(r); +} + +static inline void rbuf_ma_u_int64_t (struct rbuf *r, MEMARENA ma __attribute__((__unused__)), u_int64_t *num) { + *num = rbuf_ulonglong(r); +} + static inline void rbuf_TXNID (struct rbuf *r, TXNID *txnid) { *txnid = rbuf_ulonglong(r); @@ -119,7 +131,7 @@ static inline void rbuf_ma_FILENUM (struct rbuf *r, MEMARENA ma __attribute__((_ static inline void rbuf_BYTESTRING (struct rbuf *r, BYTESTRING *bs) { bs->len = rbuf_int(r); u_int32_t newndone = r->ndone + bs->len; - assert(newndone < r->size); + assert(newndone <= r->size); bs->data = toku_memdup(&r->buf[r->ndone], (size_t)bs->len); assert(bs->data); r->ndone = newndone; @@ -128,7 +140,7 @@ static inline void rbuf_BYTESTRING (struct rbuf *r, BYTESTRING *bs) { static inline void rbuf_ma_BYTESTRING (struct rbuf *r, MEMARENA ma, BYTESTRING *bs) { bs->len = rbuf_int(r); u_int32_t newndone = r->ndone + bs->len; - assert(newndone < r->size); + assert(newndone <= r->size); bs->data = memarena_memdup(ma, &r->buf[r->ndone], (size_t)bs->len); assert(bs->data); r->ndone = newndone; diff --git a/newbrt/recover.c b/newbrt/recover.c index 16740a91ba0..c06b7257a2e 100644 --- a/newbrt/recover.c +++ b/newbrt/recover.c @@ -23,19 +23,23 @@ struct scan_state { enum { BACKWARD_NEWER_CHECKPOINT_END = 1, BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END, - BACKWARD_OLDER_CHECKPOINT_BEGIN, - FORWARD_OLDER_CHECKPOINT_BEGIN, FORWARD_BETWEEN_CHECKPOINT_BEGIN_END, FORWARD_NEWER_CHECKPOINT_END, } ss; 
- LSN checkpoint_lsn; - uint64_t checkpoint_timestamp; - int n_live_txns; - TXNID oldest_txnid; + LSN checkpoint_begin_lsn; + LSN checkpoint_end_lsn; + uint64_t checkpoint_end_timestamp; + uint64_t checkpoint_begin_timestamp; + uint32_t checkpoint_num_fassociate; + uint32_t checkpoint_num_xstillopen; }; static void scan_state_init(struct scan_state *ss) { - ss->ss = BACKWARD_NEWER_CHECKPOINT_END; ss->checkpoint_lsn = ZERO_LSN; ss->n_live_txns = 0; ss->oldest_txnid = 0; + ss->ss = BACKWARD_NEWER_CHECKPOINT_END; + ss->checkpoint_begin_lsn = ZERO_LSN; + ss->checkpoint_end_lsn = ZERO_LSN; + ss->checkpoint_num_fassociate = 0; + ss->checkpoint_num_xstillopen = 0; } static const char *scan_state_strings[] = { @@ -85,7 +89,7 @@ static uint32_t file_map_get_num_dictionaries(struct file_map *fmap) { return toku_omt_size(fmap->filenums); } -static void file_map_close_dictionaries(struct file_map *fmap, BOOL recovery_succeeded) { +static void file_map_close_dictionaries(struct file_map *fmap, BOOL recovery_succeeded, TOKULOGGER logger) { int r; while (1) { @@ -106,15 +110,18 @@ static void file_map_close_dictionaries(struct file_map *fmap, BOOL recovery_suc //Logging is already back on. No need to pass LSN into close. 
char *error_string = NULL; DB *fake_db = tuple->brt->db; //Need to free the fake db that was malloced - r = toku_close_brt(tuple->brt, &error_string); - if (!recovery_succeeded) { - if (tokudb_recovery_trace) - fprintf(stderr, "%s:%d %d %s\n", __FUNCTION__, __LINE__, r, error_string); - assert(r != 0); - } else - assert(r == 0); - if (error_string) - toku_free(error_string); + if (logger->rollback_cachefile != tuple->brt->cf) { + //Rollback cachefile is closed manually at end of recovery, not here + r = toku_close_brt(tuple->brt, &error_string); + if (!recovery_succeeded) { + if (tokudb_recovery_trace) + fprintf(stderr, "%s:%d %d %s\n", __FUNCTION__, __LINE__, r, error_string); + assert(r != 0); + } else + assert(r == 0); + if (error_string) + toku_free(error_string); + } toku_free(fake_db); //Must free the DB after the brt is closed file_map_tuple_destroy(tuple); @@ -158,6 +165,7 @@ static int file_map_find(struct file_map *fmap, FILENUM fnum, struct file_map_tu assert(tuple->filenum.fileid == fnum.fileid); *file_map_tuple = tuple; } + else assert(r==DB_NOTFOUND); return r; } @@ -202,9 +210,11 @@ static int recover_env_init (RECOVER_ENV renv, brt_compare_func bt_compare, brt_ static void recover_env_cleanup (RECOVER_ENV renv, BOOL recovery_succeeded) { int r; - file_map_close_dictionaries(&renv->fmap, recovery_succeeded); + file_map_close_dictionaries(&renv->fmap, recovery_succeeded, renv->logger); file_map_destroy(&renv->fmap); + r = toku_logger_close_rollback(renv->logger, !recovery_succeeded); + assert(r==0); r = toku_logger_close(&renv->logger); assert(r == 0); @@ -234,8 +244,8 @@ abort_on_upgrade(DB* UU(pdb), } // Open the file if it is not already open. If it is already open, then do nothing. 
-static int internal_toku_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create, int mode, BYTESTRING *bs_iname, FILENUM filenum, u_int32_t treeflags, - u_int32_t descriptor_version, BYTESTRING* descriptor, int recovery_force_fcreate, TOKUTXN txn) { +static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create, int mode, BYTESTRING *bs_iname, FILENUM filenum, u_int32_t treeflags, + u_int32_t descriptor_version, BYTESTRING* descriptor, TOKUTXN txn) { int r; char *iname = fixup_fname(bs_iname); @@ -262,7 +272,7 @@ static int internal_toku_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_c r = toku_brt_set_descriptor(brt, descriptor_version, &descriptor_dbt, abort_on_upgrade); if (r!=0) goto close_brt; } - r = toku_brt_open_recovery(brt, iname, must_create, must_create, renv->ct, txn, fake_db, recovery_force_fcreate, filenum); + r = toku_brt_open_recovery(brt, iname, must_create, must_create, renv->ct, txn, fake_db, filenum); if (r != 0) { close_brt: ; @@ -280,93 +290,365 @@ static int internal_toku_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_c return 0; } -static int -maybe_do_fclose_during_recover_backward(RECOVER_ENV renv, FILENUM filenum, BYTESTRING *bs_iname) { - // close the tree - struct file_map_tuple *tuple = NULL; - int r = file_map_find(&renv->fmap, filenum, &tuple); - if (r == 0) { - char *iname = fixup_fname(bs_iname); - assert(strcmp(tuple->iname, iname) == 0); - toku_free(iname); - - struct scan_state *ss = &renv->ss; - assert(ss->ss == BACKWARD_OLDER_CHECKPOINT_BEGIN); - - //Must keep existing lsn. - //The only way this should be dirty, is if its doing a file-format upgrade. - //If not dirty, header will not be written. 
- DB *fake_db = tuple->brt->db; //Need to free the fake db that was malloced - r = toku_close_brt_lsn(tuple->brt, 0, TRUE, tuple->brt->h->checkpoint_lsn); - assert(r == 0); - toku_free(fake_db); //Must free the DB after the brt is closed - file_map_remove(&renv->fmap, filenum); +static int toku_recover_begin_checkpoint (struct logtype_begin_checkpoint *l, RECOVER_ENV renv) { + int r; + switch (renv->ss.ss) { + case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: + assert(l->lsn.lsn == renv->ss.checkpoint_begin_lsn.lsn); + r = 0; + break; + case FORWARD_NEWER_CHECKPOINT_END: + assert(l->lsn.lsn > renv->ss.checkpoint_end_lsn.lsn); + r = 0; // ignore it (log only has a begin checkpoint) + break; + default: + fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); + abort(); + break; } + return r; +} + +static int toku_recover_backward_begin_checkpoint (struct logtype_begin_checkpoint *l, RECOVER_ENV renv) { + int r; + time_t tnow = time(NULL); + fprintf(stderr, "%.24s Tokudb recovery bw_begin_checkpoint at %"PRIu64" timestamp %"PRIu64" (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, recover_state(renv)); + switch (renv->ss.ss) { + case BACKWARD_NEWER_CHECKPOINT_END: + r = 0; // incomplete checkpoint. Nothing to do. 
+ break; + case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: + assert(l->lsn.lsn == renv->ss.checkpoint_begin_lsn.lsn); + renv->ss.ss = FORWARD_BETWEEN_CHECKPOINT_BEGIN_END; + renv->ss.checkpoint_begin_timestamp = l->timestamp; + renv->goforward = TRUE; + tnow = time(NULL); + fprintf(stderr, "%.24s Tokudb recovery turning around at begin checkpoint %"PRIu64" time %"PRIu64"\n", + ctime(&tnow), l->lsn.lsn, + renv->ss.checkpoint_end_timestamp - renv->ss.checkpoint_begin_timestamp); + r = 0; + break; + default: + fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); + abort(); + break; + } + return r; +} + +static int toku_recover_end_checkpoint (struct logtype_end_checkpoint *l, RECOVER_ENV renv) { + int r; + switch (renv->ss.ss) { + case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: + assert(l->xid == renv->ss.checkpoint_begin_lsn.lsn); + assert(l->lsn.lsn == renv->ss.checkpoint_end_lsn.lsn); + assert(l->num_fassociate_entries == renv->ss.checkpoint_num_fassociate); + assert(l->num_xstillopen_entries == renv->ss.checkpoint_num_xstillopen); + renv->ss.ss = FORWARD_NEWER_CHECKPOINT_END; + r = 0; + break; + case FORWARD_NEWER_CHECKPOINT_END: + assert(0); + return 0; + default: + assert(0); + return 0; + } + return r; +} + +static int toku_recover_backward_end_checkpoint (struct logtype_end_checkpoint *l, RECOVER_ENV renv) { + time_t tnow = time(NULL); + fprintf(stderr, "%.24s Tokudb recovery bw_end_checkpoint at %"PRIu64" timestamp %"PRIu64" xid %"PRIu64" (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, l->xid, recover_state(renv)); + switch (renv->ss.ss) { + case BACKWARD_NEWER_CHECKPOINT_END: + renv->ss.ss = BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END; + renv->ss.checkpoint_begin_lsn.lsn = l->xid; + renv->ss.checkpoint_end_lsn.lsn = l->lsn.lsn; + renv->ss.checkpoint_end_timestamp = l->timestamp; + return 0; + case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: + fprintf(stderr, "Tokudb recovery %s:%d Should not see two end_checkpoint log 
entries without an intervening begin_checkpoint\n", __FILE__, __LINE__); + abort(); + default: + break; + } + fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); + abort(); +} + +static int toku_recover_fassociate (struct logtype_fassociate *l, RECOVER_ENV renv) { + struct file_map_tuple *tuple = NULL; + int r = file_map_find(&renv->fmap, l->filenum, &tuple); + char *fname = fixup_fname(&l->iname); + switch (renv->ss.ss) { + case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: + renv->ss.checkpoint_num_fassociate++; + assert(r==DB_NOTFOUND); //Not open + // open it if it exists + r = internal_recover_fopen_or_fcreate(renv, FALSE, 0, &l->iname, l->filenum, l->treeflags, 0, NULL, NULL); + if (r==0 && !strcmp(fname, ROLLBACK_CACHEFILE_NAME)) { + //Load rollback cachefile + r = file_map_find(&renv->fmap, l->filenum, &tuple); + assert(r==0); + renv->logger->rollback_cachefile = tuple->brt->cf; + } + break; + case FORWARD_NEWER_CHECKPOINT_END: + if (r == 0) { //IF it is open + // assert that the filenum maps to the correct iname + assert(strcmp(fname, tuple->iname) == 0); + } + r = 0; + break; + default: + assert(0); + return 0; + } + toku_free(fname); + return r; +} + +static int toku_recover_backward_fassociate (struct logtype_fassociate *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int +recover_transaction(TOKUTXN *txnp, TXNID xid, TXNID parentxid, TOKULOGGER logger) { + int r; + + // lookup the parent + TOKUTXN parent = NULL; + if (parentxid != TXNID_NONE) { + r = toku_txnid2txn(logger, parentxid, &parent); + assert(r == 0); + assert(parent!=NULL); + } + + // create a transaction and bind it to the transaction id + TOKUTXN txn = NULL; + { + //Verify it does not yet exist. 
+ r = toku_txnid2txn(logger, xid, &txn); + assert(r == 0); + assert(txn==NULL); + } + r = toku_txn_begin_with_xid(parent, &txn, logger, xid); + assert(r == 0); + if (txnp) *txnp = txn; + return 0; +} + +static int toku_recover_xstillopen (struct logtype_xstillopen *l, RECOVER_ENV UU(renv)) { + int r; + switch (renv->ss.ss) { + case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: { + renv->ss.checkpoint_num_xstillopen++; + TOKUTXN txn = NULL; + { //Create the transaction. + r = recover_transaction(&txn, l->xid, l->parentxid, renv->logger); + assert(r==0); + assert(txn!=NULL); + } + { //Recover rest of transaction. +#define COPY_TO_INFO(field) .field = l->field + struct txninfo info = { + COPY_TO_INFO(rollentry_raw_count), + .num_brts = 0, //Set afterwards + .open_brts = NULL, //Set afterwards + COPY_TO_INFO(force_fsync_on_commit), + COPY_TO_INFO(num_rollback_nodes), + COPY_TO_INFO(num_rollentries), + COPY_TO_INFO(spilled_rollback_head), + COPY_TO_INFO(spilled_rollback_tail), + COPY_TO_INFO(current_rollback) + }; +#undef COPY_TO_INFO + //Generate open_brts + BRT array[l->open_filenums.num]; //Allocate maximum possible requirement + info.open_brts = array; + uint32_t i; + for (i = 0; i < l->open_filenums.num; i++) { + //open_filenums.filenums[] + struct file_map_tuple *tuple = NULL; + r = file_map_find(&renv->fmap, l->open_filenums.filenums[i], &tuple); + if (r==0) { + info.open_brts[info.num_brts++] = tuple->brt; + } + else { + assert(r==DB_NOTFOUND); + } + } + r = toku_txn_load_txninfo(txn, &info); + assert(r==0); + } + break; + } + case FORWARD_NEWER_CHECKPOINT_END: { + // assert that the transaction exists + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0 && txn != NULL); + r = 0; + break; + } + default: + assert(0); + return 0; + } + return r; +} + +static int toku_recover_backward_xstillopen (struct logtype_xstillopen *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int toku_recover_suppress_rollback (struct 
logtype_suppress_rollback *UU(l), RECOVER_ENV UU(renv)) { + struct file_map_tuple *tuple = NULL; + int r = file_map_find(&renv->fmap, l->filenum, &tuple); + if (r==0) { + //File is open + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0); + assert(txn!=NULL); + struct brt_header *h = tuple->brt->h; + assert(h->txnid_that_created_or_locked_when_empty == TXNID_NONE || + h->txnid_that_created_or_locked_when_empty == l->xid); + h->txnid_that_created_or_locked_when_empty = l->xid; + } + return 0; +} + +static int toku_recover_backward_suppress_rollback (struct logtype_suppress_rollback *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int toku_recover_xbegin (struct logtype_xbegin *l, RECOVER_ENV renv) { + int r; + r = recover_transaction(NULL, l->lsn.lsn, l->parentxid, renv->logger); + return r; +} + +static int toku_recover_backward_xbegin (struct logtype_xbegin *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int toku_recover_xcommit (struct logtype_xcommit *l, RECOVER_ENV renv) { + int r; + + // find the transaction by transaction id + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0); + assert(txn!=NULL); + + // commit the transaction + r = toku_txn_commit_with_lsn(txn, TRUE, recover_yield, NULL, l->lsn, + NULL, NULL, + // No need to release locks during recovery. 
+ NULL, NULL, NULL); + assert(r == 0); + + // close the transaction + toku_txn_close_txn(txn); + + return 0; +} + +static int toku_recover_backward_xcommit (struct logtype_xcommit *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int toku_recover_xabort (struct logtype_xabort *l, RECOVER_ENV renv) { + int r; + + // find the transaction by transaction id + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0); + assert(txn!=NULL); + + // abort the transaction + r = toku_txn_abort_with_lsn(txn, recover_yield, NULL, l->lsn, NULL, NULL); + assert(r == 0); + + // close the transaction + toku_txn_close_txn(txn); + + return 0; +} + +static int toku_recover_backward_xabort (struct logtype_xabort *UU(l), RECOVER_ENV UU(renv)) { + // nothing return 0; } // fcreate is like fopen except that the file must be created. static int toku_recover_fcreate (struct logtype_fcreate *l, RECOVER_ENV renv) { - struct scan_state *ss = &renv->ss; int r; TOKUTXN txn = NULL; - r = toku_txnid2txn(renv->logger, l->txnid, &txn); + r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); // assert that filenum is closed struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->filenum, &tuple); - assert(r != 0); + assert(r==DB_NOTFOUND); - BOOL must_create; - if (ss->ss == FORWARD_OLDER_CHECKPOINT_BEGIN) { - must_create = FALSE; // do not create file if it does not exist - } else { - assert(txn != NULL); //Straddle txns cannot happen after checkpoint - assert(ss->ss == FORWARD_BETWEEN_CHECKPOINT_BEGIN_END || ss->ss == FORWARD_NEWER_CHECKPOINT_END); - must_create = TRUE; + assert(txn!=NULL); - // maybe unlink - char *iname = fixup_fname(&l->iname); - r = unlink(iname); - if (r != 0 && errno != ENOENT) { - fprintf(stderr, "Tokudb recovery %s:%d unlink %s %d\n", __FUNCTION__, __LINE__, iname, errno); - toku_free(iname); - return r; - } + //unlink if it exists (recreate from scratch). 
+ char *iname = fixup_fname(&l->iname); + r = unlink(iname); + if (r != 0 && errno != ENOENT) { + fprintf(stderr, "Tokudb recovery %s:%d unlink %s %d\n", __FUNCTION__, __LINE__, iname, errno); toku_free(iname); + return r; } + assert(strcmp(iname, ROLLBACK_CACHEFILE_NAME)); //Creation of rollback cachefile never gets logged. + toku_free(iname); - r = internal_toku_recover_fopen_or_fcreate(renv, must_create, l->mode, &l->iname, l->filenum, l->treeflags, l->descriptor_version, &l->descriptor, 1, txn); + BOOL must_create = TRUE; + r = internal_recover_fopen_or_fcreate(renv, must_create, l->mode, &l->iname, l->filenum, l->treeflags, l->descriptor_version, &l->descriptor, txn); return r; } -static int toku_recover_backward_fcreate (struct logtype_fcreate *l, RECOVER_ENV renv) { - int r = maybe_do_fclose_during_recover_backward(renv, l->filenum, &l->iname); - assert(r==0); +static int toku_recover_backward_fcreate (struct logtype_fcreate *UU(l), RECOVER_ENV UU(renv)) { + // nothing return 0; } static int toku_recover_fopen (struct logtype_fopen *l, RECOVER_ENV renv) { - // check if the file is already open by backward scan of fassociate + int r; + + // assert that filenum is closed struct file_map_tuple *tuple = NULL; - char *iname = fixup_fname(&l->iname); - int r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r == 0) { - // file is already opened by fassociate - assert(strcmp(tuple->iname, iname) == 0); - } else { - // file is not open, open it - r = internal_toku_recover_fopen_or_fcreate(renv, FALSE, 0, &l->iname, l->filenum, l->treeflags, 0, NULL, 0, NULL); + r = file_map_find(&renv->fmap, l->filenum, &tuple); + assert(r==DB_NOTFOUND); + + BOOL must_create = FALSE; + uint32_t descriptor_version = 0; + BYTESTRING *descriptor = NULL; + TOKUTXN txn = NULL; + char *fname = fixup_fname(&l->iname); + + if (strcmp(fname, ROLLBACK_CACHEFILE_NAME)) { + //Rollback cachefile can only be opened via fassociate. 
+ r = internal_recover_fopen_or_fcreate(renv, must_create, 0, &l->iname, l->filenum, l->treeflags, descriptor_version, descriptor, txn); } - toku_free(iname); + toku_free(fname); return r; } -static int toku_recover_backward_fopen (struct logtype_fopen *l, RECOVER_ENV renv) { - int r = maybe_do_fclose_during_recover_backward(renv, l->filenum, &l->iname); - assert(r==0); +static int toku_recover_backward_fopen (struct logtype_fopen *UU(l), RECOVER_ENV UU(renv)) { + // nothing return 0; } @@ -377,13 +659,16 @@ static int toku_recover_fclose (struct logtype_fclose *l, RECOVER_ENV renv) { if (r == 0) { // if file is open char *iname = fixup_fname(&l->iname); assert(strcmp(tuple->iname, iname) == 0); // verify that file_map has same iname as log entry - toku_free(iname); DB *fake_db = tuple->brt->db; //Need to free the fake db that was malloced - r = toku_close_brt_lsn(tuple->brt, 0, TRUE, l->lsn); - assert(r == 0); + if (strcmp(iname, ROLLBACK_CACHEFILE_NAME)) { + //Rollback cachefile is closed manually at end of recovery, not here + r = toku_close_brt_lsn(tuple->brt, 0, TRUE, l->lsn); + assert(r == 0); + } toku_free(fake_db); //Must free the DB after the brt is closed file_map_remove(&renv->fmap, l->filenum); + toku_free(iname); } return 0; } @@ -396,27 +681,25 @@ static int toku_recover_backward_fclose (struct logtype_fclose *UU(l), RECOVER_E // fdelete is a transactional file delete. static int toku_recover_fdelete (struct logtype_fdelete *l, RECOVER_ENV renv) { TOKUTXN txn = NULL; - int r = toku_txnid2txn(renv->logger, l->txnid, &txn); + int r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } + assert(txn != NULL); char *fixediname = fixup_fname(&l->iname); - { //Skip if does not exist. 
+ { //Only if it exists toku_struct_stat buf; r = toku_stat(fixediname, &buf); - if (r==-1 && errno==ENOENT) - goto cleanup; + if (r==0) { + // txn exists and file exists, so create fdelete rollback entry + DBT iname_dbt; + toku_fill_dbt(&iname_dbt, fixediname, strlen(fixediname)+1); + r = toku_brt_remove_on_commit(txn, &iname_dbt); + assert(r==0); + } + else { + assert(errno==ENOENT); + } } - // txn exists and file exists, so create fdelete rollback entry - DBT iname_dbt; - toku_fill_dbt(&iname_dbt, fixediname, strlen(fixediname)+1); - r = toku_brt_remove_on_commit(txn, &iname_dbt); - assert(r==0); -cleanup: toku_free(fixediname); return 0; } @@ -429,19 +712,14 @@ static int toku_recover_backward_fdelete (struct logtype_fdelete *UU(l), RECOVER static int toku_recover_tablelock_on_empty_table(struct logtype_tablelock_on_empty_table *l, RECOVER_ENV renv) { struct file_map_tuple *tuple = NULL; int r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything. - return 0; - } - TOKUTXN txn = NULL; - r = toku_txnid2txn(renv->logger, l->xid, &txn); - assert(r == 0); - if (txn != NULL) { - r = toku_brt_note_table_lock(tuple->brt, txn); + if (r==0) { + //Our work is only if it is open + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0); + assert(txn != NULL); + r = toku_brt_note_table_lock(tuple->brt, txn, TRUE); assert(r == 0); - } else { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin } return 0; } @@ -456,23 +734,17 @@ static int toku_recover_enq_insert (struct logtype_enq_insert *l, RECOVER_ENV re TOKUTXN txn = NULL; r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. 
- assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } + assert(txn!=NULL); struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything. - return 0; + if (r==0) { + //Maybe do the insertion if we found the cachefile. + DBT keydbt, valdbt; + toku_fill_dbt(&keydbt, l->key.data, l->key.len); + toku_fill_dbt(&valdbt, l->value.data, l->value.len); + r = toku_brt_maybe_insert(tuple->brt, &keydbt, &valdbt, txn, TRUE, l->lsn, FALSE, BRT_INSERT); + assert(r == 0); } - DBT keydbt, valdbt; - toku_fill_dbt(&keydbt, l->key.data, l->key.len); - toku_fill_dbt(&valdbt, l->value.data, l->value.len); - r = toku_brt_maybe_insert(tuple->brt, &keydbt, &valdbt, txn, TRUE, l->lsn, FALSE, BRT_INSERT); - assert(r == 0); - return 0; } @@ -486,23 +758,17 @@ static int toku_recover_enq_insert_no_overwrite (struct logtype_enq_insert_no_ov TOKUTXN txn = NULL; r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } + assert(txn!=NULL); struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything. - return 0; + if (r==0) { + //Maybe do the insertion if we found the cachefile. 
+ DBT keydbt, valdbt; + toku_fill_dbt(&keydbt, l->key.data, l->key.len); + toku_fill_dbt(&valdbt, l->value.data, l->value.len); + r = toku_brt_maybe_insert(tuple->brt, &keydbt, &valdbt, txn, TRUE, l->lsn, FALSE, BRT_INSERT_NO_OVERWRITE); + assert(r == 0); } - DBT keydbt, valdbt; - toku_fill_dbt(&keydbt, l->key.data, l->key.len); - toku_fill_dbt(&valdbt, l->value.data, l->value.len); - r = toku_brt_maybe_insert(tuple->brt, &keydbt, &valdbt, txn, TRUE, l->lsn, FALSE, BRT_INSERT_NO_OVERWRITE); - assert(r == 0); - return 0; } @@ -511,22 +777,65 @@ static int toku_recover_backward_enq_insert_no_overwrite (struct logtype_enq_ins return 0; } +static int toku_recover_enq_delete_both (struct logtype_enq_delete_both *l, RECOVER_ENV renv) { + int r; + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0); + assert(txn!=NULL); + struct file_map_tuple *tuple = NULL; + r = file_map_find(&renv->fmap, l->filenum, &tuple); + if (r==0) { + //Maybe do the deletion if we found the cachefile. + DBT keydbt, valdbt; + toku_fill_dbt(&keydbt, l->key.data, l->key.len); + toku_fill_dbt(&valdbt, l->value.data, l->value.len); + r = toku_brt_maybe_delete_both(tuple->brt, &keydbt, &valdbt, txn, TRUE, l->lsn); + assert(r == 0); + } + return 0; +} + +static int toku_recover_backward_enq_delete_both (struct logtype_enq_delete_both *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int toku_recover_enq_delete_any (struct logtype_enq_delete_any *l, RECOVER_ENV renv) { + int r; + TOKUTXN txn = NULL; + r = toku_txnid2txn(renv->logger, l->xid, &txn); + assert(r == 0); + assert(txn!=NULL); + struct file_map_tuple *tuple = NULL; + r = file_map_find(&renv->fmap, l->filenum, &tuple); + if (r==0) { + //Maybe do the deletion if we found the cachefile. 
+ DBT keydbt; + toku_fill_dbt(&keydbt, l->key.data, l->key.len); + r = toku_brt_maybe_delete(tuple->brt, &keydbt, txn, TRUE, l->lsn, FALSE); + assert(r == 0); + } + return 0; +} + +static int toku_recover_backward_enq_delete_any (struct logtype_enq_delete_any *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + static int toku_recover_enq_insert_multiple (struct logtype_enq_insert_multiple *l, RECOVER_ENV renv) { int r; TOKUTXN txn = NULL; r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } + assert(txn!=NULL); DB *src_db = NULL; { struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->src_filenum, &tuple); if (l->src_filenum.fileid == FILENUM_NONE.fileid) - assert(r!=0); + assert(r==DB_NOTFOUND); else { assert(r==0); //How do we continue if src_db is specified but missing? src_db = tuple->brt->db; @@ -544,26 +853,25 @@ static int toku_recover_enq_insert_multiple (struct logtype_enq_insert_multiple for (file = 0; file < l->dest_filenums.num; file++) { struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->dest_filenums.filenums[file], &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything for this file. - continue; - } - DB *db = tuple->brt->db; - r = renv->generate_row_for_put(db, src_db, &dest_key, &dest_val, &src_key, &src_val, NULL); - assert(r==0); - r = toku_brt_maybe_insert(tuple->brt, &dest_key, &dest_val, txn, TRUE, l->lsn, FALSE, BRT_INSERT); - assert(r == 0); - //flags==0 means generate_row_for_put callback changed it - //(and freed any memory necessary to do so) so that values are now stored - //in temporary memory that does not need to be freed. We need to continue - //using DB_DBT_REALLOC however. 
- if (dest_key.flags == 0) { - toku_init_dbt(&dest_key); - dest_key.flags = DB_DBT_REALLOC; - } - if (dest_val.flags == 0) { - toku_init_dbt(&dest_val); - dest_val.flags = DB_DBT_REALLOC; + if (r==0) { + // We found the cachefile. (maybe) Do the insert. + DB *db = tuple->brt->db; + r = renv->generate_row_for_put(db, src_db, &dest_key, &dest_val, &src_key, &src_val, NULL); + assert(r==0); + r = toku_brt_maybe_insert(tuple->brt, &dest_key, &dest_val, txn, TRUE, l->lsn, FALSE, BRT_INSERT); + assert(r == 0); + //flags==0 means generate_row_for_put callback changed it + //(and freed any memory necessary to do so) so that values are now stored + //in temporary memory that does not need to be freed. We need to continue + //using DB_DBT_REALLOC however. + if (dest_key.flags == 0) { + toku_init_dbt(&dest_key); + dest_key.flags = DB_DBT_REALLOC; + } + if (dest_val.flags == 0) { + toku_init_dbt(&dest_val); + dest_val.flags = DB_DBT_REALLOC; + } } } if (dest_key.data) toku_free(dest_key.data); //TODO: #2321 May need windows hack @@ -582,17 +890,13 @@ static int toku_recover_enq_delete_multiple (struct logtype_enq_delete_multiple TOKUTXN txn = NULL; r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } + assert(txn!=NULL); DB *src_db = NULL; { struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->src_filenum, &tuple); if (l->src_filenum.fileid == FILENUM_NONE.fileid) - assert(r!=0); + assert(r==DB_NOTFOUND); else { assert(r==0); //How do we continue if src_db is specified but missing? 
src_db = tuple->brt->db; @@ -608,20 +912,19 @@ static int toku_recover_enq_delete_multiple (struct logtype_enq_delete_multiple for (file = 0; file < l->dest_filenums.num; file++) { struct file_map_tuple *tuple = NULL; r = file_map_find(&renv->fmap, l->dest_filenums.filenums[file], &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything for this file. - continue; - } - DB *db = tuple->brt->db; - r = renv->generate_row_for_del(db, src_db, &dest_key, &src_key, &src_val, NULL); - assert(r==0); - r = toku_brt_maybe_delete(tuple->brt, &dest_key, txn, TRUE, l->lsn, FALSE); - assert(r == 0); - //flags==0 indicates the return values are stored in temporary memory that does - //not need to be freed. We need to continue using DB_DBT_REALLOC however. - if (dest_key.flags == 0) { - toku_init_dbt(&dest_key); - dest_key.flags = DB_DBT_REALLOC; + if (r==0) { + // We found the cachefile. (maybe) Do the delete. + DB *db = tuple->brt->db; + r = renv->generate_row_for_del(db, src_db, &dest_key, &src_key, &src_val, NULL); + assert(r==0); + r = toku_brt_maybe_delete(tuple->brt, &dest_key, txn, TRUE, l->lsn, FALSE); + assert(r == 0); + //flags==0 indicates the return values are stored in temporary memory that does + //not need to be freed. We need to continue using DB_DBT_REALLOC however. + if (dest_key.flags == 0) { + toku_init_dbt(&dest_key); + dest_key.flags = DB_DBT_REALLOC; + } } } if (dest_key.flags & DB_DBT_REALLOC && dest_key.data) toku_free(dest_key.data); //TODO: #2321 May need windows hack @@ -634,349 +937,6 @@ static int toku_recover_backward_enq_delete_multiple (struct logtype_enq_delete_ return 0; } -static int toku_recover_enq_delete_both (struct logtype_enq_delete_both *l, RECOVER_ENV renv) { - int r; - TOKUTXN txn = NULL; - r = toku_txnid2txn(renv->logger, l->xid, &txn); - assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. 
- assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } - struct file_map_tuple *tuple = NULL; - r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything. - return 0; - } - DBT keydbt, valdbt; - toku_fill_dbt(&keydbt, l->key.data, l->key.len); - toku_fill_dbt(&valdbt, l->value.data, l->value.len); - r = toku_brt_maybe_delete_both(tuple->brt, &keydbt, &valdbt, txn, TRUE, l->lsn); - assert(r == 0); - - return 0; -} - -static int toku_recover_backward_enq_delete_both (struct logtype_enq_delete_both *UU(l), RECOVER_ENV UU(renv)) { - // nothing - return 0; -} - -static int toku_recover_enq_delete_any (struct logtype_enq_delete_any *l, RECOVER_ENV renv) { - int r; - TOKUTXN txn = NULL; - r = toku_txnid2txn(renv->logger, l->xid, &txn); - assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } - struct file_map_tuple *tuple = NULL; - r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r!=0) { - // if we didn't find a cachefile, then we don't have to do anything. 
- return 0; - } - DBT keydbt; - toku_fill_dbt(&keydbt, l->key.data, l->key.len); - r = toku_brt_maybe_delete(tuple->brt, &keydbt, txn, TRUE, l->lsn, FALSE); - assert(r == 0); - - return 0; -} - -static int toku_recover_backward_enq_delete_any (struct logtype_enq_delete_any *UU(l), RECOVER_ENV UU(renv)) { - // nothing - return 0; -} - -static int toku_recover_begin_checkpoint (struct logtype_begin_checkpoint *l, RECOVER_ENV renv) { - switch (renv->ss.ss) { - case FORWARD_OLDER_CHECKPOINT_BEGIN: - assert(l->lsn.lsn <= renv->ss.checkpoint_lsn.lsn); - if (l->lsn.lsn == renv->ss.checkpoint_lsn.lsn) - renv->ss.ss = FORWARD_BETWEEN_CHECKPOINT_BEGIN_END; - return 0; - case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: - assert(0); - return 0; - case FORWARD_NEWER_CHECKPOINT_END: - assert(l->lsn.lsn > renv->ss.checkpoint_lsn.lsn); - return 0; // ignore it (log only has a begin checkpoint) - default: - break; - } - fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); - abort(); - // nothing - return 0; -} - -static int toku_recover_backward_begin_checkpoint (struct logtype_begin_checkpoint *l, RECOVER_ENV renv) { - time_t tnow = time(NULL); - fprintf(stderr, "%.24s Tokudb recovery bw_begin_checkpoint at %"PRIu64" timestamp %"PRIu64" (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, recover_state(renv)); - switch (renv->ss.ss) { - case BACKWARD_OLDER_CHECKPOINT_BEGIN: - assert(l->lsn.lsn < renv->ss.checkpoint_lsn.lsn); - return 0; // ignore it - case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: - assert(renv->ss.checkpoint_lsn.lsn == l->lsn.lsn); - if (renv->ss.n_live_txns==0) { - renv->ss.ss = FORWARD_OLDER_CHECKPOINT_BEGIN; - renv->goforward = TRUE; - tnow = time(NULL); - fprintf(stderr, "%.24s Tokudb recovery turning around at begin checkpoint %"PRIu64" time %"PRIu64"\n", - ctime(&tnow), l->lsn.lsn, renv->ss.checkpoint_timestamp - l->timestamp); - } else { - renv->ss.ss = BACKWARD_OLDER_CHECKPOINT_BEGIN; - tnow = time(NULL); - 
fprintf(stderr, "%.24s Tokudb recovery begin checkpoint at %"PRIu64" looking for %"PRIu64" time %"PRIu64". Scanning backwards through %"PRIu64" log entries.\n", - ctime(&tnow), l->lsn.lsn, renv->ss.oldest_txnid, renv->ss.checkpoint_timestamp - l->timestamp, l->lsn.lsn - renv->ss.oldest_txnid); - } - return 0; - case BACKWARD_NEWER_CHECKPOINT_END: - return 0; // incomplete checkpoint - default: - break; - } - fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); - abort(); -} - -static int toku_recover_end_checkpoint (struct logtype_end_checkpoint *l, RECOVER_ENV renv) { - switch (renv->ss.ss) { - case FORWARD_OLDER_CHECKPOINT_BEGIN: - assert(l->lsn.lsn < renv->ss.checkpoint_lsn.lsn); - return 0; - case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: - assert(l->txnid == renv->ss.checkpoint_lsn.lsn); - renv->ss.ss = FORWARD_NEWER_CHECKPOINT_END; - return 0; - case FORWARD_NEWER_CHECKPOINT_END: - assert(0); - return 0; - default: - assert(0); - return 0; - } -} - -static int toku_recover_backward_end_checkpoint (struct logtype_end_checkpoint *l, RECOVER_ENV renv) { - time_t tnow = time(NULL); - fprintf(stderr, "%.24s Tokudb recovery bw_end_checkpoint at %"PRIu64" timestamp %"PRIu64" txnid %"PRIu64" (%s)\n", ctime(&tnow), l->lsn.lsn, l->timestamp, l->txnid, recover_state(renv)); - switch (renv->ss.ss) { - case BACKWARD_OLDER_CHECKPOINT_BEGIN: - return 0; - case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: - fprintf(stderr, "Tokudb recovery %s:%d Should not see two end_checkpoint log entries without an intervening begin_checkpoint\n", __FILE__, __LINE__); - abort(); - case BACKWARD_NEWER_CHECKPOINT_END: - renv->ss.ss = BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END; - renv->ss.checkpoint_lsn.lsn = l->txnid; - renv->ss.checkpoint_timestamp = l->timestamp; - return 0; - default: - break; - } - fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); - abort(); -} - -static int 
toku_recover_fassociate (struct logtype_fassociate *l, RECOVER_ENV renv) { - switch (renv->ss.ss) { - case FORWARD_OLDER_CHECKPOINT_BEGIN: - return 0; - case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: - case FORWARD_NEWER_CHECKPOINT_END: { - struct file_map_tuple *tuple = NULL; - int r = file_map_find(&renv->fmap, l->filenum, &tuple); - if (r == 0) { - // assert that the filenum maps to the correct iname - char *fname = fixup_fname(&l->iname); - assert(strcmp(fname, tuple->iname) == 0); - toku_free(fname); - } - return 0; - } - default: - assert(0); - return 0; - } -} - -static int toku_recover_backward_fassociate (struct logtype_fassociate *l, RECOVER_ENV renv) { - switch (renv->ss.ss) { - case BACKWARD_OLDER_CHECKPOINT_BEGIN: - return 0; - case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: { - // assert that the filenum is not open - struct file_map_tuple *tuple = NULL; - int r = file_map_find(&renv->fmap, l->filenum, &tuple); - assert(r != 0); - - // open it - r = internal_toku_recover_fopen_or_fcreate(renv, FALSE, 0, &l->iname, l->filenum, l->treeflags, 0, NULL, 0, NULL); - return r; - } - case BACKWARD_NEWER_CHECKPOINT_END: - return 0; - default: - assert(0); - return 0; - } -} - -static int toku_recover_xstillopen (struct logtype_xstillopen *UU(l), RECOVER_ENV UU(renv)) { - switch (renv->ss.ss) { - case FORWARD_OLDER_CHECKPOINT_BEGIN: - return 0; - case FORWARD_BETWEEN_CHECKPOINT_BEGIN_END: - case FORWARD_NEWER_CHECKPOINT_END: { - // assert that the transaction exists - TOKUTXN txn = NULL; - int r = toku_txnid2txn(renv->logger, l->txnid, &txn); - assert(r == 0 && txn != NULL); - return 0; - } - default: - assert(0); - return 0; - } -} - -static int toku_recover_backward_xstillopen (struct logtype_xstillopen *l, RECOVER_ENV renv) { - switch (renv->ss.ss) { - case BACKWARD_OLDER_CHECKPOINT_BEGIN: - return 0; // ignore live txns from older checkpoints - case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: - assert(l->txnid < renv->ss.checkpoint_lsn.lsn); - if (renv->ss.n_live_txns 
== 0) - renv->ss.oldest_txnid = l->txnid; - else if (toku_txnid_older(l->txnid, renv->ss.oldest_txnid)) - renv->ss.oldest_txnid = l->txnid; - renv->ss.n_live_txns++; - return 0; - case BACKWARD_NEWER_CHECKPOINT_END: - return 0; // ignore live txns from incomplete checkpoint - default: - break; - } - fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)renv->ss.ss); - abort(); -} - -static int toku_recover_xbegin (struct logtype_xbegin *l, RECOVER_ENV renv) { - int r; - - // lookup the parent - TOKUTXN parent = NULL; - if (l->parenttxnid != 0) { - r = toku_txnid2txn(renv->logger, l->parenttxnid, &parent); - assert(r == 0); - if (parent == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } - } - - // create a transaction and bind it to the transaction id - TOKUTXN txn = NULL; - r = toku_txn_begin_with_xid(parent, &txn, renv->logger, l->lsn.lsn); - assert(r == 0); - return 0; -} - -static int toku_recover_backward_xbegin (struct logtype_xbegin *l, RECOVER_ENV renv) { - struct scan_state *ss = &renv->ss; - switch (ss->ss) { - case BACKWARD_OLDER_CHECKPOINT_BEGIN: - assert(ss->n_live_txns > 0); // the only thing we are doing here is looking for a live txn, so there better be one - assert(ss->oldest_txnid <= l->lsn.lsn); //Did not pass it. 
- if (ss->oldest_txnid == l->lsn.lsn) { - renv->goforward = TRUE; - renv->ss.ss = FORWARD_OLDER_CHECKPOINT_BEGIN; - time_t tnow = time(NULL); - fprintf(stderr, "%.24s Tokudb recovery turning around at xbegin %" PRIu64 " live txns=%d (%s)\n", ctime(&tnow), l->lsn.lsn, renv->ss.n_live_txns, recover_state(renv)); - } else { - if (tokudb_recovery_trace) - fprintf(stderr, "Tokudb recovery scanning back at xbegin %" PRIu64 " looking for %" PRIu64 " (%s)\n", l->lsn.lsn, ss->oldest_txnid, recover_state(renv)); - } - return 0; - case BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END: - assert(l->lsn.lsn > renv->ss.checkpoint_lsn.lsn); - return 0; // ignore txns that began during the checkpoint - case BACKWARD_NEWER_CHECKPOINT_END: - return 0; // ignore txns that began after checkpoint - default: - break; - } - fprintf(stderr, "Tokudb recovery %s: %d Unknown checkpoint state %d\n", __FILE__, __LINE__, (int)ss->ss); - abort(); -} - -static int toku_recover_commit (struct logtype_commit *l, RECOVER_ENV renv) { - int r; - - // find the transaction by transaction id - TOKUTXN txn = NULL; - r = toku_txnid2txn(renv->logger, l->txnid, &txn); - assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); // cannot happen after checkpoint begin - return 0; - } - - // commit the transaction - r = toku_txn_commit_with_lsn(txn, TRUE, recover_yield, NULL, l->lsn, - NULL, NULL, - // No need to release locks during recovery. - NULL, NULL, NULL); - assert(r == 0); - - // close the transaction - toku_txn_close_txn(txn); - - return 0; -} - -static int toku_recover_backward_commit (struct logtype_commit *UU(l), RECOVER_ENV UU(renv)) { - // nothing - return 0; -} - -static int toku_recover_xabort (struct logtype_xabort *l, RECOVER_ENV renv) { - int r; - - // find the transaction by transaction id - TOKUTXN txn = NULL; - r = toku_txnid2txn(renv->logger, l->txnid, &txn); - assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. 
- assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); // cannot happen after checkpoint begin - return 0; - } - - // abort the transaction - r = toku_txn_abort_with_lsn(txn, recover_yield, NULL, l->lsn, NULL, NULL); - assert(r == 0); - - // close the transaction - toku_txn_close_txn(txn); - - return 0; -} - -static int toku_recover_backward_xabort (struct logtype_xabort *UU(l), RECOVER_ENV UU(renv)) { - // nothing - return 0; -} - static int toku_recover_comment (struct logtype_comment *UU(l), RECOVER_ENV UU(renv)) { // nothing return 0; @@ -992,11 +952,7 @@ static int toku_recover_load(struct logtype_load *UU(l), RECOVER_ENV UU(renv)) { TOKUTXN txn = NULL; r = toku_txnid2txn(renv->logger, l->xid, &txn); assert(r == 0); - if (txn == NULL) { - //This is a straddle txn. - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN); //cannot happen after checkpoint begin - return 0; - } + assert(txn!=NULL); char *old_iname = fixup_fname(&l->old_iname); char *new_iname = fixup_fname(&l->new_iname); @@ -1013,35 +969,6 @@ static int toku_recover_backward_load(struct logtype_load *UU(l), RECOVER_ENV UU return 0; } -static int toku_delete_rolltmp_files (const char *log_dir) { - struct dirent *de; - DIR *d = opendir(log_dir); - if (d==0) { - return errno; - } - int result = 0; - while ((de=readdir(d))) { - char rolltmp_prefix[] = "__tokudb_rolltmp."; - int r = memcmp(de->d_name, rolltmp_prefix, sizeof(rolltmp_prefix) - 1); - if (r == 0) { - int fnamelen = strlen(log_dir) + strlen(de->d_name) + 2; // One for the slash and one for the trailing NUL. 
- char fname[fnamelen]; - int l = snprintf(fname, fnamelen, "%s/%s", log_dir, de->d_name); - assert(l+1 == fnamelen); - r = unlink(fname); - if (r!=0) { - result = errno; - perror("Trying to delete a rolltmp file"); - } - } - } - { - int r = closedir(d); - if (r==-1) return errno; - } - return result; -} - // Effects: If there are no log files, or if there is a "clean" checkpoint at the end of the log, // then we don't need recovery to run. Skip the shutdown log entry if there is one. // Returns: TRUE if we need recovery, otherwise FALSE. @@ -1078,7 +1005,7 @@ int tokudb_needs_recovery(const char *log_dir, BOOL ignore_log_empty) { if (r != 0 || le->cmd != LT_begin_checkpoint) { needs_recovery = TRUE; goto exit; } - if (le->u.begin_checkpoint.lsn.lsn != end_checkpoint.u.end_checkpoint.txnid) { + if (le->u.begin_checkpoint.lsn.lsn != end_checkpoint.u.end_checkpoint.xid) { needs_recovery = TRUE; goto exit; } needs_recovery = FALSE; @@ -1207,8 +1134,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di } // dispatch the log entry handler - assert(renv->ss.ss == BACKWARD_OLDER_CHECKPOINT_BEGIN || - renv->ss.ss == BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END || + assert(renv->ss.ss == BACKWARD_BETWEEN_CHECKPOINT_BEGIN_END || renv->ss.ss == BACKWARD_NEWER_CHECKPOINT_END); logtype_dispatch_assign(le, toku_recover_backward_, r, renv); if (tokudb_recovery_trace) @@ -1246,8 +1172,7 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di } // dispatch the log entry handler (first time calls the forward handler for the log entry at the turnaround - assert(renv->ss.ss == FORWARD_OLDER_CHECKPOINT_BEGIN || - renv->ss.ss == FORWARD_BETWEEN_CHECKPOINT_BEGIN_END || + assert(renv->ss.ss == FORWARD_BETWEEN_CHECKPOINT_BEGIN_END || renv->ss.ss == FORWARD_NEWER_CHECKPOINT_END); logtype_dispatch_assign(le, toku_recover_, r, renv); if (tokudb_recovery_trace) @@ -1299,7 +1224,7 @@ static int do_recovery(RECOVER_ENV renv, const char 
*env_dir, const char *log_di tnow = time(NULL); fprintf(stderr, "%.24s Tokudb recovery closing %"PRIu32" dictionar%s\n", ctime(&tnow), n, n > 1 ? "ies" : "y"); } - file_map_close_dictionaries(&renv->fmap, TRUE); + file_map_close_dictionaries(&renv->fmap, TRUE, renv->logger); // write a recovery log entry BYTESTRING recover_comment = { strlen("recover"), "recover" }; @@ -1360,22 +1285,6 @@ static int recover_unlock(int lockfd) { return 0; } -int tokudb_recover_delete_rolltmp_files(const char *UU(data_dir), const char *log_dir) { - int lockfd = -1; - - int r = recover_lock(log_dir, &lockfd); - if (r != 0) - return r; - - r = toku_delete_rolltmp_files(log_dir); - - int rr = recover_unlock(lockfd); - if (r == 0 && rr != 0) - r = rr; - - return r; -} - int tokudb_recover(const char *env_dir, const char *log_dir, brt_compare_func bt_compare, brt_compare_func dup_compare, @@ -1389,12 +1298,6 @@ int tokudb_recover(const char *env_dir, const char *log_dir, if (r != 0) return r; - r = toku_delete_rolltmp_files(log_dir); - if (r != 0) { - (void) recover_unlock(lockfd); - return r; - } - int rr = 0; if (tokudb_needs_recovery(log_dir, FALSE)) { struct recover_env renv; diff --git a/newbrt/recover.h b/newbrt/recover.h index 5daa8a312b5..37c802a2ebb 100644 --- a/newbrt/recover.h +++ b/newbrt/recover.h @@ -11,7 +11,6 @@ #include "../include/db.h" #include "brttypes.h" #include "memory.h" -#include "bread.h" #include "x1764.h" // Run tokudb recovery from the log @@ -29,10 +28,6 @@ int tokudb_recover (const char *env_dir, const char *log_dir, // Returns: TRUE if we need recovery, otherwise FALSE. 
int tokudb_needs_recovery(const char *logdir, BOOL ignore_empty_log); -// Delete the rolltmp files -// Ruturns 0 if success -int tokudb_recover_delete_rolltmp_files(const char *datadir, const char *logdir); - // Return 0 if recovery log exists, ENOENT if log is missing int tokudb_recover_log_exists(const char * log_dir); diff --git a/newbrt/roll.c b/newbrt/roll.c index 257abe87cd2..4451e162fab 100644 --- a/newbrt/roll.c +++ b/newbrt/roll.c @@ -12,11 +12,11 @@ int toku_commit_fdelete (u_int8_t file_was_open, - FILENUM filenum, // valid if file_was_open - BYTESTRING bs_fname, // cwd/iname - TOKUTXN txn, - YIELDF UU(yield), - void *UU(yield_v), + FILENUM filenum, // valid if file_was_open + BYTESTRING bs_fname, // cwd/iname + TOKUTXN txn, + YIELDF UU(yield), + void *UU(yield_v), LSN UU(oplsn)) //oplsn is the lsn of the commit { //TODO: #2037 verify the file is (user) closed @@ -24,18 +24,23 @@ toku_commit_fdelete (u_int8_t file_was_open, CACHEFILE cf; int r; if (file_was_open) { // file was open when toku_brt_remove_on_commit() was called - r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf); - assert(r == 0); // must still be open (toku_brt_remove_on_commit() incremented refcount) - { - (void)toku_cachefile_get_and_pin_fd(cf); - assert(!toku_cachefile_is_dev_null_unlocked(cf)); - struct brt_header *h = toku_cachefile_get_userdata(cf); - DICTIONARY_ID dict_id = h->dict_id; - toku_logger_call_remove_finalize_callback(txn->logger, dict_id); + r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf); + if (r==ENOENT) { //Missing file on recovered transaction is not an error + assert(txn->recovered_from_checkpoint); + r = 0; + goto done; + } + assert(r == 0); // must still be open (toku_brt_remove_on_commit() incremented refcount) + { + (void)toku_cachefile_get_and_pin_fd(cf); + assert(!toku_cachefile_is_dev_null_unlocked(cf)); + struct brt_header *h = toku_cachefile_get_userdata(cf); + DICTIONARY_ID dict_id = h->dict_id; + 
toku_logger_call_remove_finalize_callback(txn->logger, dict_id); toku_cachefile_unpin_fd(cf); - } - r = toku_cachefile_redirect_nullfd(cf); - assert(r==0); + } + r = toku_cachefile_redirect_nullfd(cf); + assert(r==0); } char *fname_in_env = fixup_fname(&bs_fname); char *fname_in_cwd = toku_cachetable_get_fname_in_cwd(txn->logger->ct, fname_in_env); @@ -44,16 +49,17 @@ toku_commit_fdelete (u_int8_t file_was_open, assert(r==0 || errno==ENOENT); toku_free(fname_in_env); toku_free(fname_in_cwd); +done: return 0; } int toku_rollback_fdelete (u_int8_t UU(file_was_open), FILENUM UU(filenum), - BYTESTRING UU(bs_fname), - TOKUTXN UU(txn), - YIELDF UU(yield), - void* UU(yield_v), + BYTESTRING UU(bs_fname), + TOKUTXN UU(txn), + YIELDF UU(yield), + void* UU(yield_v), LSN UU(oplsn)) //oplsn is the lsn of the abort { //Rolling back an fdelete is an no-op. @@ -62,10 +68,10 @@ toku_rollback_fdelete (u_int8_t UU(file_was_open), int toku_commit_fcreate (FILENUM UU(filenum), - BYTESTRING UU(bs_fname), - TOKUTXN UU(txn), - YIELDF UU(yield), - void *UU(yield_v), + BYTESTRING UU(bs_fname), + TOKUTXN UU(txn), + YIELDF UU(yield), + void *UU(yield_v), LSN UU(oplsn)) { return 0; @@ -73,10 +79,10 @@ toku_commit_fcreate (FILENUM UU(filenum), int toku_rollback_fcreate (FILENUM filenum, - BYTESTRING bs_fname, // cwd/iname - TOKUTXN txn, - YIELDF UU(yield), - void* UU(yield_v), + BYTESTRING bs_fname, // cwd/iname + TOKUTXN txn, + YIELDF UU(yield), + void* UU(yield_v), LSN UU(oplsn)) { //TODO: #2037 verify the file is (user) closed @@ -84,13 +90,18 @@ toku_rollback_fcreate (FILENUM filenum, //Remove reference to the fd in the cachetable CACHEFILE cf = NULL; int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf); + if (r==ENOENT) { //Missing file on recovered transaction is not an error + assert(txn->recovered_from_checkpoint); + r = 0; + goto done; + } assert(r == 0); { (void)toku_cachefile_get_and_pin_fd(cf); - assert(!toku_cachefile_is_dev_null_unlocked(cf)); - struct brt_header *h = 
toku_cachefile_get_userdata(cf); - DICTIONARY_ID dict_id = h->dict_id; - toku_logger_call_remove_finalize_callback(txn->logger, dict_id); + assert(!toku_cachefile_is_dev_null_unlocked(cf)); + struct brt_header *h = toku_cachefile_get_userdata(cf); + DICTIONARY_ID dict_id = h->dict_id; + toku_logger_call_remove_finalize_callback(txn->logger, dict_id); toku_cachefile_unpin_fd(cf); } r = toku_cachefile_redirect_nullfd(cf); @@ -103,6 +114,7 @@ toku_rollback_fcreate (FILENUM filenum, assert(r==0 || errno==ENOENT); toku_free(fname_in_env); toku_free(fname_in_cwd); +done: return 0; } @@ -119,6 +131,11 @@ static int do_insertion (enum brt_msg_type type, FILENUM filenum, BYTESTRING key CACHEFILE cf; //printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data); int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf); + if (r==ENOENT) { //Missing file on recovered transaction is not an error + assert(txn->recovered_from_checkpoint); + r = 0; + goto done; + } assert(r==0); (void)toku_cachefile_get_and_pin_fd(cf); @@ -146,6 +163,7 @@ static int do_insertion (enum brt_msg_type type, FILENUM filenum, BYTESTRING key } cleanup: toku_cachefile_unpin_fd(cf); +done: return r; } @@ -166,11 +184,11 @@ int toku_commit_cmdinsert (FILENUM filenum, BYTESTRING key, TOKUTXN txn, YIELDF int toku_commit_cmdinsertboth (FILENUM filenum, - BYTESTRING key, - BYTESTRING data, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + BYTESTRING data, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { #if TOKU_DO_COMMIT_CMD_INSERT @@ -183,10 +201,10 @@ toku_commit_cmdinsertboth (FILENUM filenum, int toku_rollback_cmdinsert (FILENUM filenum, - BYTESTRING key, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { return do_insertion (BRT_ABORT_ANY, filenum, key, 0, txn, oplsn); @@ -194,11 +212,11 @@ toku_rollback_cmdinsert (FILENUM filenum, int 
toku_rollback_cmdinsertboth (FILENUM filenum, - BYTESTRING key, - BYTESTRING data, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + BYTESTRING data, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { return do_insertion (BRT_ABORT_BOTH, filenum, key, &data, txn, oplsn); @@ -206,11 +224,11 @@ toku_rollback_cmdinsertboth (FILENUM filenum, int toku_commit_cmddeleteboth (FILENUM filenum, - BYTESTRING key, - BYTESTRING data, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + BYTESTRING data, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { #if TOKU_DO_COMMIT_CMD_DELETE_BOTH @@ -223,11 +241,11 @@ toku_commit_cmddeleteboth (FILENUM filenum, int toku_rollback_cmddeleteboth (FILENUM filenum, - BYTESTRING key, - BYTESTRING data, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + BYTESTRING data, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { return do_insertion (BRT_ABORT_BOTH, filenum, key, &data, txn, oplsn); @@ -235,10 +253,10 @@ toku_rollback_cmddeleteboth (FILENUM filenum, int toku_commit_cmddelete (FILENUM filenum, - BYTESTRING key, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { #if TOKU_DO_COMMIT_CMD_DELETE @@ -251,113 +269,114 @@ toku_commit_cmddelete (FILENUM filenum, int toku_rollback_cmddelete (FILENUM filenum, - BYTESTRING key, - TOKUTXN txn, - YIELDF UU(yield), - void * UU(yieldv), + BYTESTRING key, + TOKUTXN txn, + YIELDF UU(yield), + void * UU(yieldv), LSN oplsn) { return do_insertion (BRT_ABORT_ANY, filenum, key, 0, txn, oplsn); } -int -toku_commit_fileentries (int fd, - TOKUTXN txn, - YIELDF yield, - void * yieldv, - LSN oplsn) -{ - BREAD f = create_bread_from_fd_initialize_at(fd); - int r=0; - MEMARENA ma = memarena_create(); +static int +toku_apply_rollinclude (TXNID xid, + uint64_t num_nodes, + BLOCKNUM spilled_head, + 
uint32_t spilled_head_hash, + BLOCKNUM spilled_tail, + uint32_t spilled_tail_hash, + TOKUTXN txn, + YIELDF yield, + void * yieldv, + LSN oplsn, + apply_rollback_item func) { + int r; + struct roll_entry *item; int count=0; - while (bread_has_more(f)) { - struct roll_entry *item; - r = toku_read_rollback_backwards(f, &item, ma); - if (r!=0) goto finish; - r = toku_commit_rollback_item(txn, item, yield, yieldv, oplsn); - if (r!=0) goto finish; - memarena_clear(ma); - count++; - if (count%2==0) yield(NULL, yieldv); + + BLOCKNUM next_log = spilled_tail; + uint32_t next_log_hash = spilled_tail_hash; + uint64_t last_sequence = num_nodes; + + BOOL found_head = FALSE; + assert(next_log.b != ROLLBACK_NONE.b); + while (next_log.b != ROLLBACK_NONE.b) { + ROLLBACK_LOG_NODE log; + //pin log + r = toku_get_and_pin_rollback_log(txn, xid, last_sequence - 1, next_log, next_log_hash, &log); + assert(r==0); + last_sequence = log->sequence; + + while ((item=log->newest_logentry)) { + log->newest_logentry = item->prev; + r = func(txn, item, yield, yieldv, oplsn); + if (r!=0) return r; + count++; + if (count%2 == 0) yield(NULL, yieldv); + } + if (next_log.b == spilled_head.b) { + assert(!found_head); + found_head = TRUE; + assert(log->sequence == 0); + } + next_log = log->older; + next_log_hash = log->older_hash; + { + //Clean up transaction structure to prevent + //toku_txn_close from double-freeing + spilled_tail = next_log; + spilled_tail_hash = next_log_hash; + if (found_head) { + assert(next_log.b == ROLLBACK_NONE.b); + spilled_head = next_log; + spilled_head_hash = next_log_hash; + } + } + //Unpins log + r = toku_delete_rollback_log(txn, log); + assert(r==0); } - finish: - { int r2 = close_bread_without_closing_fd(f); assert(r2==0); } - memarena_close(&ma); return r; } int -toku_rollback_fileentries (int fd, - TOKUTXN txn, - YIELDF yield, - void * yieldv, - LSN oplsn) -{ - BREAD f = create_bread_from_fd_initialize_at(fd); - assert(f); - int r=0; - MEMARENA ma = memarena_create(); 
- int count=0; - while (bread_has_more(f)) { - struct roll_entry *item; - r = toku_read_rollback_backwards(f, &item, ma); - if (r!=0) goto finish; - r = toku_abort_rollback_item(txn, item, yield, yieldv, oplsn); - if (r!=0) goto finish; - memarena_clear(ma); - count++; - if (count%2==0) yield(NULL, yieldv); - } - finish: - { int r2 = close_bread_without_closing_fd(f); assert(r2==0); } - memarena_close(&ma); - return r; -} - -int -toku_commit_rollinclude (BYTESTRING bs, - TOKUTXN txn, - YIELDF yield, - void * yieldv, +toku_commit_rollinclude (TXNID xid, + uint64_t num_nodes, + BLOCKNUM spilled_head, + uint32_t spilled_head_hash, + BLOCKNUM spilled_tail, + uint32_t spilled_tail_hash, + TOKUTXN txn, + YIELDF yield, + void * yieldv, LSN oplsn) { int r; - char *fname_in_logger = fixup_fname(&bs); - char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, fname_in_logger); - int fd = open(fname_in_cwd, O_RDONLY+O_BINARY); - assert(fd>=0); - r = toku_commit_fileentries(fd, txn, yield, yieldv, oplsn); - assert(r==0); - r = close(fd); - assert(r==0); - r = unlink(fname_in_cwd); - assert(r==0); - toku_free(fname_in_logger); - toku_free(fname_in_cwd); - return 0; + r = toku_apply_rollinclude(xid, num_nodes, + spilled_head, spilled_head_hash, + spilled_tail, spilled_tail_hash, + txn, yield, yieldv, oplsn, + toku_commit_rollback_item); + return r; } int -toku_rollback_rollinclude (BYTESTRING bs, - TOKUTXN txn, - YIELDF yield, - void * yieldv, - LSN oplsn) -{ +toku_rollback_rollinclude (TXNID xid, + uint64_t num_nodes, + BLOCKNUM spilled_head, + uint32_t spilled_head_hash, + BLOCKNUM spilled_tail, + uint32_t spilled_tail_hash, + TOKUTXN txn, + YIELDF yield, + void * yieldv, + LSN oplsn) { int r; - char *fname_in_logger = fixup_fname(&bs); - char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, fname_in_logger); - int fd = open(fname_in_cwd, O_RDONLY+O_BINARY); - assert(fd>=0); - r = toku_rollback_fileentries(fd, txn, yield, yieldv, oplsn); - 
assert(r==0); - r = close(fd); - assert(r==0); - r = unlink(fname_in_cwd); - assert(r==0); - toku_free(fname_in_logger); - toku_free(fname_in_cwd); - return 0; + r = toku_apply_rollinclude(xid, num_nodes, + spilled_head, spilled_head_hash, + spilled_tail, spilled_tail_hash, + txn, yield, yieldv, oplsn, + toku_abort_rollback_item); + return r; } int @@ -365,7 +384,7 @@ toku_rollback_tablelock_on_empty_table (FILENUM filenum, TOKUTXN txn, YIELDF yield, void* yield_v, - LSN UU(oplsn)) + LSN oplsn) { //TODO: Replace truncate function with something that doesn't need to mess with checkpoints. // on rollback we have to make the file be empty, since we locked an empty table, and then may have done things to it. @@ -373,22 +392,33 @@ toku_rollback_tablelock_on_empty_table (FILENUM filenum, CACHEFILE cf; //printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data); int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf); + if (r==ENOENT) { //Missing file on recovered transaction is not an error + assert(txn->recovered_from_checkpoint); + r = 0; + goto done; + } assert(r==0); OMTVALUE brtv=NULL; r = toku_omt_find_zero(txn->open_brts, find_brt_from_filenum, &filenum, &brtv, NULL, NULL); - if (r==0) { - // If r!=0 it could be because we grabbed a log on an empty table that doesn't even exist, and we never put anything into it. - // So, just don't do anything in this case. - BRT brt = brtv; - toku_poll_txn_progress_function(txn, FALSE, TRUE); - yield(toku_checkpoint_safe_client_lock, yield_v); - toku_poll_txn_progress_function(txn, FALSE, FALSE); - r = toku_brt_truncate(brt); - assert(r==0); - toku_checkpoint_safe_client_unlock(); + assert(r==0); + BRT brt = brtv; + { //Do NOT truncate the file if + //the file already survived the truncate and was checkpointed. 
+ LSN treelsn = toku_brt_checkpoint_lsn(brt); + if (oplsn.lsn != 0 && oplsn.lsn <= treelsn.lsn) { + r = 0; + goto done; + } } + toku_poll_txn_progress_function(txn, FALSE, TRUE); + yield(toku_checkpoint_safe_client_lock, yield_v); + toku_poll_txn_progress_function(txn, FALSE, FALSE); + r = toku_brt_truncate(brt); + assert(r==0); + toku_checkpoint_safe_client_unlock(); +done: return r; } @@ -411,7 +441,7 @@ toku_commit_load (BYTESTRING old_iname, char *fname_in_env = fixup_fname(&old_iname); //Delete old file r = toku_cachefile_of_iname_in_env(txn->logger->ct, fname_in_env, &cf); if (r==0) { - r = toku_cachefile_redirect_nullfd(cf); + r = toku_cachefile_redirect_nullfd(cf); assert(r==0); } else { @@ -438,7 +468,7 @@ toku_rollback_load (BYTESTRING UU(old_iname), char *fname_in_env = fixup_fname(&new_iname); //Delete new file r = toku_cachefile_of_iname_in_env(txn->logger->ct, fname_in_env, &cf); if (r==0) { - r = toku_cachefile_redirect_nullfd(cf); + r = toku_cachefile_redirect_nullfd(cf); assert(r==0); } else { @@ -455,38 +485,44 @@ toku_rollback_load (BYTESTRING UU(old_iname), int toku_commit_dictionary_redirect (FILENUM UU(old_filenum), - FILENUM UU(new_filenum), + FILENUM UU(new_filenum), TOKUTXN UU(txn), YIELDF UU(yield), void * UU(yield_v), LSN UU(oplsn)) //oplsn is the lsn of the commit { - //NO-OP + //Redirect only has meaning during normal operation (NOT during recovery). + if (!txn->recovered_from_checkpoint) { + //NO-OP + } return 0; } int toku_rollback_dictionary_redirect (FILENUM old_filenum, - FILENUM new_filenum, + FILENUM new_filenum, TOKUTXN txn, YIELDF UU(yield), void * UU(yield_v), LSN UU(oplsn)) //oplsn is the lsn of the abort { int r = 0; - CACHEFILE new_cf = NULL; - r = toku_cachefile_of_filenum(txn->logger->ct, new_filenum, &new_cf); - assert(r == 0); - struct brt_header *new_h = toku_cachefile_get_userdata(new_cf); + //Redirect only has meaning during normal operation (NOT during recovery). 
+ if (!txn->recovered_from_checkpoint) { + CACHEFILE new_cf = NULL; + r = toku_cachefile_of_filenum(txn->logger->ct, new_filenum, &new_cf); + assert(r == 0); + struct brt_header *new_h = toku_cachefile_get_userdata(new_cf); - CACHEFILE old_cf = NULL; - r = toku_cachefile_of_filenum(txn->logger->ct, old_filenum, &old_cf); - assert(r == 0); - struct brt_header *old_h = toku_cachefile_get_userdata(old_cf); + CACHEFILE old_cf = NULL; + r = toku_cachefile_of_filenum(txn->logger->ct, old_filenum, &old_cf); + assert(r == 0); + struct brt_header *old_h = toku_cachefile_get_userdata(old_cf); - //Redirect back from new to old. - r = toku_dictionary_redirect_abort(old_h, new_h, txn); - assert(r==0); + //Redirect back from new to old. + r = toku_dictionary_redirect_abort(old_h, new_h, txn); + assert(r==0); + } return r; } diff --git a/newbrt/rollback.c b/newbrt/rollback.c index 1e340548ee1..ac383664b26 100644 --- a/newbrt/rollback.c +++ b/newbrt/rollback.c @@ -37,18 +37,114 @@ int toku_abort_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield return r; } -void toku_rollback_txn_close (TOKUTXN txn) { - memarena_close(&txn->rollentry_arena); - if (txn->rollentry_filename!=0) { - int r = close(txn->rollentry_fd); - assert(r==0); - char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, txn->rollentry_filename); - r = unlink(fname_in_cwd); - assert(r==0); - toku_free(txn->rollentry_filename); - toku_free(fname_in_cwd); +static inline int +txn_has_inprogress_rollback_log(TOKUTXN txn) { + return txn->current_rollback.b != ROLLBACK_NONE.b; +} + +static inline int +txn_has_spilled_rollback_logs(TOKUTXN txn) { + return txn->spilled_rollback_tail.b != ROLLBACK_NONE.b; +} + +int +toku_delete_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log) { + int r; + CACHEFILE cf = txn->logger->rollback_cachefile; + struct brt_header *h = toku_cachefile_get_userdata(cf); + BLOCKNUM to_free = log->thislogname; + if (txn->pinned_inprogress_rollback_log == log) { + 
txn->pinned_inprogress_rollback_log = NULL; + } + r = toku_cachetable_unpin_and_remove (cf, log->thislogname); + assert(r==0); + toku_free_blocknum(h->blocktable, &to_free, h); + return r; +} + +static int +toku_apply_txn (TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn, + apply_rollback_item func) { + int r = 0; + // do the commit/abort calls and free everything + // we do the commit/abort calls in reverse order too. + struct roll_entry *item; + //printf("%s:%d abort\n", __FILE__, __LINE__); + int count=0; + + BLOCKNUM next_log = ROLLBACK_NONE; + uint32_t next_log_hash = 0; + + BOOL is_current = FALSE; + if (txn_has_inprogress_rollback_log(txn)) { + next_log = txn->current_rollback; + next_log_hash = txn->current_rollback_hash; + is_current = TRUE; + } + else if (txn_has_spilled_rollback_logs(txn)) { + next_log = txn->spilled_rollback_tail; + next_log_hash = txn->spilled_rollback_tail_hash; } + uint64_t last_sequence = txn->num_rollback_nodes; + BOOL found_head = FALSE; + while (next_log.b != ROLLBACK_NONE.b) { + ROLLBACK_LOG_NODE log; + //pin log + r = toku_get_and_pin_rollback_log(txn, txn->txnid64, last_sequence-1, next_log, next_log_hash, &log); + assert(r==0); + last_sequence = log->sequence; + if (func) { + while ((item=log->newest_logentry)) { + log->newest_logentry = item->prev; + r = func(txn, item, yield, yieldv, lsn); + if (r!=0) return r; + count++; + if (count%2 == 0) yield(NULL, yieldv); + } + } + if (next_log.b == txn->spilled_rollback_head.b) { + assert(!found_head); + found_head = TRUE; + assert(log->sequence == 0); + } + next_log = log->older; + next_log_hash = log->older_hash; + { + //Clean up transaction structure to prevent + //toku_txn_close from double-freeing + if (is_current) { + txn->current_rollback = ROLLBACK_NONE; + txn->current_rollback_hash = 0; + is_current = FALSE; + } + else { + txn->spilled_rollback_tail = next_log; + txn->spilled_rollback_tail_hash = next_log_hash; + } + if (found_head) { + assert(next_log.b == ROLLBACK_NONE.b); 
+ txn->spilled_rollback_head = next_log; + txn->spilled_rollback_head_hash = next_log_hash; + } + } + //Unpins log + r = toku_delete_rollback_log(txn, log); + assert(r==0); + } + return r; +} + +void toku_rollback_txn_close (TOKUTXN txn) { + { + //Clean up all rollback logs if they exist. + //Note: This will NOT cleanup any rollback logs as in 'rollinclude' + int r = toku_apply_txn(txn, NULL, NULL, ZERO_LSN, NULL); + assert(r==0); + } + assert(txn->spilled_rollback_head.b == ROLLBACK_NONE.b); + assert(txn->spilled_rollback_tail.b == ROLLBACK_NONE.b); + assert(txn->current_rollback.b == ROLLBACK_NONE.b); { //Remove txn from list (omt) of live transactions OMTVALUE txnagain; @@ -86,20 +182,16 @@ void toku_rollback_txn_close (TOKUTXN txn) { return; } -void* toku_malloc_in_rollback(TOKUTXN txn, size_t size) { - return malloc_in_memarena(txn->rollentry_arena, size); +void* toku_malloc_in_rollback(ROLLBACK_LOG_NODE log, size_t size) { + return malloc_in_memarena(log->rollentry_arena, size); } -void *toku_memdup_in_rollback(TOKUTXN txn, const void *v, size_t len) { - void *r=toku_malloc_in_rollback(txn, len); +void *toku_memdup_in_rollback(ROLLBACK_LOG_NODE log, const void *v, size_t len) { + void *r=toku_malloc_in_rollback(log, len); memcpy(r,v,len); return r; } -char *toku_strdup_in_rollback(TOKUTXN txn, const char *s) { - return toku_memdup_in_rollback(txn, s, strlen(s)+1); -} - static int note_brt_used_in_txns_parent(OMTVALUE brtv, u_int32_t UU(index), void*txnv) { TOKUTXN child = txnv; TOKUTXN parent = child->parent; @@ -107,215 +199,246 @@ static int note_brt_used_in_txns_parent(OMTVALUE brtv, u_int32_t UU(index), void int r = toku_txn_note_brt(parent, brt); if (r==0 && brt->h->txnid_that_created_or_locked_when_empty == toku_txn_get_txnid(child)) { - //Pass magic "no rolltmp needed" flag to parent. + //Pass magic "no rollback needed" flag to parent. 
brt->h->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(parent); } return r; } -//Commit each entry in the rollback (rolltmp) log. +//Commit each entry in the rollback log. //If the transaction has a parent, it just promotes its information to its parent. int toku_rollback_commit(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn) { int r=0; if (txn->parent!=0) { - // First we must put a rollinclude entry into the parent if we have a rollentry file. - if (txn->rollentry_filename) { - int len = strlen(txn->rollentry_filename); - // Don't have to strdup the rollentry_filename because - // we take ownership of it. - BYTESTRING fname = {len, toku_strdup_in_rollback(txn, txn->rollentry_filename)}; - r = toku_logger_save_rollback_rollinclude(txn->parent, fname); + // First we must put a rollinclude entry into the parent if we spilled + + if (txn_has_spilled_rollback_logs(txn)) { + uint64_t num_nodes = txn->num_rollback_nodes; + if (txn_has_inprogress_rollback_log(txn)) { + num_nodes--; //Don't count the in-progress rollback log. + } + r = toku_logger_save_rollback_rollinclude(txn->parent, txn->txnid64, num_nodes, + txn->spilled_rollback_head, txn->spilled_rollback_head_hash, + txn->spilled_rollback_tail, txn->spilled_rollback_tail_hash); if (r!=0) return r; - r = close(txn->rollentry_fd); - if (r!=0) { - //TODO: #2249.. this is a panic/crash situation - // If the rolltmp file is necessary for a checkpoint - // we CANNOT delete it! - // For now.. delete it, but figure out how to deal with this later. - // Maybe we should just assert that the close succeeds? - // We have to do the unlink ourselves, and then - // set txn->rollentry_filename=0 so that the cleanup - // won't try to close the fd again. - char *fname_in_cwd = toku_construct_full_name(2, txn->logger->directory, txn->rollentry_filename); - r = unlink(fname_in_cwd); - assert(r==0); //Can we assert this at this point? 
- unlink(txn->rollentry_filename); - toku_free(txn->rollentry_filename); - toku_free(fname_in_cwd); - txn->rollentry_filename = 0; - return r; + //Remove ownership from child. + txn->spilled_rollback_head = ROLLBACK_NONE; + txn->spilled_rollback_head_hash = 0; + txn->spilled_rollback_tail = ROLLBACK_NONE; + txn->spilled_rollback_tail_hash = 0; + } + if (txn_has_inprogress_rollback_log(txn)) { + ROLLBACK_LOG_NODE parent_log; + //Pin parent log + r = toku_get_and_pin_rollback_log_for_new_entry(txn->parent, &parent_log); + assert(r==0); + + ROLLBACK_LOG_NODE child_log; + //Pin child log + r = toku_get_and_pin_rollback_log(txn, txn->txnid64, txn->num_rollback_nodes - 1, + txn->current_rollback, txn->current_rollback_hash, + &child_log); + assert(r==0); + + // Append the list to the front of the parent. + if (child_log->oldest_logentry) { + // There are some entries, so link them in. + child_log->oldest_logentry->prev = parent_log->newest_logentry; + if (!parent_log->oldest_logentry) { + parent_log->oldest_logentry = child_log->oldest_logentry; + } + parent_log->newest_logentry = child_log->newest_logentry; + parent_log->rollentry_resident_bytecount += child_log->rollentry_resident_bytecount; + txn->parent->rollentry_raw_count += txn->rollentry_raw_count; + child_log->rollentry_resident_bytecount = 0; } - // Stop the cleanup from closing and unlinking the file. - toku_free(txn->rollentry_filename); - txn->rollentry_filename = 0; - } - // Append the list to the front of the parent. - if (txn->oldest_logentry) { - // There are some entries, so link them in. 
- txn->oldest_logentry->prev = txn->parent->newest_logentry; - if (txn->parent->newest_logentry) { - txn->parent->newest_logentry->next = txn->oldest_logentry; - } else { - txn->parent->oldest_logentry = txn->oldest_logentry; + if (parent_log->oldest_logentry==NULL) { + parent_log->oldest_logentry = child_log->oldest_logentry; } - txn->parent->newest_logentry = txn->newest_logentry; - txn->parent->rollentry_resident_bytecount += txn->rollentry_resident_bytecount; - txn->parent->rollentry_raw_count += txn->rollentry_raw_count; - txn->rollentry_resident_bytecount = 0; - } - if (txn->parent->oldest_logentry==0) { - txn->parent->oldest_logentry = txn->oldest_logentry; - } - txn->newest_logentry = txn->oldest_logentry = 0; - // Put all the memarena data into the parent. - if (memarena_total_size_in_use(txn->rollentry_arena) > 0) { - // If there are no bytes to move, then just leave things alone, and let the memory be reclaimed on txn is closed. - memarena_move_buffers(txn->parent->rollentry_arena, txn->rollentry_arena); + child_log->newest_logentry = child_log->oldest_logentry = 0; + // Put all the memarena data into the parent. + if (memarena_total_size_in_use(child_log->rollentry_arena) > 0) { + // If there are no bytes to move, then just leave things alone, and let the memory be reclaimed when the txn is closed. + memarena_move_buffers(parent_log->rollentry_arena, child_log->rollentry_arena); + } + //Delete child log (unpins child_log) + r = toku_delete_rollback_log(txn, child_log); + assert(r==0); + txn->current_rollback = ROLLBACK_NONE; + txn->current_rollback_hash = 0; + + r = toku_maybe_spill_rollbacks(txn->parent, parent_log); //unpins parent_log if it spills + assert(r==0); } // Note the open brts, the omts must be merged r = toku_omt_iterate(txn->open_brts, note_brt_used_in_txns_parent, txn); assert(r==0); - r = toku_maybe_spill_rollbacks(txn->parent); - assert(r==0); - //If this transaction needs an fsync (if it commits) //save that in the parent.
Since the commit really happens in the root txn. txn->parent->force_fsync_on_commit |= txn->force_fsync_on_commit; - txn->parent->has_done_work |= txn->has_done_work; txn->parent->num_rollentries += txn->num_rollentries; } else { - // do the commit calls and free everything - // we do the commit calls in reverse order too. - { - struct roll_entry *item; - //printf("%s:%d abort\n", __FILE__, __LINE__); - int count=0; - while ((item=txn->newest_logentry)) { - txn->newest_logentry = item->prev; - r = toku_commit_rollback_item(txn, item, yield, yieldv, lsn); - if (r!=0) return r; - count++; - if (count%2 == 0) yield(NULL, yieldv); - } - } - - // Read stuff out of the file and execute it. - if (txn->rollentry_filename) { - r = toku_commit_fileentries(txn->rollentry_fd, txn, yield, yieldv, lsn); - } + r = toku_apply_txn(txn, yield, yieldv, lsn, toku_commit_rollback_item); + assert(r==0); } + return r; } int toku_rollback_abort(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn) { - struct roll_entry *item; - int count=0; - int r=0; - while ((item=txn->newest_logentry)) { - txn->newest_logentry = item->prev; - r = toku_abort_rollback_item(txn, item, yield, yieldv, lsn); - if (r!=0) - return r; - count++; - if (count%2 == 0) yield(NULL, yieldv); - } - // Read stuff out of the file and roll it back. - if (txn->rollentry_filename) { - r = toku_rollback_fileentries(txn->rollentry_fd, txn, yield, yieldv, lsn); - assert(r==0); - } - return 0; + int r; + r = toku_apply_txn(txn, yield, yieldv, lsn, toku_abort_rollback_item); + assert(r==0); + return r; } // Write something out. Keep trying even if partial writes occur. // On error: Return negative with errno set. // On success return nbytes. -// NOTE : duplicated from logger.c - FIX THIS!!! 
-static int write_it (int fd, const void *bufv, int nbytes) { - toku_os_full_write(fd, bufv, nbytes); - return nbytes; +static size_t +rollback_memory_size(ROLLBACK_LOG_NODE log) { + size_t size = sizeof(*log); + size += memarena_total_memory_size(log->rollentry_arena); + return size; } -int toku_maybe_spill_rollbacks (TOKUTXN txn) { - // Previously: - // if (txn->rollentry_resident_bytecount>txn->logger->write_block_size) { - // But now we use t - if (memarena_total_memory_size(txn->rollentry_arena) > txn->logger->write_block_size) { - struct roll_entry *item; - ssize_t bufsize = txn->rollentry_resident_bytecount; - char *MALLOC_N(bufsize, buf); - if (bufsize==0) return errno; - struct wbuf w; - wbuf_init(&w, buf, bufsize); - while ((item=txn->oldest_logentry)) { - assert(item->prev==0); - u_int32_t rollback_fsize = toku_logger_rollback_fsize(item); - txn->rollentry_resident_bytecount -= rollback_fsize; - txn->oldest_logentry = item->next; - if (item->next) { item->next->prev=0; } - toku_logger_rollback_wbufwrite(&w, item); - } - assert(txn->rollentry_resident_bytecount==0); - assert((ssize_t)w.ndone==bufsize); - txn->oldest_logentry = txn->newest_logentry = 0; - if (txn->rollentry_fd<0) { - char filenamepart[sizeof("__tokudb_rolltmp.") + 16]; - snprintf(filenamepart, sizeof(filenamepart), "__tokudb_rolltmp.%.16"PRIx64, txn->txnid64); - txn->rollentry_filename = toku_xstrdup(filenamepart); - char *rollentry_filename_in_cwd = toku_construct_full_name(2, txn->logger->directory, filenamepart); - txn->rollentry_fd = open(rollentry_filename_in_cwd, O_CREAT+O_RDWR+O_EXCL+O_BINARY, 0600); - int r = errno; - toku_free(rollentry_filename_in_cwd); - if (txn->rollentry_fd == -1) return r; - } - uLongf compressed_len = compressBound(w.ndone); - char *MALLOC_N(compressed_len, compressed_buf); - { - int r = compress2((Bytef*)compressed_buf, &compressed_len, - (Bytef*)buf, w.ndone, - 1); - assert(r==Z_OK); - } - { - u_int32_t v = toku_htod32(compressed_len); - ssize_t r = 
write_it(txn->rollentry_fd, &v, sizeof(v)); assert(r==sizeof(v)); - } - { - ssize_t r = write_it(txn->rollentry_fd, compressed_buf, compressed_len); - if (r<0) return r; - assert(r==(ssize_t)compressed_len); - } - { - u_int32_t v = toku_htod32(w.ndone); - ssize_t r = write_it(txn->rollentry_fd, &v, sizeof(v)); assert(r==sizeof(v)); - } - { - u_int32_t v = toku_htod32(compressed_len); - ssize_t r = write_it(txn->rollentry_fd, &v, sizeof(v)); assert(r==sizeof(v)); - } - toku_free(compressed_buf); - txn->rollentry_filesize+=w.ndone; - toku_free(buf); +static void +toku_rollback_log_free(ROLLBACK_LOG_NODE *log_p) { + ROLLBACK_LOG_NODE log = *log_p; + *log_p = NULL; //Sanitize - // Cleanup the rollback memory - memarena_clear(txn->rollentry_arena); + // Cleanup the rollback memory + memarena_close(&log->rollentry_arena); + toku_free(log); +} + +static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, + void *rollback_v, void *extraargs, long UU(size), + BOOL write_me, BOOL keep_me, BOOL for_checkpoint) { + assert(extraargs); + int r; + TOKUTXN txn = extraargs; + ROLLBACK_LOG_NODE log = rollback_v; + CACHEFILE rollback_cachefile = txn->logger->rollback_cachefile; + struct brt_header *h = toku_cachefile_get_userdata(rollback_cachefile); + + assert(log->thislogname.b==logname.b); + assert(rollback_cachefile == cachefile); + if (write_me && !h->panic) { + int n_workitems, n_threads; + toku_cachefile_get_workqueue_load(cachefile, &n_workitems, &n_threads); + + r = toku_serialize_rollback_log_to(fd, log->thislogname, log, h, n_workitems, n_threads, for_checkpoint); + if (r) { + if (h->panic==0) { + char *e = strerror(r); + int l = 200 + strlen(e); + char s[l]; + h->panic=r; + snprintf(s, l-1, "While writing data to disk, error %d (%s)", r, e); + h->panic_string = toku_strdup(s); + } + } } + if (!keep_me) { + toku_rollback_log_free(&log); + } +} + +static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t 
fullhash, + void **rollback_pv, long *sizep, void *extraargs) { + assert(extraargs); + int r; + TOKUTXN txn = extraargs; + CACHEFILE rollback_cachefile = txn->logger->rollback_cachefile; + struct brt_header *h = toku_cachefile_get_userdata(rollback_cachefile); + assert(rollback_cachefile == cachefile); + + ROLLBACK_LOG_NODE *result = (ROLLBACK_LOG_NODE*)rollback_pv; + r = toku_deserialize_rollback_log_from(fd, logname, fullhash, result, txn, h); + if (r==0) { + *sizep = rollback_memory_size(*result); + } + return r; +} + +static int toku_create_new_rollback_log (TOKUTXN txn, BLOCKNUM older, uint32_t older_hash, ROLLBACK_LOG_NODE *result) { + TAGMALLOC(ROLLBACK_LOG_NODE, log); + assert(log); + + int r; + CACHEFILE cf = txn->logger->rollback_cachefile; + struct brt_header *h = toku_cachefile_get_userdata(cf); + + log->layout_version = BRT_LAYOUT_VERSION; + log->layout_version_original = BRT_LAYOUT_VERSION; + log->layout_version_read_from_disk = BRT_LAYOUT_VERSION; + log->dirty = TRUE; + log->txnid = txn->txnid64; + log->sequence = txn->num_rollback_nodes++; + toku_allocate_blocknum(h->blocktable, &log->thislogname, h); + log->thishash = toku_cachetable_hash(cf, log->thislogname); + log->older = older; + log->older_hash = older_hash; + log->oldest_logentry = NULL; + log->newest_logentry = NULL; + log->rollentry_arena = memarena_create(); + log->rollentry_resident_bytecount = 0; + + *result = log; + r=toku_cachetable_put(cf, log->thislogname, log->thishash, + log, rollback_memory_size(log), + toku_rollback_flush_callback, toku_rollback_fetch_callback, + txn); + assert(r==0); + txn->current_rollback = log->thislogname; + txn->current_rollback_hash = log->thishash; + txn->pinned_inprogress_rollback_log = log; return 0; } -int toku_read_rollback_backwards(BREAD br, struct roll_entry **item, MEMARENA ma) { - u_int32_t nbytes_n; ssize_t sr; - if ((sr=bread_backwards(br, &nbytes_n, 4))!=4) { assert(sr<0); return errno; } - u_int32_t n_bytes=toku_dtoh32(nbytes_n); - unsigned 
char *buf = malloc_in_memarena(ma, n_bytes); - if (buf==0) return errno; - if ((sr=bread_backwards(br, buf, n_bytes-4))!=(ssize_t)n_bytes-4) { assert(sr<0); return errno; } - int r = toku_parse_rollback(buf, n_bytes, item, ma); - if (r!=0) return r; - return 0; +int +toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log) { + int r; + CACHEFILE cf = txn->logger->rollback_cachefile; + if (txn->pinned_inprogress_rollback_log == log) { + txn->pinned_inprogress_rollback_log = NULL; + } + r = toku_cachetable_unpin(cf, log->thislogname, log->thishash, + (enum cachetable_dirty)log->dirty, rollback_memory_size(log)); + assert(r==0); + return r; +} + +//Requires: log is pinned +// log is current +//After: +// log is unpinned if a spill happened +// Maybe there is no current after (if it spilled) +int toku_maybe_spill_rollbacks (TOKUTXN txn, ROLLBACK_LOG_NODE log) { + int r = 0; + if (log->rollentry_resident_bytecount > txn->logger->write_block_size) { + assert(log->thislogname.b == txn->current_rollback.b); + //spill + if (!txn_has_spilled_rollback_logs(txn)) { + //First spilled. Copy to head. + txn->spilled_rollback_head = txn->current_rollback; + txn->spilled_rollback_head_hash = txn->current_rollback_hash; + } + //Unconditionally copy to tail. Old tail does not need to be cached anymore. 
+ txn->spilled_rollback_tail = txn->current_rollback; + txn->spilled_rollback_tail_hash = txn->current_rollback_hash; + + txn->current_rollback = ROLLBACK_NONE; + txn->current_rollback_hash = 0; + //Unpin + r = toku_rollback_log_unpin(txn, log); + assert(r==0); + } + return r; } //Heaviside function to find a TOKUTXN by TOKUTXN (used to find the index) @@ -452,7 +575,7 @@ static void note_txn_closing (TOKUTXN txn) { } // Return the number of bytes that went into the rollback data structure (the uncompressed count if there is compression) -int toku_logger_txn_rolltmp_raw_count(TOKUTXN txn, u_int64_t *raw_count) +int toku_logger_txn_rollback_raw_count(TOKUTXN txn, u_int64_t *raw_count) { *raw_count = txn->rollentry_raw_count; return 0; @@ -466,3 +589,60 @@ int toku_txn_find_by_xid (BRT brt, TXNID xid, TOKUTXN *txnptr) { if (r == 0) *txnptr = txnv; return r; } + +int toku_get_and_pin_rollback_log(TOKUTXN txn, TXNID xid, uint64_t sequence, BLOCKNUM name, uint32_t hash, ROLLBACK_LOG_NODE *result) { + BOOL save_inprogress_node = FALSE; + assert(name.b != ROLLBACK_NONE.b); + int r = 0; + ROLLBACK_LOG_NODE log = NULL; + + if (name.b == txn->current_rollback.b) { + assert(hash == txn->current_rollback_hash); + log = txn->pinned_inprogress_rollback_log; + save_inprogress_node = TRUE; + } + if (!log) { + CACHEFILE cf = txn->logger->rollback_cachefile; + void * log_v; + r = toku_cachetable_get_and_pin(cf, name, hash, + &log_v, NULL, + toku_rollback_flush_callback, toku_rollback_fetch_callback, + txn); + assert(r==0); + log = (ROLLBACK_LOG_NODE)log_v; + } + if (r==0) { + assert(log->thislogname.b == name.b); + assert(log->txnid == xid); + assert(log->sequence == sequence); + if (save_inprogress_node) { + txn->pinned_inprogress_rollback_log = log; + } + *result = log; + } + return r; +} + +int toku_get_and_pin_rollback_log_for_new_entry (TOKUTXN txn, ROLLBACK_LOG_NODE *result) { + int r; + ROLLBACK_LOG_NODE log; + if (txn_has_inprogress_rollback_log(txn)) { + r = 
toku_get_and_pin_rollback_log(txn, txn->txnid64, txn->num_rollback_nodes-1, + txn->current_rollback, txn->current_rollback_hash, &log); + assert(r==0); + } + else { + //Generate new one. + //tail will be ROLLBACK_NONE if this is the very first + r = toku_create_new_rollback_log(txn, txn->spilled_rollback_tail, txn->spilled_rollback_tail_hash, &log); + assert(r==0); + } + if (r==0) { + assert(log->txnid == txn->txnid64); + assert(log->thislogname.b != ROLLBACK_NONE.b); + *result = log; + } + return r; +} + + diff --git a/newbrt/rollback.h b/newbrt/rollback.h index 459e9b5c893..d69ad1aab9e 100644 --- a/newbrt/rollback.h +++ b/newbrt/rollback.h @@ -13,19 +13,24 @@ void toku_poll_txn_progress_function(TOKUTXN txn, uint8_t is_commit, uint8_t sta int toku_rollback_commit(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn); int toku_rollback_abort(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn); void toku_rollback_txn_close (TOKUTXN txn); +int toku_get_and_pin_rollback_log_for_new_entry (TOKUTXN txn, ROLLBACK_LOG_NODE *result); +int toku_get_and_pin_rollback_log(TOKUTXN txn, TXNID xid, uint64_t sequence, BLOCKNUM name, uint32_t hash, ROLLBACK_LOG_NODE *result); +int toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log); +int toku_delete_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log); + +typedef int(*apply_rollback_item)(TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn); int toku_commit_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn); int toku_abort_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn); -void *toku_malloc_in_rollback(TOKUTXN txn, size_t size); -void *toku_memdup_in_rollback(TOKUTXN txn, const void *v, size_t len); -char *toku_strdup_in_rollback(TOKUTXN txn, const char *s); -int toku_maybe_spill_rollbacks (TOKUTXN txn); +void *toku_malloc_in_rollback(ROLLBACK_LOG_NODE log, size_t size); +void *toku_memdup_in_rollback(ROLLBACK_LOG_NODE log, const void 
*v, size_t len); +int toku_maybe_spill_rollbacks (TOKUTXN txn, ROLLBACK_LOG_NODE log); int toku_txn_note_brt (TOKUTXN txn, BRT brt); int toku_txn_note_swap_brt (BRT live, BRT zombie); int toku_txn_note_close_brt (BRT brt); -int toku_logger_txn_rolltmp_raw_count(TOKUTXN txn, u_int64_t *raw_count); +int toku_logger_txn_rollback_raw_count(TOKUTXN txn, u_int64_t *raw_count); int toku_txn_find_by_xid (BRT brt, TXNID xid, TOKUTXN *txnptr); @@ -36,4 +41,23 @@ int toku_commit_fileentries (int fd, TOKUTXN txn, YIELDF yield,void *yieldv, LSN //Heaviside function to find a TOKUTXN by TOKUTXN (used to find the index) int find_xid (OMTVALUE v, void *txnv); +struct rollback_log_node { + enum typ_tag tag; + int layout_version; + int layout_version_original; + int layout_version_read_from_disk; + int dirty; + TXNID txnid; // Which transaction made this? + uint64_t sequence; // Which rollback log in the sequence is this? + BLOCKNUM thislogname; // Which block number is this chunk of the log? + uint32_t thishash; + BLOCKNUM older; // Which block number is the next oldest chunk of the log? + uint32_t older_hash; + struct roll_entry *oldest_logentry; + struct roll_entry *newest_logentry; + MEMARENA rollentry_arena; + size_t rollentry_resident_bytecount; // How many bytes for the rollentries that are stored in main memory. +}; + #endif + diff --git a/newbrt/tests/Makefile b/newbrt/tests/Makefile index 4d5055ed8ee..497c7cfc3f3 100644 --- a/newbrt/tests/Makefile +++ b/newbrt/tests/Makefile @@ -58,15 +58,6 @@ check_ok: test 0 = 0 $(SUMMARIZE_CMD) -ifeq ($(TOKU_SKIP_1305),1) -check_test1305: - @echo SKIPPED SLOW TEST $@ -else -# Don't run 1305 under valgrind. It takes way too long. 
-check_test1305$(BINSUF): test1305$(BINSUF) - ./$< $(VERBVERBOSE) $(SUMMARIZE_CMD) -endif - ifeq ($(TOKU_SKIP_MINICRON),1) check_minicron-test$(BINSUF): @echo "SKIPPED TEST $@ (Fails in win64 vm due to timing issues)" diff --git a/newbrt/tests/bread-test.c b/newbrt/tests/bread-test.c deleted file mode 100644 index dac821ac060..00000000000 --- a/newbrt/tests/bread-test.c +++ /dev/null @@ -1,80 +0,0 @@ -/* Test bread by writing random data and then reading it using bread_backwards() to see if it gives the right answer. - * See test_1305 for another bread test (testing to see if it can read 1GB files) */ - -#include "test.h" -#include - - -#include -#include -#include -#include -#include -#include - -#include "../brttypes.h" -#include "../bread.h" - -#define FNAME "bread-test.data" - -#define RECORDS 20 -#define RECORDLEN 100 - -char buf[RECORDS][RECORDLEN]; -int sizes[RECORDS]; -int sizesn[RECORDS]; -int nwrote=0; -char wrotedata[RECORDS*RECORDLEN]; - -static void -test (int seed) { - srandom(seed); - unlink(FNAME); - int i; - { - int fd = open(FNAME, O_CREAT+O_RDWR+O_BINARY, 0777); - assert(fd>=0); - for (i=0; i=0); - // Now read it all backward - BREAD br = create_bread_from_fd_initialize_at(fd); - while (bread_has_more(br)) { - assert(nwrote>0); - int to_read = 1+(random()%RECORDLEN); // read from 1 to 100 (if RECORDLEN is 100) - if (to_read>nwrote) to_read=nwrote; - char rbuf[to_read]; - int r = bread_backwards(br, rbuf, to_read); - assert(r==to_read); - assert(memcmp(rbuf, &wrotedata[nwrote-to_read], to_read)==0); - nwrote-=to_read; - } - assert(nwrote==0); - - { int r=close_bread_without_closing_fd(br); assert(r==0); } - { int r=close(fd); assert(r==0); } - unlink(FNAME); -} - -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { - int i; - for (i=0; i<10; i++) test(i); - return 0; -} diff --git a/newbrt/tests/cachetable-checkpoint-pending.c b/newbrt/tests/cachetable-checkpoint-pending.c index 
85cdd53602e..cbdc53a4e69 100644 --- a/newbrt/tests/cachetable-checkpoint-pending.c +++ b/newbrt/tests/cachetable-checkpoint-pending.c @@ -102,7 +102,7 @@ static void checkpoint_pending(void) { char fname1[] = __FILE__ "test1.dat"; r = unlink(fname1); if (r!=0) CKERR2(errno, ENOENT); r = toku_cachetable_openf(&cf, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); - toku_cachefile_set_userdata(cf, NULL, NULL, NULL, NULL, NULL, NULL, + toku_cachefile_set_userdata(cf, NULL, NULL, NULL, NULL, NULL, NULL, NULL, dummy_pin_unpin, dummy_pin_unpin); // Insert items into the cachetable. All dirty. diff --git a/newbrt/tests/cachetable-checkpoint-test.c b/newbrt/tests/cachetable-checkpoint-test.c index 9a36af8101f..b68a629046d 100644 --- a/newbrt/tests/cachetable-checkpoint-test.c +++ b/newbrt/tests/cachetable-checkpoint-test.c @@ -60,7 +60,7 @@ static void cachetable_checkpoint_test(int n, enum cachetable_dirty dirty) { unlink(fname1); CACHEFILE f1; r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); - toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL, + toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, dummy_pin_unpin, dummy_pin_unpin); // insert items into the cachetable. 
all should be dirty diff --git a/newbrt/tests/cachetable-prefetch-checkpoint-test.c b/newbrt/tests/cachetable-prefetch-checkpoint-test.c index eaa342c50fb..9922f47ad6b 100644 --- a/newbrt/tests/cachetable-prefetch-checkpoint-test.c +++ b/newbrt/tests/cachetable-prefetch-checkpoint-test.c @@ -46,7 +46,7 @@ static void cachetable_prefetch_checkpoint_test(int n, enum cachetable_dirty dir unlink(fname1); CACHEFILE f1; r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); - toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL, + toku_cachefile_set_userdata(f1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, dummy_pin_unpin, dummy_pin_unpin); // prefetch block n+1. this will take 10 seconds. diff --git a/newbrt/tests/recovery-bad-last-entry.c b/newbrt/tests/recovery-bad-last-entry.c index afd6d448f06..839f5098b2a 100644 --- a/newbrt/tests/recovery-bad-last-entry.c +++ b/newbrt/tests/recovery-bad-last-entry.c @@ -5,7 +5,7 @@ #define TESTDIR "dir." 
__FILE__ -static const int magic_begin_end_checkpoint_sz = 77; // leave this many bytes in file +static const int magic_begin_end_checkpoint_sz = 85; // leave this many bytes in file static int run_test(void) { @@ -28,12 +28,12 @@ run_test(void) { LSN beginlsn; // all logs must contain a valid checkpoint r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0); r = toku_log_comment(logger, NULL, TRUE, 0, world); assert(r == 0); r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0); r = toku_log_comment(logger, NULL, TRUE, 0, there); assert(r == 0); r = toku_logger_close(&logger); assert(r == 0); diff --git a/newbrt/tests/recovery-cbegin-cend-hello.c b/newbrt/tests/recovery-cbegin-cend-hello.c index 4af8a19bb31..ab0e35293ed 100644 --- a/newbrt/tests/recovery-cbegin-cend-hello.c +++ b/newbrt/tests/recovery-cbegin-cend-hello.c @@ -21,7 +21,7 @@ run_test(void) { // add begin checkpoint, end checkpoint LSN beginlsn; r = toku_log_begin_checkpoint(logger, &beginlsn, FALSE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_logger_close(&logger); assert(r == 0); // add hello diff --git a/newbrt/tests/recovery-cbegin-cend.c b/newbrt/tests/recovery-cbegin-cend.c index bb7f79e2508..792201148f1 100644 --- a/newbrt/tests/recovery-cbegin-cend.c +++ b/newbrt/tests/recovery-cbegin-cend.c @@ -21,7 +21,7 @@ run_test(void) 
{ r = toku_logger_create(&logger); assert(r == 0); r = toku_logger_open(TESTDIR, logger); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_logger_close(&logger); assert(r == 0); // run recovery diff --git a/newbrt/tests/recovery-cbegin.c b/newbrt/tests/recovery-cbegin.c index 5608c82bf65..ea2618b46f0 100644 --- a/newbrt/tests/recovery-cbegin.c +++ b/newbrt/tests/recovery-cbegin.c @@ -17,7 +17,7 @@ run_test(void) { r = toku_logger_open(TESTDIR, logger); assert(r == 0); LSN beginlsn; r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); r = toku_logger_close(&logger); assert(r == 0); diff --git a/newbrt/tests/recovery-cend-cbegin.c b/newbrt/tests/recovery-cend-cbegin.c index 074c0c2ad03..25d33bc83da 100644 --- a/newbrt/tests/recovery-cend-cbegin.c +++ b/newbrt/tests/recovery-cend-cbegin.c @@ -20,7 +20,7 @@ run_test(void) { LSN firstbegin = ZERO_LSN; r = toku_log_begin_checkpoint(logger, &firstbegin, TRUE, 0); assert(r == 0); assert(firstbegin.lsn != ZERO_LSN.lsn); - r = toku_log_end_checkpoint(logger, NULL, FALSE, firstbegin.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, FALSE, firstbegin.lsn, 0, 0, 0); assert(r == 0); r = toku_log_begin_checkpoint(logger, NULL, TRUE, 0); assert(r == 0); r = toku_logger_close(&logger); assert(r == 0); diff --git a/newbrt/tests/recovery-fopen-missing-file.c b/newbrt/tests/recovery-fopen-missing-file.c index f088f41f2a5..f16501031c6 100644 --- a/newbrt/tests/recovery-fopen-missing-file.c +++ b/newbrt/tests/recovery-fopen-missing-file.c @@ -19,7 +19,7 @@ run_test(void) { r = toku_logger_open(TESTDIR, 
logger); assert(r == 0); LSN beginlsn; r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); BYTESTRING iname = { strlen("missing_tokudb_file"), "missing_tokudb_file" }; FILENUM filenum = {42}; diff --git a/newbrt/tests/recovery-hello.c b/newbrt/tests/recovery-hello.c index 83a18562ecd..b5147822e1b 100644 --- a/newbrt/tests/recovery-hello.c +++ b/newbrt/tests/recovery-hello.c @@ -21,7 +21,7 @@ run_test(void) { r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0); LSN beginlsn; r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_log_comment(logger, NULL, TRUE, 0, hello); assert(r == 0); BYTESTRING there = { strlen("there"), "there" }; r = toku_log_comment(logger, NULL, TRUE, 0, there); assert(r == 0); diff --git a/newbrt/tests/recovery-lsn-error-during-forward-scan.c b/newbrt/tests/recovery-lsn-error-during-forward-scan.c index 9a79a929cc9..bc07d9deadf 100644 --- a/newbrt/tests/recovery-lsn-error-during-forward-scan.c +++ b/newbrt/tests/recovery-lsn-error-during-forward-scan.c @@ -30,7 +30,7 @@ run_test(void) { LSN beginlsn; r = toku_log_begin_checkpoint(logger, &beginlsn, TRUE, 0); assert(r == 0); - r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0); assert(r == 0); + r = toku_log_end_checkpoint(logger, NULL, TRUE, beginlsn.lsn, 0, 0, 0); assert(r == 0); r = toku_logger_close(&logger); assert(r == 0); diff --git a/newbrt/tests/test1305.c b/newbrt/tests/test1305.c deleted file mode 100644 index 1b860f53763..00000000000 --- a/newbrt/tests/test1305.c +++ /dev/null @@ -1,106 +0,0 @@ -/* Test bread_backwards to make sure it can read backwards even 
for large files. */ - -#include - -#include -#include -#include -#include -#include -#include - -#include "../brttypes.h" -#include "../bread.h" -#include "test.h" - -#define FNAME "test1305.data" - -// THe buffer size in units of 64-bit integers. -#define N_BIGINTS (1<<20) -#define BIGINT_SIZE (sizeof(u_int64_t)) -// How big is the readback buffer (in 8-bit integers)? -#define READBACK_BUFSIZE (1<<20) - - -static void -test (u_int64_t fsize) { - unlink(FNAME); - // Create a file of size fsize. Fill it with 8-byte values which are integers, in order) - assert(fsize%(N_BIGINTS*sizeof(u_int64_t)) == 0); // Make sure the fsize is a multiple of the buffer size. - u_int64_t i = 0; - { - int fd = open(FNAME, O_CREAT+O_RDWR+O_BINARY, 0777); - assert(fd>=0); - static u_int64_t buf[N_BIGINTS]; //windows cannot handle this on the stack - static char compressed_buf[N_BIGINTS*2 + 1000]; // this is more than compressbound returns - uLongf compressed_len; - while (i*BIGINT_SIZE < fsize) { - if (verbose>0 && i % (1<<25) == 0) { - printf(" %s:test (%"PRIu64") forwards [%"PRIu64"%%]\n", __FILE__, fsize, 100*BIGINT_SIZE*((u_int64_t)i) / fsize); - fflush(stdout); - } - - int j; - for (j=0; j=0); - BREAD br = create_bread_from_fd_initialize_at(fd); - while (bread_has_more(br)) { - if (verbose>0 && (fsize/BIGINT_SIZE - i) % (1<<25) == 0) { - printf(" %s:test (%"PRIu64") backwards [%"PRIu64"%%]\n", __FILE__, fsize, 100*BIGINT_SIZE*((u_int64_t)i) / fsize); - fflush(stdout); - } - assert(i>0); - i--; - u_int64_t storedi; - { int r = bread_backwards(br, &storedi, sizeof(storedi)); assert(r==sizeof(storedi)); } - assert(storedi==i); - } - assert(i==0); - { int r=close_bread_without_closing_fd(br); assert(r==0); } - { int r=close(fd); assert(r==0); } - } - //printf("Did %" PRIu64 "\n", fsize); - //system("ls -l " FNAME); - unlink(FNAME); -} - -int -test_main (int argc, const char *argv[]) { - default_parse_args(argc, argv); - test(1LL<<23); - test(1LL<<30); - test(1LL<<31); - test(1LL<<32); 
- test(1LL<<33); - return 0; -} - diff --git a/newbrt/tests/test_logcursor.c b/newbrt/tests/test_logcursor.c index 2db3badcb07..6364b8dcac8 100644 --- a/newbrt/tests/test_logcursor.c +++ b/newbrt/tests/test_logcursor.c @@ -153,6 +153,9 @@ int create_logfiles() { LSN lsn = {0}; TXNID txnid = 0; TXNID cp_txnid = 0; + + u_int32_t num_fassociate = 0; + u_int32_t num_xstillopen = 0; bs_aname.len = 4; bs_aname.data="a.db"; bs_bname.len = 4; bs_bname.data="b.db"; @@ -171,31 +174,40 @@ int create_logfiles() { //fcreate 'F': lsn=2 txnid=1 filenum=0 fname={len=4 data="a.db"} mode=0777 treeflags=0 crc=18a3d525 len=49 r = toku_log_fcreate(logger, &lsn, NO_FSYNC, txnid, fn_aname, bs_aname, 0x0777, 0, 0, bs_empty); assert(r==0); //commit 'C': lsn=3 txnid=1 crc=00001f1e len=29 - r = toku_log_commit(logger, &lsn, FSYNC, txnid); assert(r==0); + r = toku_log_xcommit(logger, &lsn, FSYNC, txnid); assert(r==0); //xbegin 'b': lsn=4 parenttxnid=0 crc=00000a1f len=29 r = toku_log_xbegin(logger, &lsn, NO_FSYNC, 0); assert(r==0); txnid = lsn.lsn; //fcreate 'F': lsn=5 txnid=4 filenum=1 fname={len=4 data="b.db"} mode=0777 treeflags=0 crc=14a47925 len=49 r = toku_log_fcreate(logger, &lsn, NO_FSYNC, txnid, fn_bname, bs_bname, 0x0777, 0, 0, bs_empty); assert(r==0); //commit 'C': lsn=6 txnid=4 crc=0000c11e len=29 - r = toku_log_commit(logger, &lsn, FSYNC, txnid); assert(r==0); + r = toku_log_xcommit(logger, &lsn, FSYNC, txnid); assert(r==0); //xbegin 'b': lsn=7 parenttxnid=0 crc=0000f91f len=29 r = toku_log_xbegin(logger, &lsn, NO_FSYNC, 0); assert(r==0); txnid = lsn.lsn; //enq_insert 'I': lsn=8 filenum=0 xid=7 key={len=2 data="a\000"} value={len=2 data="b\000"} crc=40b863e4 len=45 r = toku_log_enq_insert(logger, &lsn, NO_FSYNC, fn_aname, txnid, bs_a, bs_b); assert(r==0); //begin_checkpoint 'x': lsn=9 timestamp=1251309957584197 crc=cd067878 len=29 r = toku_log_begin_checkpoint(logger, &lsn, NO_FSYNC, 1251309957584197); assert(r==0); cp_txnid = lsn.lsn; - //xstillopen 's': lsn=10 txnid=7 parent=0 
crc=00061816 len=37 - r = toku_log_xstillopen(logger, &lsn, NO_FSYNC, txnid, 0); assert(r==0); //fassociate 'f': lsn=11 filenum=1 fname={len=4 data="b.db"} crc=a7126035 len=33 r = toku_log_fassociate(logger, &lsn, NO_FSYNC, fn_bname, 0, bs_bname); assert(r==0); + num_fassociate++; //fassociate 'f': lsn=12 filenum=0 fname={len=4 data="a.db"} crc=a70c5f35 len=33 r = toku_log_fassociate(logger, &lsn, NO_FSYNC, fn_aname, 0, bs_aname); assert(r==0); + num_fassociate++; + //xstillopen 's': lsn=10 txnid=7 parent=0 crc=00061816 len=37 <- obsolete + { + FILENUMS filenums = {0, NULL}; + r = toku_log_xstillopen(logger, &lsn, NO_FSYNC, txnid, 0, + 0, filenums, 0, 0, 0, + ROLLBACK_NONE, ROLLBACK_NONE, ROLLBACK_NONE); + assert(r==0); + } + num_xstillopen++; //end_checkpoint 'X': lsn=13 txnid=9 timestamp=1251309957586872 crc=cd285c30 len=37 - r = toku_log_end_checkpoint(logger, &lsn, FSYNC, cp_txnid, 1251309957586872); assert(r==0); + r = toku_log_end_checkpoint(logger, &lsn, FSYNC, cp_txnid, 1251309957586872, num_fassociate, num_xstillopen); assert(r==0); //enq_insert 'I': lsn=14 filenum=1 xid=7 key={len=2 data="b\000"} value={len=2 data="a\000"} crc=40388be4 len=45 r = toku_log_enq_insert(logger, &lsn, NO_FSYNC, fn_bname, txnid, bs_b, bs_a); assert(r==0); //commit 'C': lsn=15 txnid=7 crc=00016d1e len=29 - r = toku_log_commit(logger, &lsn, FSYNC, txnid); assert(r==0); + r = toku_log_xcommit(logger, &lsn, FSYNC, txnid); assert(r==0); // close logger r = toku_logger_close(&logger); assert(r==0); diff --git a/newbrt/txn.c b/newbrt/txn.c index 42378986285..9790f1fac1a 100644 --- a/newbrt/txn.c +++ b/newbrt/txn.c @@ -12,18 +12,20 @@ int toku_txn_begin_txn (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER log int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER logger, TXNID xid) { if (logger->is_panicked) return EINVAL; + assert(logger->rollback_cachefile); TAGMALLOC(TOKUTXN, result); if (result==0) return errno; int r; + LSN first_lsn; if (xid == 0) 
{ - r = toku_log_xbegin(logger, &result->first_lsn, 0, parent_tokutxn ? parent_tokutxn->txnid64 : 0); + r = toku_log_xbegin(logger, &first_lsn, 0, parent_tokutxn ? parent_tokutxn->txnid64 : 0); if (r!=0) goto died; } else - result->first_lsn.lsn = xid; + first_lsn.lsn = xid; r = toku_omt_create(&result->open_brts); if (r!=0) goto died; - result->txnid64 = result->first_lsn.lsn; + result->txnid64 = first_lsn.lsn; XIDS parent_xids; if (parent_tokutxn==NULL) parent_xids = xids_get_root_xids(); @@ -33,13 +35,19 @@ int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGE goto died; result->logger = logger; result->parent = parent_tokutxn; - result->oldest_logentry = result->newest_logentry = 0; - result->rollentry_arena = memarena_create(); result->num_rollentries = 0; result->num_rollentries_processed = 0; result->progress_poll_fun = NULL; result->progress_poll_fun_extra = NULL; + result->spilled_rollback_head = ROLLBACK_NONE; + result->spilled_rollback_tail = ROLLBACK_NONE; + result->spilled_rollback_head_hash = 0; + result->spilled_rollback_tail_hash = 0; + result->current_rollback = ROLLBACK_NONE; + result->current_rollback_hash = 0; + result->num_rollback_nodes = 0; + result->pinned_inprogress_rollback_log = NULL; if (toku_omt_size(logger->live_txns) == 0) { assert(logger->oldest_living_xid == TXNID_NONE_LIVING); @@ -59,13 +67,9 @@ int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGE assert(idx > 0); } - result->rollentry_resident_bytecount=0; result->rollentry_raw_count = 0; - result->rollentry_filename = 0; - result->rollentry_fd = -1; - result->rollentry_filesize = 0; result->force_fsync_on_commit = FALSE; - result->has_done_work = FALSE; + result->recovered_from_checkpoint = FALSE; *tokutxn = result; return 0; @@ -75,6 +79,36 @@ died: return r; } +//Used on recovery to recover a transaction. 
+int +toku_txn_load_txninfo (TOKUTXN txn, TXNINFO info) { /* Populate txn from info: copy the counters and rollback fields, note each BRT in info->open_brts on txn, recompute the rollback hashes, and set recovered_from_checkpoint. Always returns 0. */ +#define COPY_FROM_INFO(field) txn->field = info->field + COPY_FROM_INFO(rollentry_raw_count); + uint32_t i; + for (i = 0; i < info->num_brts; i++) { /* note each listed BRT on txn; a failure from toku_txn_note_brt is treated as fatal by the assert */ + BRT brt = info->open_brts[i]; + int r = toku_txn_note_brt(txn, brt); + assert(r==0); + } + COPY_FROM_INFO(force_fsync_on_commit ); + COPY_FROM_INFO(num_rollback_nodes); + COPY_FROM_INFO(num_rollentries); + + CACHEFILE rollback_cachefile = txn->logger->rollback_cachefile; /* the *_hash fields are not copied from info; each is recomputed below with toku_cachetable_hash against this cachefile */ + + COPY_FROM_INFO(spilled_rollback_head); + txn->spilled_rollback_head_hash = toku_cachetable_hash(rollback_cachefile, + txn->spilled_rollback_head); + COPY_FROM_INFO(spilled_rollback_tail); + txn->spilled_rollback_tail_hash = toku_cachetable_hash(rollback_cachefile, + txn->spilled_rollback_tail); + COPY_FROM_INFO(current_rollback); + txn->current_rollback_hash = toku_cachetable_hash(rollback_cachefile, + txn->current_rollback); +#undef COPY_FROM_INFO + txn->recovered_from_checkpoint = TRUE; + return 0; +} // Doesn't close the txn, just performs the commit operations. int toku_txn_commit_txn(TOKUTXN txn, int nosync, YIELDF yield, void *yieldv, @@ -92,13 +126,13 @@ int toku_txn_commit_with_lsn(TOKUTXN txn, int nosync, YIELDF yield, void *yieldv // panic handled in log_commit //Child transactions do not actually 'commit'. They promote their changes to parent, so no need to fsync if this txn has a parent. - int do_fsync = !txn->parent && (txn->force_fsync_on_commit || (!nosync && txn->has_done_work)); + int do_fsync = !txn->parent && (txn->force_fsync_on_commit || (!nosync && txn->num_rollentries>0)); txn->progress_poll_fun = poll; txn->progress_poll_fun_extra = poll_extra; if (release_locks) release_locks(locks_thunk); - r = toku_log_commit(txn->logger, (LSN*)0, do_fsync, txn->txnid64); // exits holding neither of the tokulogger locks. + r = toku_log_xcommit(txn->logger, (LSN*)0, do_fsync, txn->txnid64); // exits holding neither of the tokulogger locks. 
if (reacquire_locks) reacquire_locks(locks_thunk); if (r!=0) return r; diff --git a/newbrt/txn.h b/newbrt/txn.h index 910aa9d8320..d6007d913a6 100644 --- a/newbrt/txn.h +++ b/newbrt/txn.h @@ -7,6 +7,7 @@ int toku_txn_begin_txn (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER logger); int toku_txn_begin_with_xid (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER logger, TXNID xid); +int toku_txn_load_txninfo (TOKUTXN txn, TXNINFO info); int toku_txn_commit_txn (TOKUTXN txn, int nosync, YIELDF yield, void *yieldv, TXN_PROGRESS_POLL_FUNCTION poll, void *poll_extra, diff --git a/newbrt/wbuf.h b/newbrt/wbuf.h index c7bce265750..0260dd9184a 100644 --- a/newbrt/wbuf.h +++ b/newbrt/wbuf.h @@ -170,7 +170,6 @@ static inline void wbuf_DISKOFF (struct wbuf *w, DISKOFF off) { static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) { wbuf_ulonglong(w, b.b); } - static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) { wbuf_nocrc_ulonglong(w, b.b); } diff --git a/release/examples/db-insert.c b/release/examples/db-insert.c index e356a06361e..8693e957095 100644 --- a/release/examples/db-insert.c +++ b/release/examples/db-insert.c @@ -60,7 +60,7 @@ int singlex_child = 0; // Do a single transaction, but do all work with a child int singlex = 0; // Do a single transaction int singlex_create = 0; // Create the db using the single transaction (only valid if singlex) int insert1first = 0; // insert 1 before doing the rest -int check_small_rolltmp = 0; // verify that the rollback logs are small (only valid if singlex) +int check_small_rollback = 0; // verify that the rollback logs are small (only valid if singlex) int do_transactions = 0; int if_transactions_do_logging = DB_INIT_LOG; // set this to zero if we want no logging when transactions are used int do_abort = 0; @@ -210,14 +210,14 @@ static void benchmark_shutdown (void) { #endif if (do_transactions && singlex && !insert1first && (singlex_create || prelock)) { #if defined(TOKUDB) - //There should be a 
single 'truncate' in the rolltmp instead of many 'insert' entries. + //There should be a single 'truncate' in the rollback instead of many 'insert' entries. struct txn_stat *s; r = tid->txn_stat(tid, &s); assert(r==0); //TODO: #1125 Always do the test after performance testing is done. - if (singlex_child) fprintf(stderr, "SKIPPED 'small rolltmp' test for child txn\n"); + if (singlex_child) fprintf(stderr, "SKIPPED 'small rollback' test for child txn\n"); else - assert(s->rolltmp_raw_count < 100); // gross test, not worth investigating details + assert(s->rollback_raw_count < 100); // gross test, not worth investigating details free(s); //system("ls -l bench.tokudb"); #endif @@ -374,7 +374,7 @@ static int print_usage (const char *argv0) { fprintf(stderr, " --singlex-child (implies -x) Run the whole job as a single transaction, do all work a child of that transaction.\n"); fprintf(stderr, " --finish-child-first Commit/abort child before doing so to parent (no effect if no child).\n"); fprintf(stderr, " --singlex-create (implies --singlex) Create the file using the single transaction (Default is to use a different transaction to create.)\n"); - fprintf(stderr, " --check_small_rolltmp (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n"); + fprintf(stderr, " --check_small_rollback (Only valid in --singlex mode) Verify that very little data was saved in the rollback logs.\n"); fprintf(stderr, " --prelock Prelock the database.\n"); fprintf(stderr, " --prelockflag Prelock the database and send the DB_PRELOCKED_WRITE flag.\n"); fprintf(stderr, " --abort Abort the singlex after the transaction is over. 
(Requires --singlex.)\n"); @@ -463,8 +463,8 @@ int main (int argc, const char *argv[]) { singlex = 1; } else if (strcmp(arg, "--insert1first") == 0) { insert1first = 1; - } else if (strcmp(arg, "--check_small_rolltmp") == 0) { - check_small_rolltmp = 1; + } else if (strcmp(arg, "--check_small_rollback") == 0) { + check_small_rollback = 1; } else if (strcmp(arg, "--xcount") == 0) { if (i+1 >= argc) return print_usage(argv[0]); items_per_transaction = strtoll(argv[++i], &endptr, 10); assert(*endptr == 0); @@ -560,13 +560,13 @@ int main (int argc, const char *argv[]) { printf("insertions of %d per batch%s\n", items_per_iteration, do_transactions ? " (with transactions)" : ""); } #if !defined TOKUDB - if (check_small_rolltmp) { - fprintf(stderr, "--check_small_rolltmp only works on the TokuDB (not BDB)\n"); + if (check_small_rollback) { + fprintf(stderr, "--check_small_rollback only works on the TokuDB (not BDB)\n"); return print_usage(argv[0]); } #endif - if (check_small_rolltmp && !singlex) { - fprintf(stderr, "--check_small_rolltmp requires --singlex\n"); + if (check_small_rollback && !singlex) { + fprintf(stderr, "--check_small_rollback requires --singlex\n"); return print_usage(argv[0]); } benchmark_setup(); diff --git a/src/tests/Makefile b/src/tests/Makefile index 0256b5798ab..99033d8c93a 100644 --- a/src/tests/Makefile +++ b/src/tests/Makefile @@ -332,6 +332,7 @@ endif mkdir dir.$*.c.tdb.recover && \ cp dir.$*.c.tdb/tokudb.directory dir.$*.c.tdb.recover/ && \ cp dir.$*.c.tdb/tokudb.environment dir.$*.c.tdb.recover/ && \ + cp dir.$*.c.tdb/tokudb.rollback dir.$*.c.tdb.recover/ && \ cp dir.$*.c.tdb/*.tokulog dir.$*.c.tdb.recover/ && \ echo doing recovery &&\ $(VGRIND) ../../newbrt/tdb-recover dir.$*.c.tdb.recover dir.$*.c.tdb.recover && \ diff --git a/src/tests/bug1381.c b/src/tests/bug1381.c index af9c68bc429..d2cbea6b7fc 100644 --- a/src/tests/bug1381.c +++ b/src/tests/bug1381.c @@ -60,13 +60,13 @@ static void do_1381_maybe_lock (int do_table_lock, u_int64_t 
*raw_count) { } r = txn->txn_stat(txn, &s2); CKERR(r); - //printf("Raw counts = %" PRId64 ", %" PRId64 "\n", s1->rolltmp_raw_count, s2->rolltmp_raw_count); + //printf("Raw counts = %" PRId64 ", %" PRId64 "\n", s1->rollback_raw_count, s2->rollback_raw_count); - *raw_count = s2->rolltmp_raw_count - s1->rolltmp_raw_count; + *raw_count = s2->rollback_raw_count - s1->rollback_raw_count; if (do_table_lock) { - assert(s1->rolltmp_raw_count == s2->rolltmp_raw_count); + assert(s1->rollback_raw_count == s2->rollback_raw_count); } else { - assert(s1->rolltmp_raw_count < s2->rolltmp_raw_count); + assert(s1->rollback_raw_count < s2->rollback_raw_count); } toku_free(s1); toku_free(s2); diff --git a/src/tests/diskfull.c b/src/tests/diskfull.c index 50204b3cd88..7d2cb11a0d6 100644 --- a/src/tests/diskfull.c +++ b/src/tests/diskfull.c @@ -91,7 +91,7 @@ do_db_work(void) { } if (did_fail) goto shutdown2; - // Put an extra item in so that the rolltmp file will be created. + // Put an extra item in r=env->txn_begin(env, 0, &tid, 0); assert(r==0); r=db->put(db, tid, dbt_init(&key, "a", 2), dbt_init(&data, "b", 2), 0); DOERR(r); if (did_fail) { diff --git a/src/tests/stat64.c b/src/tests/stat64.c index 5e194fb30fd..8068882f18b 100644 --- a/src/tests/stat64.c +++ b/src/tests/stat64.c @@ -21,7 +21,7 @@ test_stat64 (unsigned int N) { DB_TXN *txn; r = db_env_create(&env, 0); CKERR(r); - r = env->set_cachesize(env, 0, 10*1000000, 1); + r = env->set_cachesize(env, 0, 20*1000000, 1); r = env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = db_create(&db, env, 0); CKERR(r); @@ -38,6 +38,10 @@ test_stat64 (unsigned int N) { unsigned int i; u_int64_t dsize=0; for (i=0; i1 && i % (1<<14) == 0) { + printf("%s(total=%u) inserted %u so far\n", __FILE__, N, i); + fflush(stdout); + } char hello[30], there[30]; snprintf(hello, sizeof(hello), "hello%8d", i); snprintf(there, sizeof(there), "there%d", i); diff --git 
a/src/tests/test1324.c b/src/tests/test1324.c deleted file mode 100644 index 129a854b2ed..00000000000 --- a/src/tests/test1324.c +++ /dev/null @@ -1,85 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." -#include "test.h" -/* Test for #1324. Make sure rolltmp files are removed. */ -#include -#include - -#ifndef USE_TDB -#error This test only works for TokuDB. -#endif - -static void mkfile (const char *fname) { - mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO; - int fd = open(fname, O_WRONLY | O_CREAT | O_BINARY, mode); - if (fd<0) perror("opening"); - assert(fd>=0); - toku_os_full_write(fd, "hello\n", 6); - int r = close(fd); assert(r==0); -} - -static void -do_1324 (int moreflags) -{ - const char fname[] = ENVDIR "/__tokudb_rolltmp.12345"; - const char fnamekeep[] = ENVDIR "/keepme"; - - system("rm -rf " ENVDIR); - toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); - mkfile(fname); - mkfile(fnamekeep); - - const int envflags = DB_CREATE|DB_INIT_MPOOL|DB_INIT_TXN|DB_INIT_LOCK |DB_THREAD |DB_PRIVATE | DB_RECOVER | moreflags; - - { - DB_ENV *env; - int r; - - if (moreflags & DB_INIT_LOG) { - // create the log - r = db_env_create(&env, 0); CKERR(r); - r = env->open(env, ENVDIR, envflags & ~DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); - r = env->close(env, 0); CKERR(r); - } - - r = db_env_create(&env, 0); CKERR(r); - r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); - - { - toku_struct_stat sbuf; - r = toku_stat(fname, &sbuf); - if (r==0) { - fprintf(stderr, "The rolltmp file %s should have been deleted, but was not.\n", fname); - } - assert(r!=0); - } - { - toku_struct_stat sbuf; - r = toku_stat(fnamekeep, &sbuf); - if (r!=0) { - fprintf(stderr, "The keepme file %s should NOT have been deleted, but was not.\n", fnamekeep); - } - assert(r==0); - } - - r = env->close(env, 0); CKERR(r); - } - system("ls -l " ENVDIR); - // make sure we can open the env again. 
- { - DB_ENV *env; - int r; - r = db_env_create(&env, 0); CKERR(r); - r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); - r = env->close(env, 0); CKERR(r); - } -} - -int -test_main (int argc, char *const argv[]) -{ - parse_args(argc, argv); - do_1324(DB_INIT_LOG); - do_1324(0); - return 0; -} diff --git a/src/ydb.c b/src/ydb.c index 8bfc303957b..55683b75400 100644 --- a/src/ydb.c +++ b/src/ydb.c @@ -484,13 +484,6 @@ env_setup_real_log_dir(DB_ENV *env) { } } -static int delete_rolltmp_files(DB_ENV *env) { - assert(env->i->real_data_dir); - assert(env->i->real_log_dir); - int r = tokudb_recover_delete_rolltmp_files(env->i->real_data_dir, env->i->real_log_dir); - return r; -} - static int ydb_do_recovery (DB_ENV *env) { assert(env->i->real_log_dir); @@ -600,9 +593,9 @@ ydb_recover_log_exists(DB_ENV *env) { // Set *valid_newenv if creating a new environment (all files missing). // (Note, if special dictionaries exist, then they were created transactionally and log should exist.) 
static int -validate_env(DB_ENV * env, BOOL * valid_newenv) { +validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) { int r; - BOOL expect_newenv; // set true if we expect to create a new env + BOOL expect_newenv = FALSE; // set true if we expect to create a new env toku_struct_stat buf; char* path = NULL; @@ -610,11 +603,12 @@ validate_env(DB_ENV * env, BOOL * valid_newenv) { path = toku_construct_full_name(2, env->i->dir, environmentdictionary); assert(path); r = toku_stat(path, &buf); + int stat_errno = errno; toku_free(path); if (r == 0) { expect_newenv = FALSE; // persistent info exists } - else if (errno == ENOENT) { + else if (stat_errno == ENOENT) { expect_newenv = TRUE; r = 0; } @@ -623,17 +617,41 @@ validate_env(DB_ENV * env, BOOL * valid_newenv) { assert(r); } + // Test for rollback cachefile + if (r == 0 && need_rollback_cachefile) { + path = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME); + assert(path); + r = toku_stat(path, &buf); + stat_errno = errno; + toku_free(path); + if (r == 0) { + if (expect_newenv) // rollback cachefile exists, but persistent env is missing + r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n"); + } + else if (stat_errno == ENOENT) { + if (!expect_newenv) // rollback cachefile is missing but persistent env exists + r = toku_ydb_do_error(env, ENOENT, "rollback cachefile directory is missing\n"); + else + r = 0; // both rollback cachefile and persistent env are missing + } + else { + r = toku_ydb_do_error(env, stat_errno, "Unable to access rollback cachefile\n"); // report the errno saved right after toku_stat(); toku_free() may have changed errno since + assert(r); + } + } + // Test for fileops directory if (r == 0) { path = toku_construct_full_name(2, env->i->dir, fileopsdirectory); assert(path); r = toku_stat(path, &buf); + stat_errno = errno; toku_free(path); if (r == 0) { if (expect_newenv) // fileops directory exists, but persistent env is missing r = toku_ydb_do_error(env, ENOENT, "Persistent environment is missing\n"); } - else if (errno == ENOENT) { 
+ else if (stat_errno == ENOENT) { if (!expect_newenv) // fileops directory is missing but persistent env exists r = toku_ydb_do_error(env, ENOENT, "Fileops directory is missing\n"); else @@ -739,16 +757,16 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { env_setup_real_data_dir(env); env_setup_real_log_dir(env); - r = validate_env(env, &newenv); // make sure that environment is either new or complete + BOOL need_rollback_cachefile = FALSE; + if (flags & (DB_INIT_TXN | DB_INIT_LOG)) { + need_rollback_cachefile = TRUE; + } + + r = validate_env(env, &newenv, need_rollback_cachefile); // make sure that environment is either new or complete if (r != 0) return r; unused_flags &= ~DB_INIT_TXN & ~DB_INIT_LOG; - if (flags & DB_INIT_TXN) { - r = delete_rolltmp_files(env); - if (r != 0) return r; - } - // do recovery only if there exists a log and recovery is requested // otherwise, a log is created when the logger is opened later if (!newenv) { @@ -805,6 +823,8 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { assert (using_txns); toku_logger_set_cachetable(env->i->logger, env->i->cachetable); toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm); + r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, newenv); + assert(r==0); } DB_TXN *txn=NULL; @@ -894,7 +914,6 @@ static int toku_env_close(DB_ENV * env, u_int32_t flags) { } } } - if (env->i->cachetable) { toku_ydb_unlock(); // ydb lock must not be held when shutting down minicron toku_cachetable_minicron_shutdown(env->i->cachetable); @@ -907,6 +926,17 @@ static int toku_env_close(DB_ENV * env, u_int32_t flags) { toku_ydb_do_error(env, r, "Cannot close environment (error during checkpoint)\n"); goto panic_and_quit_early; } + r = toku_logger_close_rollback(env->i->logger, FALSE); + if (r) { + toku_ydb_do_error(env, r, "Cannot close environment (error during closing rollback cachefile)\n"); + goto panic_and_quit_early; 
+ } + //Do a second checkpoint now that the rollback cachefile is closed. + r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL); + if (r) { + toku_ydb_do_error(env, r, "Cannot close environment (error during checkpoint)\n"); + goto panic_and_quit_early; + } r = toku_logger_shutdown(env->i->logger); if (r) { toku_ydb_do_error(env, r, "Cannot close environment (error during logger shutdown)\n"); @@ -1954,7 +1984,7 @@ static u_int32_t locked_txn_id(DB_TXN *txn) { static int toku_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) { XMALLOC(*txn_stat); - return toku_logger_txn_rolltmp_raw_count(db_txn_struct_i(txn)->tokutxn, &(*txn_stat)->rolltmp_raw_count); + return toku_logger_txn_rollback_raw_count(db_txn_struct_i(txn)->tokutxn, &(*txn_stat)->rollback_raw_count); } static int locked_txn_stat (DB_TXN *txn, struct txn_stat **txn_stat) { @@ -5018,7 +5048,7 @@ int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) { toku_lt_neg_infinity, toku_lt_neg_infinity, toku_lt_infinity, toku_lt_infinity); if (r==0) { - r = toku_brt_note_table_lock(db->i->brt, db_txn_struct_i(txn)->tokutxn); // tell the BRT layer that the table is locked (so that it can reduce the amount of rollback (rolltmp) data. + r = toku_brt_note_table_lock(db->i->brt, db_txn_struct_i(txn)->tokutxn, FALSE); // tell the BRT layer that the table is locked (so that it can reduce the amount of rollback data. 
} return r; diff --git a/toku_include/memory.h b/toku_include/memory.h index bc83df6264e..5ca78a15fb3 100644 --- a/toku_include/memory.h +++ b/toku_include/memory.h @@ -21,7 +21,8 @@ enum typ_tag { TYP_BRTNODE = 0xdead0001, TYP_GPMA, TYP_TOKULOGGER, TYP_TOKUTXN, - TYP_LEAFENTRY + TYP_LEAFENTRY, + TYP_ROLLBACK_LOG_NODE }; /* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */ diff --git a/toku_include/toku_list.h b/toku_include/toku_list.h index 15b57f9d624..1db086dfad1 100644 --- a/toku_include/toku_list.h +++ b/toku_include/toku_list.h @@ -78,7 +78,7 @@ static inline void toku_list_move(struct toku_list *newhead, struct toku_list *o // Note: Need the extra level of parens in these macros so that // toku_list_struct(h, foo, b)->zot // will work right. Otherwise the type cast will try to include ->zot, and it will be all messed up. -#if defined(__GNUC__) && __GNUC__ >= 4 +#if (defined(__GNUC__) && __GNUC__ >= 4) || defined(__builtin_offsetof) #define toku_list_struct(p, t, f) ((t*)((char*)(p) - __builtin_offsetof(t, f))) #else #define toku_list_struct(p, t, f) ((t*)((char*)(p) - ((char*)&((t*)0)->f))) diff --git a/windows/misc.h b/windows/misc.h index 36cad7d3e11..f299029c4e2 100644 --- a/windows/misc.h +++ b/windows/misc.h @@ -7,6 +7,7 @@ extern "C" { #include "toku_os.h" #include +#include //These are functions that really exist in windows but are named //something else. @@ -82,6 +83,9 @@ int mkstemp(char * ttemplate); toku_off_t ftello(FILE *stream); +#define __builtin_offsetof(type, member) offsetof(type, member) + + #if defined(__cplusplus) }; #endif