diff --git a/buildheader/db.h_4_1 b/buildheader/db.h_4_1 index ddadd29dd2e..5f7e774b8e5 100644 --- a/buildheader/db.h_4_1 +++ b/buildheader/db.h_4_1 @@ -336,9 +336,10 @@ struct __toku_dbt { u_int32_t flags; /* 32-bit offset=20 size=4, 64=bit offset=24 size=4 */ /* 4 more bytes of alignment in the 64-bit case. */ }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -364,8 +365,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. 
Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/buildheader/db.h_4_3 b/buildheader/db.h_4_3 index 796d73c713a..7d3dcbdfb7d 100644 --- a/buildheader/db.h_4_3 +++ b/buildheader/db.h_4_3 @@ -346,9 +346,10 @@ struct __toku_dbt { u_int32_t flags; /* 32-bit offset=20 size=4, 64=bit offset=24 size=4 */ /* 4 more bytes of alignment in the 64-bit case. */ }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -374,8 +375,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. 
Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/buildheader/db.h_4_4 b/buildheader/db.h_4_4 index 99bf64ce501..3311e4225a8 100644 --- a/buildheader/db.h_4_4 +++ b/buildheader/db.h_4_4 @@ -350,9 +350,10 @@ struct __toku_dbt { u_int32_t flags; /* 32-bit offset=20 size=4, 64=bit offset=24 size=4 */ /* 4 more bytes of alignment in the 64-bit case. */ }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -378,8 +379,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. 
*/; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/buildheader/db.h_4_5 b/buildheader/db.h_4_5 index c59394677c9..b05d25ce025 100644 --- a/buildheader/db.h_4_5 +++ b/buildheader/db.h_4_5 @@ -350,9 +350,10 @@ struct __toku_dbt { u_int32_t flags; /* 32-bit offset=24 size=4, 64=bit offset=32 size=4 */ /* 4 more bytes of alignment in the 64-bit case. */ }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -378,8 +379,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. 
*/; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/buildheader/db.h_4_6 b/buildheader/db.h_4_6 index 239a0315cd4..942f722f6c0 100644 --- a/buildheader/db.h_4_6 +++ b/buildheader/db.h_4_6 @@ -353,9 +353,10 @@ struct __toku_dbt { u_int32_t flags; /* 32-bit offset=24 size=4, 64=bit offset=32 size=4 */ /* 4 more bytes of alignment in the 64-bit case. 
*/ }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -382,8 +383,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. 
Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/buildheader/make_db_h.c b/buildheader/make_db_h.c index 2e4ccedb693..9c2fa0b2252 100644 --- a/buildheader/make_db_h.c +++ b/buildheader/make_db_h.c @@ -572,9 +572,11 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__ assert(sizeof(dbt_fields32)==sizeof(dbt_fields64)); print_struct("dbt", 0, dbt_fields32, dbt_fields64, sizeof(dbt_fields32)/sizeof(dbt_fields32[0]), 0); - printf("typedef int (*toku_dbt_upgradef)(DB*,\n"); - printf(" u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val,\n"); - printf(" u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val);\n"); + //descriptor + printf("typedef struct __toku_descriptor {\n"); + printf(" u_int32_t version;\n"); + printf(" DBT dbt;\n"); + printf("} *DESCRIPTOR, DESCRIPTOR_S;\n"); assert(sizeof(db_fields32)==sizeof(db_fields64)); { @@ -601,8 +603,8 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__ "const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/", "int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */", "int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */", - "const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */", - "int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. 
Available only while db is open */", + "DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */", + "int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. Available only while db is open */", "int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */", "int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */", "int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */", diff --git a/buildheader/tdb.h b/buildheader/tdb.h index 9f57dafb631..7926e59629b 100644 --- a/buildheader/tdb.h +++ b/buildheader/tdb.h @@ -323,9 +323,10 @@ struct __toku_dbt { u_int32_t ulen; u_int32_t flags; }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -351,8 +352,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. 
Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/include/db.h b/include/db.h index 9f57dafb631..7926e59629b 100644 --- a/include/db.h +++ b/include/db.h @@ -323,9 +323,10 @@ struct __toku_dbt { u_int32_t ulen; u_int32_t flags; }; -typedef int (*toku_dbt_upgradef)(DB*, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val); +typedef struct __toku_descriptor { + u_int32_t version; + DBT dbt; +} *DESCRIPTOR, DESCRIPTOR_S; //One header is included in 'data' //One header is included in 'additional for checkpoint' typedef struct __toku_db_fragmentation { @@ -351,8 +352,8 @@ struct __toku_db { const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; - const DBT *descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) /* set row/dictionary descriptor for a db. 
Available only while db is open */; + DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; + int (*set_descriptor) (DB*, u_int32_t version, const DBT* descriptor) /* set row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; int (*getf_get_both)(DB*, DB_TXN*, u_int32_t, DBT*, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_get_both without a persistent cursor) */; int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; diff --git a/linux/file.c b/linux/file.c index 527c7ddbfa1..5f4bc6bf898 100644 --- a/linux/file.c +++ b/linux/file.c @@ -372,6 +372,25 @@ toku_fsync_dirfd_without_accounting(DIR *dirp) { return r; } +int +toku_fsync_dir_by_name_without_accounting(const char *dir_name) { + int r = 0; + DIR * dir = opendir(dir_name); + if (!dir) { + r = errno; + assert(r); + } + else { + r = toku_fsync_dirfd_without_accounting(dir); + int rc = closedir(dir); + if (r==0 && rc!=0) { + r = errno; + assert(r); + } + } + return r; +} + // include fsync in scheduling accounting int toku_file_fsync(int fd) { @@ -421,16 +440,7 @@ toku_fsync_directory(const char *fname) { } if (result == 0) { - // fsync the dir - DIR *d = opendir(dirname); - if (d == NULL) { - result = errno; - } else { - result = toku_fsync_dirfd_without_accounting(d); - int r = closedir(d); - if (result == 0 && r != 0) - result = errno; - } + result = toku_fsync_dir_by_name_without_accounting(dirname); } toku_free(dirname); return result; diff --git a/newbrt/Makefile b/newbrt/Makefile index 4ecd8f0d365..71d32dfab0e 100644 --- a/newbrt/Makefile +++ b/newbrt/Makefile @@ -61,6 +61,7 @@ BRT_SOURCES = \ logfilemgr \ logger \ log_code \ + log_upgrade \ log_print \ logcursor \ memarena \ @@ -94,6 +95,8 @@ BRT_O_FILES = $(patsubst %,%.$(OEXT),$(BRT_SOURCES)) newbrt.$(OEXT): 
$(BRT_C_FILES) $(DEPEND_COMPILE) $(CC) -c $(BRT_C_FILES) $(COMBINE_C) $(CPPFLAGS) $(CFLAGS) $(OOUTPUT)$@ +brt-serialize.$(OEXT): $(wildcard backwards_*.c) + ifneq ($(CYGWIN),) NEWBRT_O_FILES = $(BRT_O_FILES) else ifeq ($(CC),icc) diff --git a/newbrt/backwards_10.c b/newbrt/backwards_10.c deleted file mode 100644 index cd54c32ab96..00000000000 --- a/newbrt/backwards_10.c +++ /dev/null @@ -1,1086 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "$Id$" -#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved." -#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." - -#include "ule.h" -#include "fifo.h" -/******************* - * Purpose of this file is to provide backwards compatibility with earlier versions - * of the file format. - * - * Used by brt-serialize.c. - * - * NOTE: All functions in this file are static. - * This file is included in brt-serialize.c. It should not be compiled by itself. 
- * - * - */ - -enum { BRT_CMD_OVERHEAD_10 = (1 // the type - + 8) // the xid -}; -// FIFO_10 (data structure changed, so we need to provide the old one) -// Calculate the fingerprint for a kvpair -static void toku_calc_more_murmur_kvpair (struct x1764 *mm, const void *key, int keylen, const void *val, int vallen) { - int i; - i = toku_htod32(keylen); - x1764_add(mm, (void*)&i, 4); - x1764_add(mm, key, keylen); - i = toku_htod32(vallen); - x1764_add(mm, (void*)&i, 4); - x1764_add(mm, val, vallen); -} - -static u_int32_t calc_fingerprint_cmd10 (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen) { - unsigned char type_c = (unsigned char)type; - unsigned int a = toku_htod32(xid>>32); - unsigned int b = toku_htod32(xid&0xffffffff); - struct x1764 mm; - x1764_init(&mm); - x1764_add(&mm, &type_c, 1); - x1764_add(&mm, &a, 4); - x1764_add(&mm, &b, 4); - toku_calc_more_murmur_kvpair(&mm, key, keylen, val, vallen); - return x1764_finish(&mm); -} - -#define FIFO10_ITERATE(fifo10,keyvar,keylenvar,datavar,datalenvar,typevar,xidvar,body) do { \ - int fifo10_iterate_off; \ - for (fifo10_iterate_off = toku_fifo10_iterate_internal_start(fifo10); \ - toku_fifo10_iterate_internal_has_more(fifo10, fifo10_iterate_off); \ - fifo10_iterate_off = toku_fifo10_iterate_internal_next(fifo10, fifo10_iterate_off)) { \ - struct fifo10_entry *e = toku_fifo10_iterate_internal_get_entry(fifo10, fifo10_iterate_off); \ - bytevec keyvar = e->key; \ - ITEMLEN keylenvar = e->keylen; \ - bytevec datavar = e->key + e->keylen; \ - ITEMLEN datalenvar = e->vallen; \ - int typevar = e->type; \ - TXNID xidvar = e->xid; \ - body; \ - } } while (0) -struct fifo10_entry { - unsigned int keylen; - unsigned int vallen; - unsigned char type; - TXNID xid; - unsigned char key[]; -}; - -struct fifo { - int n_items_in_fifo; - char *memory; // An array of bytes into which fifo10_entries are embedded. 
- int memory_size; // How big is fifo10_memory - int memory_start; // Where is the first used byte? - int memory_used; // How many bytes are in use? -}; - -const int fifo10_initial_size = 4096; -static void fifo10_init(struct fifo *fifo10) { - fifo10->n_items_in_fifo = 0; - fifo10->memory = 0; - fifo10->memory_size = 0; - fifo10->memory_start = 0; - fifo10->memory_used = 0; -} - -static int fifo10_entry_size(struct fifo10_entry *entry) { - return sizeof (struct fifo10_entry) + entry->keylen + entry->vallen; -} - -static int toku_fifo10_create(FIFO *ptr) { - struct fifo *MALLOC(fifo10); - if (fifo10 == 0) return ENOMEM; - fifo10_init(fifo10); - *ptr = fifo10; - return 0; -} - -static void toku_fifo10_free(FIFO *ptr) { - FIFO fifo10 = *ptr; - if (fifo10->memory) toku_free(fifo10->memory); - fifo10->memory=0; - toku_free(fifo10); - *ptr = 0; -} - -static int next_power_of_two (int n) { - int r = 4096; - while (r < n) { - r*=2; - assert(r>0); - } - return r; -} - -static int toku_fifo10_enq(FIFO fifo10, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, TXNID xid) { - int need_space_here = sizeof(struct fifo10_entry) + keylen + datalen; - int need_space_total = fifo10->memory_used+need_space_here; - if (fifo10->memory == NULL) { - fifo10->memory_size = next_power_of_two(need_space_total); - fifo10->memory = toku_malloc(fifo10->memory_size); - } - if (fifo10->memory_start+need_space_total > fifo10->memory_size) { - // Out of memory at the end. 
- int next_2 = next_power_of_two(need_space_total); - if ((2*next_2 > fifo10->memory_size) - || (8*next_2 < fifo10->memory_size)) { - // resize the fifo10 - char *newmem = toku_malloc(next_2); - char *oldmem = fifo10->memory; - if (newmem==0) return ENOMEM; - memcpy(newmem, oldmem+fifo10->memory_start, fifo10->memory_used); - fifo10->memory_size = next_2; - fifo10->memory_start = 0; - fifo10->memory = newmem; - toku_free(oldmem); - } else { - // slide things over - memmove(fifo10->memory, fifo10->memory+fifo10->memory_start, fifo10->memory_used); - fifo10->memory_start = 0; - } - } - struct fifo10_entry *entry = (struct fifo10_entry *)(fifo10->memory + fifo10->memory_start + fifo10->memory_used); - entry->type = (unsigned char)type; - entry->xid = xid; - entry->keylen = keylen; - memcpy(entry->key, key, keylen); - entry->vallen = datalen; - memcpy(entry->key + keylen, data, datalen); - fifo10->n_items_in_fifo++; - fifo10->memory_used += need_space_here; - return 0; -} - -static int toku_fifo10_iterate_internal_start(FIFO fifo10) { return fifo10->memory_start; } -static int toku_fifo10_iterate_internal_has_more(FIFO fifo10, int off) { return off < fifo10->memory_start + fifo10->memory_used; } -static int toku_fifo10_iterate_internal_next(FIFO fifo10, int off) { - struct fifo10_entry *e = (struct fifo10_entry *)(fifo10->memory + off); - return off + fifo10_entry_size(e); -} -static struct fifo10_entry * toku_fifo10_iterate_internal_get_entry(FIFO fifo10, int off) { - return (struct fifo10_entry *)(fifo10->memory + off); -} - -// LEAFENTRY ACCESSORS -// -// This ugly factorization of the macro is done so that we can do ## or not depending on which version of the -// compiler we are using, without repeating all this crufty offset calculation. 
- -static inline void putint (unsigned char *p, u_int32_t i) { -#if 1 - *(u_int32_t*)p = toku_htod32(i); -#else - p[0]=(i>>24)&0xff; - p[1]=(i>>16)&0xff; - p[2]=(i>> 8)&0xff; - p[3]=(i>> 0)&0xff; -#endif -} -static inline void putint64 (unsigned char *p, u_int64_t i) { - putint(p, (u_int32_t)(i>>32)); - putint(p+4, (u_int32_t)(i&0xffffffff)); -} -static inline u_int32_t getint (unsigned char *p) { -#if 1 - return toku_dtoh32(*(u_int32_t*)p); -#else - return (p[0]<<24)+(p[1]<<16)+(p[2]<<8)+(p[3]); -#endif -} -static inline u_int64_t getint64 (unsigned char *p) { - u_int64_t H = getint(p); - u_int64_t L = getint(p+4); - return (H<<32) + L; -} - - -#define DO_LE_COMMITTED(funname,le) case LE_COMMITTED: { \ - unsigned char* __klenaddr = 1+(unsigned char*)le; u_int32_t __klen = getint(__klenaddr); \ - unsigned char* __kvaladdr = 4 + __klenaddr; \ - unsigned char* __clenaddr = __klen + __kvaladdr; u_int32_t __clen = getint(__clenaddr); \ - unsigned char* __cvaladdr = 4 + __clenaddr; \ - return funname ## _le10_committed(__klen, __kvaladdr, __clen, __cvaladdr - -#define DO_LE_BOTH(funname,le) case LE_BOTH: { \ - unsigned char* __xidaddr = 1+(unsigned char*)le; u_int64_t __xid = getint64(__xidaddr); \ - unsigned char* __klenaddr = 8 + __xidaddr; u_int32_t __klen = getint(__klenaddr); \ - unsigned char* __kvaladdr = 4 + __klenaddr; \ - unsigned char* __clenaddr = __klen + __kvaladdr; u_int32_t __clen = getint(__clenaddr); \ - unsigned char* __cvaladdr = 4 + __clenaddr; \ - unsigned char* __plenaddr = __clen + __cvaladdr; u_int32_t __plen = getint(__plenaddr); \ - unsigned char* __pvaladdr = 4 + __plenaddr; \ - return funname ## _le10_both(__xid, __klen, __kvaladdr, __clen, __cvaladdr, __plen, __pvaladdr - -#define DO_LE_PROVDEL(funname,le ) case LE_PROVDEL: { \ - unsigned char* __xidaddr = 1+(unsigned char*)le; u_int64_t __xid = getint64(__xidaddr); \ - unsigned char* __klenaddr = 8 + __xidaddr; u_int32_t __klen = getint(__klenaddr); \ - unsigned char* __kvaladdr = 4 + 
__klenaddr; \ - unsigned char* __dlenaddr = __klen + __kvaladdr; u_int32_t __dlen = getint(__dlenaddr); \ - unsigned char* __dvaladdr = 4 + __dlenaddr; \ - return funname ## _le10_provdel(__xid, __klen, __kvaladdr, __dlen, __dvaladdr - -#define DO_LE_PROVPAIR(funname,le) case LE_PROVPAIR: { \ - unsigned char* __xidaddr = 1+(unsigned char*)le; u_int64_t __xid = getint64(__xidaddr); \ - unsigned char* __klenaddr = 8 + __xidaddr; u_int32_t __klen = getint(__klenaddr); \ - unsigned char* __kvaladdr = 4 + __klenaddr; \ - unsigned char* __plenaddr = __klen + __kvaladdr; u_int32_t __plen = getint(__plenaddr); \ - unsigned char* __pvaladdr = 4 + __plenaddr; \ - return funname ## _le10_provpair(__xid, __klen, __kvaladdr, __plen, __pvaladdr - -#ifdef __ICL -#define LESWITCHCALL(le,funname, ...) do { \ - switch(get_le_state(le)) { \ - DO_LE_COMMITTED(funname,le) , __VA_ARGS__); } \ - DO_LE_BOTH (funname,le) , __VA_ARGS__); } \ - DO_LE_PROVDEL (funname,le) , __VA_ARGS__); } \ - DO_LE_PROVPAIR (funname,le) , __VA_ARGS__); } \ - } abort(); } while (0) -#else -#define LESWITCHCALL(le,funname, ...) 
do { \ - switch(get_le_state(le)) { \ - DO_LE_COMMITTED(funname,le) , ## __VA_ARGS__); } \ - DO_LE_BOTH (funname,le) , ## __VA_ARGS__); } \ - DO_LE_PROVDEL (funname,le) , ## __VA_ARGS__); } \ - DO_LE_PROVPAIR (funname,le) , ## __VA_ARGS__); } \ - } abort(); } while (0) -#endif - -static u_int32_t memsize_le10_committed (u_int32_t keylen, void *key __attribute__((__unused__)), - u_int32_t vallen, void *val __attribute__((__unused__))) { - return 1+ 2*4 + keylen + vallen; -} - -static u_int32_t memsize_le10_both (TXNID txnid __attribute__((__unused__)), - u_int32_t klen, void *kval __attribute__((__unused__)), - u_int32_t clen, void *cval __attribute__((__unused__)), - u_int32_t plen, void *pval __attribute__((__unused__))) { - return 1 + 8 + 4*3 + klen + clen + plen; -} - -static u_int32_t memsize_le10_provdel (TXNID txnid __attribute__((__unused__)), - u_int32_t klen, void *kval __attribute__((__unused__)), - u_int32_t clen, void *cval __attribute__((__unused__))) { - return 1 + 8 + 4*2 + klen + clen; -} - -static u_int32_t memsize_le10_provpair (TXNID txnid __attribute__((__unused__)), - u_int32_t klen, void *kval __attribute__((__unused__)), - u_int32_t plen, void *pval __attribute__((__unused__))) { - return 1 + 8 + 4*2 + klen + plen; -} - -static u_int32_t leafentry_memsize_10 (LEAFENTRY le) { - LESWITCHCALL(le, memsize); - abort(); return 0; // make certain compilers happy -} - -static u_int32_t disksize_le10_committed (u_int32_t keylen, void *key __attribute__((__unused__)), - u_int32_t vallen, void *val __attribute__((__unused__))) { - return 1 + 4 + 4 + keylen + vallen; -} - -static u_int32_t disksize_le10_both (TXNID txnid __attribute__((__unused__)), - u_int32_t klen, void *kval __attribute__((__unused__)), - u_int32_t clen, void *cval __attribute__((__unused__)), - u_int32_t plen, void *pval __attribute__((__unused__))) { - return 1 + 8 + 4*3 + klen + clen + plen; -} - -static u_int32_t disksize_le10_provdel (TXNID txnid __attribute__((__unused__)), - 
u_int32_t klen, void *kval __attribute__((__unused__)), - u_int32_t clen, void *cval __attribute__((__unused__))) { - return 1 + 8 + 4 + 4 + klen + clen; -} - -static u_int32_t disksize_le10_provpair (TXNID txnid __attribute__((__unused__)), - u_int32_t klen, void *kval __attribute__((__unused__)), - u_int32_t plen, void *pval __attribute__((__unused__))) { - return 1 + 8 + 4 + 4 + klen + plen; -} - - -static u_int32_t -le10_disksize_internal (LEAFENTRY le) { - LESWITCHCALL(le, disksize); - abort(); return 0; // make certain compilers happy -} - -static u_int32_t le10_disksize (LEAFENTRY le) { - u_int32_t d = le10_disksize_internal(le); - // this computation is currently identical to the _disksize_internal - u_int32_t m = leafentry_memsize_10(le); - assert(m==d); - return d; -} - -//LEAFENTRY constructors - -//Constructors for version 10 leafentries, possibly needed for upgrades. -int -le10_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) { - size_t size = 9+klen+dlen; - unsigned char *lec=toku_malloc(size); - assert(lec); - lec[0] = LE_COMMITTED; - putint(lec+1, klen); - memcpy(lec+1+4, kval, klen); - putint(lec+1+4+klen, dlen); - memcpy(lec+1+4+klen+4, dval, dlen); - *resultsize=size; - *disksize = 1 + 4 + 4 + klen + dlen; - *result=(LEAFENTRY)lec; - return 0; -} - -int -le10_both (TXNID xid, u_int32_t klen, void* kval, u_int32_t clen, void* cval, u_int32_t plen, void* pval, - u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) { - size_t size = 1+8+4*3+klen+clen+plen; - unsigned char *lec=toku_malloc(size); - assert(lec); - lec[0] = LE_BOTH; - putint64(lec+1, xid); - putint (lec+1+8, klen); - memcpy (lec+1+8+4, kval, klen); - putint (lec+1+8+4+klen, clen); - memcpy (lec+1+8+4+klen+4, cval, clen); - putint (lec+1+8+4+klen+4+clen, plen); - memcpy (lec+1+8+4+klen+4+clen+4, pval, plen); - *resultsize=size; - *disksize = 1 + 8 + 4*3 + klen + clen + plen; - 
*result=(LEAFENTRY)lec; - return 0; - -} - -int -le10_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, - u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result) { - size_t size = 1 + 8 + 2*4 + klen + dlen; - unsigned char *lec= toku_malloc(size); - assert(lec); - lec[0] = LE_PROVDEL; - putint64(lec+1, xid); - putint (lec+1+8, klen); - memcpy (lec+1+8+4, kval, klen); - putint (lec+1+8+4+klen, dlen); - memcpy (lec+1+8+4+klen+4, dval, dlen); - *memsize=size; - *disksize = 1 + 4 + 4 + 8 + klen + dlen; - *result=(LEAFENTRY)lec; - return 0; -} - -int -le10_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval, u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result) { - size_t size = 1 + 8 + 2*4 + klen + plen; - unsigned char *lec= toku_malloc(size); - assert(lec); - lec[0] = LE_PROVPAIR; - putint64(lec+1, xid); - putint (lec+1+8, klen); - memcpy (lec+1+8+4, kval, klen); - putint (lec+1+8+4+klen, plen); - memcpy (lec+1+8+4+klen+4, pval, plen); - *memsize=size; - *disksize = 1 + 4 + 4 + 8 + klen + plen; - *result=(LEAFENTRY)lec; - return 0; -} - - - -// Given a version 10 header, create a version 11 header. -// If new memory is needed for the new header, allocate it here and free the memory of the old version header. -static int -upgrade_brtheader_10_11(struct brt_header **brth_10, struct brt_header ** brth_11) { - assert((*brth_10)->layout_version == BRT_LAYOUT_VERSION_10); - *brth_11 = *brth_10; - *brth_10 = NULL; - (*brth_11)->layout_version = BRT_LAYOUT_VERSION_11; - (*brth_11)->layout_version_original = BRT_LAYOUT_VERSION_10; - (*brth_11)->checkpoint_lsn = ZERO_LSN; //Can't reuse LSNs - return 0; -} - - -static int -deserialize_brtheader_10 (int fd, struct rbuf *rb, struct brt_header **brth) { - // We already know: - // we have an rbuf representing the header. 
- // The checksum has been validated - - //Steal rbuf (used to simplify merge, reduce diff size, and keep old code) - struct rbuf rc = *rb; - memset(rb, 0, sizeof(*rb)); - - //Verification of initial elements. - { - //Check magic number - bytevec magic; - rbuf_literal_bytes(&rc, &magic, 8); - assert(memcmp(magic,"tokudata",8)==0); - } - - - struct brt_header *CALLOC(h); - if (h==0) return errno; - int ret=-1; - if (0) { died1: toku_free(h); return ret; } - h->type = BRTHEADER_CURRENT; - h->checkpoint_header = NULL; - h->dirty=0; - h->panic = 0; - h->panic_string = 0; - toku_list_init(&h->live_brts); - toku_list_init(&h->zombie_brts); - toku_list_init(&h->checkpoint_before_commit_link); - //version MUST be in network order on disk regardless of disk order - h->layout_version = rbuf_network_int(&rc); - assert(h->layout_version==BRT_LAYOUT_VERSION_10); - - //Size MUST be in network order regardless of disk order. - u_int32_t size = rbuf_network_int(&rc); - assert(size==rc.size); - - bytevec tmp_byte_order_check; - rbuf_literal_bytes(&rc, &tmp_byte_order_check, 8); //Must not translate byte order - int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check; - assert(byte_order_stored == toku_byte_order_host); - - h->checkpoint_count = rbuf_ulonglong(&rc); - h->checkpoint_lsn = rbuf_lsn(&rc); - h->nodesize = rbuf_int(&rc); - DISKOFF translation_address_on_disk = rbuf_diskoff(&rc); - DISKOFF translation_size_on_disk = rbuf_diskoff(&rc); - assert(translation_address_on_disk>0); - assert(translation_size_on_disk>0); - - // printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, h->translated_blocknum_limit, h->block_translation_address_on_disk); - //Load translation table - { - lock_for_pwrite(); - unsigned char *XMALLOC_N(translation_size_on_disk, tbuf); - { - // This cast is messed up in 32-bits if the block translation table is ever more than 4GB. But in that case, the translation table itself won't fit in main memory. 
- ssize_t r = pread(fd, tbuf, translation_size_on_disk, translation_address_on_disk); - assert(r==translation_size_on_disk); - } - unlock_for_pwrite(); - // Create table and read in data. - toku_blocktable_create_from_buffer(&h->blocktable, - translation_address_on_disk, - translation_size_on_disk, - tbuf); - toku_free(tbuf); - } - - h->root = rbuf_blocknum(&rc); - h->root_hash.valid = FALSE; - h->flags = rbuf_int(&rc); - deserialize_descriptor_from(fd, h, &h->descriptor); - (void)rbuf_int(&rc); //Read in checksum and ignore (already verified). - if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;} - toku_free(rc.buf); - rc.buf = NULL; - *brth = h; - return 0; -} - -enum { uncompressed_magic_len_10 = (8 // tokuleaf or tokunode - +4 // version - +8 // lsn - ) -}; - -#define DO_DECOMPRESS_WORKER 1 - -struct decompress_work_10 { - toku_pthread_t id; - void *compress_ptr; - void *uncompress_ptr; - u_int32_t compress_size; - u_int32_t uncompress_size; -}; - -// initialize the decompression work -static void init_decompress_work_10(struct decompress_work_10 *w, - void *compress_ptr, u_int32_t compress_size, - void *uncompress_ptr, u_int32_t uncompress_size) { - memset(&w->id, 0, sizeof(w->id)); - w->compress_ptr = compress_ptr; w->compress_size = compress_size; - w->uncompress_ptr = uncompress_ptr; w->uncompress_size = uncompress_size; -} - -// do the decompression work -static void do_decompress_work_10(struct decompress_work_10 *w) { - uLongf destlen = w->uncompress_size; - int r = uncompress(w->uncompress_ptr, &destlen, - w->compress_ptr, w->compress_size); - assert(destlen==w->uncompress_size); - assert(r==Z_OK); -} - -#if DO_DECOMPRESS_WORKER - -static void *decompress_worker_10(void *); - -static void start_decompress_work_10(struct decompress_work_10 *w) { - int r = toku_pthread_create(&w->id, NULL, decompress_worker_10, w); assert(r == 0); -} - -static void wait_decompress_work_10(struct decompress_work_10 *w) { - void *ret; - int r = toku_pthread_join(w->id, 
&ret); assert(r == 0); -} - -static void *decompress_worker_10(void *arg) { - struct decompress_work_10 *w = (struct decompress_work_10 *) arg; - do_decompress_work_10(w); - return arg; -} - -#endif - -static int -verify_decompressed_brtnode_checksum (struct rbuf *rb) { - int r = 0; - - if (rb->size >= 4) { - uint32_t verify_size = rb->size - 4; //Not counting the checksum - - toku_trace("x1764"); - uint32_t crc = x1764_memory(rb->buf, verify_size); - toku_trace("x1764 done"); - - uint32_t *crcp = (uint32_t*)(((uint8_t*)rb->buf) + verify_size); - uint32_t storedcrc = toku_dtoh32(*crcp); - if (crc!=storedcrc) { - printf("Bad CRC\n"); - printf("%s:%d crc=%08x stored=%08x\n", __FILE__, __LINE__, crc, storedcrc); - r = toku_db_badformat(); - } - } - else r = toku_db_badformat(); - return r; -} - -static int -decompress_brtnode_from_raw_block_into_rbuf_10(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { - int r; - int i; - // get the number of compressed sub blocks - int n_sub_blocks; - int compression_header_offset; - { - n_sub_blocks = toku_dtoh32(*(u_int32_t*)(&raw_block[uncompressed_magic_len_10])); - compression_header_offset = uncompressed_magic_len_10 + 4; - } - assert(0 < n_sub_blocks); - - // verify the sizes of the compressed sub blocks - if (0 && n_sub_blocks != 1) printf("%s:%d %d\n", __FUNCTION__, __LINE__, n_sub_blocks); - - struct sub_block sub_block[n_sub_blocks]; - for (i=0; i(1<<30)) { r = toku_db_badformat(); return r; } - u_int32_t uncompressed_size = toku_dtoh32(*(u_int32_t*)(&raw_block[compression_header_offset+8*i+4])); - if (0) printf("Block %" PRId64 " Compressed size = %u, uncompressed size=%u\n", blocknum.b, compressed_size, uncompressed_size); - if (uncompressed_size<=0 || uncompressed_size>(1<<30)) { r = toku_db_badformat(); return r; } - - sub_block[i].compressed_size = compressed_size; - sub_block[i].uncompressed_size = uncompressed_size; - } - - unsigned char *compressed_data = raw_block + uncompressed_magic_len_10 + 
sub_block_header_size(n_sub_blocks); - - size_t uncompressed_size = get_sum_uncompressed_size(n_sub_blocks, sub_block); - rb->size= uncompressed_magic_len_10 + uncompressed_size; - assert(rb->size>0); - - rb->buf=toku_xmalloc(rb->size); - - // construct the uncompressed block from the header and compressed sub blocks - memcpy(rb->buf, raw_block, uncompressed_magic_len_10); - - // decompress the sub blocks - unsigned char *uncompressed_data = rb->buf+uncompressed_magic_len_10; - struct decompress_work_10 decompress_work[n_sub_blocks]; - - for (i=0; i0) { -#if DO_DECOMPRESS_WORKER - start_decompress_work_10(&decompress_work[i]); -#else - do_decompress_work_10(&decompress_work[i]); -#endif - } - uncompressed_data += sub_block[i].uncompressed_size; - compressed_data += sub_block[i].compressed_size; - } - do_decompress_work_10(&decompress_work[0]); -#if DO_DECOMPRESS_WORKER - for (i=1; ibuf[uncompressed_magic_len_10], rb->buf[uncompressed_magic_len_10+1], - rb->buf[uncompressed_magic_len_10+2], rb->buf[uncompressed_magic_len_10+3]); - - rb->ndone=0; - - r = verify_decompressed_brtnode_checksum(rb); - return r; -} - -static int -deserialize_brtnode_leaf_from_rbuf_10 (BRTNODE result, bytevec magic, struct rbuf *rb) { - //The only difference between this version and version 11 (for this function) - //is the line that calculates size of leafentry. 
- int r; - int i; - - if (memcmp(magic, "tokuleaf", 8)!=0) { - r = toku_db_badformat(); - return r; - } - - result->u.l.leaf_stats.nkeys = rbuf_ulonglong(rb); - result->u.l.leaf_stats.ndata = rbuf_ulonglong(rb); - result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb); - result->u.l.leaf_stats.exact = TRUE; - int n_in_buf = rbuf_int(rb); - result->u.l.n_bytes_in_buffer = 0; - result->u.l.seqinsert = 0; - - //printf("%s:%d r PMA= %p\n", __FILE__, __LINE__, result->u.l.buffer); - toku_mempool_init(&result->u.l.buffer_mempool, rb->buf, rb->size); - - u_int32_t actual_sum = 0; - u_int32_t start_of_data = rb->ndone; - OMTVALUE *MALLOC_N(n_in_buf, array); - for (i=0; ibuf[rb->ndone]); - u_int32_t disksize = le10_disksize(le); //Only difference between 10 & 11 - rb->ndone += disksize; - assert(rb->ndone<=rb->size); - - array[i]=(OMTVALUE)le; - actual_sum += x1764_memory(le, disksize); - } - toku_trace("fill array"); - u_int32_t end_of_data = rb->ndone; - result->u.l.n_bytes_in_buffer += end_of_data-start_of_data + n_in_buf*OMT_ITEM_OVERHEAD; - actual_sum *= result->rand4fingerprint; - r = toku_omt_create_steal_sorted_array(&result->u.l.buffer, &array, n_in_buf, n_in_buf); - toku_trace("create omt"); - if (r!=0) { - toku_free(array); - r = toku_db_badformat(); - if (0) { died_1: toku_omt_destroy(&result->u.l.buffer); } - return r; - } - assert(array==NULL); - - result->u.l.buffer_mempool.frag_size = start_of_data; - result->u.l.buffer_mempool.free_offset = end_of_data; - - if (r!=0) goto died_1; - if (actual_sum!=result->local_fingerprint) { - //fprintf(stderr, "%s:%d Corrupted checksum stored=%08x rand=%08x actual=%08x height=%d n_keys=%d\n", __FILE__, __LINE__, result->rand4fingerprint, result->local_fingerprint, actual_sum, result->height, n_in_buf); - r = toku_db_badformat(); - goto died_1; - } else { - //fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height); - } - - //toku_verify_counts(result); - - (void)rbuf_int(rb); 
//Ignore the crc (already verified). - if (rb->ndone != rb->size) { //Verify we read exactly the entire block. - r = toku_db_badformat(); goto died_1; - } - - r = toku_leaflock_borrow(result->u.l.leaflock_pool, &result->u.l.leaflock); - if (r!=0) goto died_1; - rb->buf = NULL; //Buffer was used for node's mempool. - return 0; -} - -static int -deserialize_brtnode_nonleaf_from_rbuf_10 (BRTNODE result, bytevec magic, struct rbuf *rb) { - int r; - int i; - - if (memcmp(magic, "tokunode", 8)!=0) { - r = toku_db_badformat(); - return r; - } - - result->u.n.totalchildkeylens=0; - u_int32_t subtree_fingerprint = rbuf_int(rb); - u_int32_t check_subtree_fingerprint = 0; - result->u.n.n_children = rbuf_int(rb); - MALLOC_N(result->u.n.n_children+1, result->u.n.childinfos); - MALLOC_N(result->u.n.n_children, result->u.n.childkeys); - //printf("n_children=%d\n", result->n_children); - assert(result->u.n.n_children>=0); - for (i=0; iu.n.n_children; i++) { - u_int32_t childfp = rbuf_int(rb); - BNC_SUBTREE_FINGERPRINT(result, i)= childfp; - check_subtree_fingerprint += childfp; - struct subtree_estimates *se = &(BNC_SUBTREE_ESTIMATES(result, i)); - se->nkeys = rbuf_ulonglong(rb); - se->ndata = rbuf_ulonglong(rb); - se->dsize = rbuf_ulonglong(rb); - se->exact = (BOOL) (rbuf_char(rb) != 0); - } - for (i=0; iu.n.n_children-1; i++) { - if (result->flags & TOKU_DB_DUPSORT) { - bytevec keyptr, dataptr; - unsigned int keylen, datalen; - rbuf_bytes(rb, &keyptr, &keylen); - rbuf_bytes(rb, &dataptr, &datalen); - result->u.n.childkeys[i] = kv_pair_malloc(keyptr, keylen, dataptr, datalen); - } else { - bytevec childkeyptr; - unsigned int cklen; - rbuf_bytes(rb, &childkeyptr, &cklen); /* Returns a pointer into the rbuf. 
*/ - result->u.n.childkeys[i] = kv_pair_malloc((void*)childkeyptr, cklen, 0, 0); - } - //printf(" key %d length=%d data=%s\n", i, result->childkeylens[i], result->childkeys[i]); - result->u.n.totalchildkeylens+=toku_brtnode_pivot_key_len(result, result->u.n.childkeys[i]); - } - for (i=0; iu.n.n_children; i++) { - BNC_BLOCKNUM(result,i) = rbuf_blocknum(rb); - BNC_HAVE_FULLHASH(result, i) = FALSE; - BNC_NBYTESINBUF(result,i) = 0; - //printf("Child %d at %lld\n", i, result->children[i]); - } - result->u.n.n_bytes_in_buffers = 0; - for (i=0; iu.n.n_children; i++) { - r=toku_fifo10_create(&BNC_BUFFER(result,i)); - if (r!=0) { - int j; - if (0) { died_1: j=result->u.n.n_bytes_in_buffers; } - for (j=0; ju.n.n_children; cnum++) { - int n_in_this_hash = rbuf_int(rb); - //printf("%d in hash\n", n_in_hash); - //START HERE - for (i=0; irand4fingerprint * calc_fingerprint_cmd10(type, xid, key, keylen, val, vallen); - //printf("Found %s,%s\n", (char*)key, (char*)val); - { - r=toku_fifo10_enq(BNC_BUFFER(result, cnum), key, keylen, val, vallen, type, xid); /* Copies the data into the hash table. */ - if (r!=0) { goto died_1; } - } - diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD_10; - result->u.n.n_bytes_in_buffers += diff; - BNC_NBYTESINBUF(result,cnum) += diff; - //printf("Inserted\n"); - } - } - if (check_local_fingerprint != result->local_fingerprint) { - fprintf(stderr, "%s:%d local fingerprint is wrong (found %8x calcualted %8x\n", __FILE__, __LINE__, result->local_fingerprint, check_local_fingerprint); - return toku_db_badformat(); - } - if (check_subtree_fingerprint+check_local_fingerprint != subtree_fingerprint) { - fprintf(stderr, "%s:%d subtree fingerprint is wrong\n", __FILE__, __LINE__); - return toku_db_badformat(); - } - } - (void)rbuf_int(rb); //Ignore the crc (already verified). - if (rb->ndone != rb->size) { //Verify we read exactly the entire block. 
- r = toku_db_badformat(); goto died_1; - } - return 0; -} - -static int -deserialize_brtnode_from_rbuf_10 (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb) { - TAGMALLOC(BRTNODE, result); - int r; - if (result==0) { - r=errno; - if (0) { died0: toku_free(result); } - return r; - } - result->desc = &h->descriptor; - result->ever_been_written = 1; - - //printf("Deserializing %lld datasize=%d\n", off, datasize); - bytevec magic; - rbuf_literal_bytes(rb, &magic, 8); - result->layout_version = rbuf_int(rb); - assert(result->layout_version == BRT_LAYOUT_VERSION_10); - (void)rbuf_ulonglong(rb); // BRTNODE.disk_lsn.lsn no longer exists - { - //Restrict scope for now since we do not support upgrades. - struct descriptor desc; - //desc.dbt.data is TEMPORARY. Will be unusable when the rc buffer is freed. - deserialize_descriptor_from_rbuf(rb, &desc, TRUE); - assert(desc.version == result->desc->version); //We do not yet support upgrading the dbts. - } - result->nodesize = rbuf_int(rb); - //result->log_lsn = result->disk_lsn; //Disabled since neither variable exists anymore - - result->thisnodename = blocknum; - result->flags = rbuf_int(rb); - result->height = rbuf_int(rb); - result->rand4fingerprint = rbuf_int(rb); - result->local_fingerprint = rbuf_int(rb); -// printf("%s:%d read %08x\n", __FILE__, __LINE__, result->local_fingerprint); - result->dirty = 0; - result->fullhash = fullhash; - //printf("height==%d\n", result->height); - - if (result->height>0) - r = deserialize_brtnode_nonleaf_from_rbuf_10(result, magic, rb); - else { - result->u.l.leaflock_pool = toku_cachefile_leaflock_pool(h->cf); - r = deserialize_brtnode_leaf_from_rbuf_10(result, magic, rb); - } - if (r!=0) goto died0; - - //printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children); - if (result->height>0) { - // For height==0 we used the buf inside the OMT - toku_free(rb->buf); - rb->buf = NULL; - } - 
toku_trace("deserial done"); - *brtnode = result; - //toku_verify_counts(result); - return 0; -} - -static void le_unpack_le10_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val, ULE ule) { - //Committed value - toku_upgrade_ule_init_empty_ule(ule, klen, kval); - toku_upgrade_ule_remove_innermost_uxr(ule); // pop committed delete - toku_upgrade_ule_push_insert_uxr(ule, 0, vallen, val); -} - -static void le_unpack_le10_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval, ULE ule) { - if (xid==0) { - //Really committed - le_unpack_le10_committed(klen, kval, plen, pval, ule); - } - else { - //committed value and provisional insert - toku_upgrade_ule_init_empty_ule(ule, klen, kval); - toku_upgrade_ule_remove_innermost_uxr(ule); // pop committed delete - toku_upgrade_ule_push_insert_uxr(ule, 0, clen, cval); // push committed - toku_upgrade_ule_push_insert_uxr(ule, xid, plen, pval); // push provisional - } -} - -static void le_unpack_le10_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, ULE ule) { - if (xid==0) { - //Really committed delete - toku_upgrade_ule_init_empty_ule(ule, klen, kval); - } - else { - //committed value and provisional delete - toku_upgrade_ule_init_empty_ule(ule, klen, kval); - toku_upgrade_ule_remove_innermost_uxr(ule); // pop committed delete - toku_upgrade_ule_push_insert_uxr(ule, 0, clen, cval); // push committed - toku_upgrade_ule_push_delete_uxr(ule, xid); // push provisional - } -} - -static void le_unpack_le10_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval, ULE ule) { - if (xid==0) { - //Really committed - le_unpack_le10_committed(klen, kval, plen, pval, ule); - } - else { - //committed delete and provisional insert - toku_upgrade_ule_init_empty_ule(ule, klen, kval); - toku_upgrade_ule_push_insert_uxr(ule, xid, plen, pval); // push provisional - } -} - -//Used to unpack a version 10 record to ule, which can be packed to version 
11. -static void -le_unpack_from_version_10(ULE ule, LEAFENTRY le) { - LESWITCHCALL(le, le_unpack, ule); - abort(); return; // make certain compilers happy -} - -static u_int32_t -le10_crc(LEAFENTRY v) { - return x1764_memory(v, leafentry_memsize_10(v)); -} - -//old_le10 is opaque data only readable by accessors (Not a 'new' LEAFENTRY) -static void -upgrade_single_leafentry_10_11 (BRTNODE node, u_int32_t idx11, LEAFENTRY old_le10, OMT omt11, struct mempool *mp11) { - //See brt_leaf_apply_cmd_once for template - size_t newlen=0, newdisksize=0; - LEAFENTRY new_le = NULL; - ULE_S ule; - int r; - assert(old_le10); - le_unpack_from_version_10(&ule, old_le10); - r = le_pack(&ule, // create packed leafentry - &newlen, &newdisksize, - &new_le, - omt11, mp11, NULL); - assert(r==0); - - - //Update size of memory information and crc - //Subtract old version 10 leafentry information - node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + le10_disksize(old_le10); - node->local_fingerprint -= node->rand4fingerprint * le10_crc(old_le10); - if (new_le) { - //Version 10 leafentry is being upgraded - assert(newdisksize == leafentry_disksize(new_le)); - - //Add new version 10 leafentry information - node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + newdisksize; - node->local_fingerprint += node->rand4fingerprint*toku_le_crc(new_le); - - r = toku_omt_insert_at(omt11, new_le, idx11); - assert(r==0); - } -} - -//Upgrade each leafentry from version 10 to 11(nested transactions) -//Need to update checksums, and memory pools -static void -upgrade_brtnode_leaf_10_11 (BRTNODE node) { - int r; - u_int32_t idx10 = 0; - u_int32_t idx11 = 0; - OMT omt10 = node->u.l.buffer; - OMT omt11 = NULL; - struct mempool mp10 = node->u.l.buffer_mempool; - struct mempool mp11; - { - //Create a new mempool - size_t starting_size = toku_mempool_get_size(&mp10); - void *base = toku_xmalloc(starting_size); - toku_mempool_init(&mp11, base, starting_size); - } - r = toku_omt_create(&omt11); - assert(r==0); - 
u_int32_t omt11size = toku_omt_size(node->u.l.buffer); - while (idx10 < omt11size) { - OMTVALUE old_le10; - r = toku_omt_fetch(node->u.l.buffer, idx10, &old_le10, NULL); - assert(r==0); - assert(old_le10); - upgrade_single_leafentry_10_11(node, idx11, old_le10, omt11, &mp11); - - u_int32_t omtsize11 = toku_omt_size(omt11); - if (omtsize11 != idx11) { - assert(omtsize11 == idx11+1); - //Leafentry survived (insert) - idx11++; - } - idx10++; //Always advance the old omt - } - //Free the old mempool - { - void *mpbase = toku_mempool_get_base(&mp10); - toku_mempool_fini(&mp10); - toku_free(mpbase); - } - //Free the old omt - toku_omt_destroy(&omt10); - - //Assign new mempool - node->u.l.buffer_mempool = mp11; - //Assign new omt - node->u.l.buffer = omt11; - //Calculate statistics - toku_calculate_leaf_stats(node); -} - -static void -upgrade_brtnode_nonleaf_10_11 (BRTNODE node) { - int i; - int r; - for (i=0; iu.n.n_children; i++) { - FIFO fifo11; - FIFO fifo10 = BNC_BUFFER(node,i); - BNC_BUFFER(node,i) = NULL; - r = toku_fifo_create(&fifo11); - assert(r==0); - - FIFO10_ITERATE(fifo10, keyp, keylen, valp, vallen, type, xid, - XIDS xids; - if (xid == 0) - xids = xids_get_root_xids(); - else { - //Assume all transactions have no parents. 
- r = xids_create_child(xids_get_root_xids(), &xids, xid); - assert(r==0); - } - //Remove checksum contribution of this fifo_entry - node->local_fingerprint -= node->rand4fingerprint * calc_fingerprint_cmd10(type, xid, keyp, keylen, valp, vallen); - //Remove bytes_in_buf contribution of this fifo_entry - u_int32_t bytes10 = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD_10; - node->u.n.n_bytes_in_buffers -= bytes10; - BNC_NBYTESINBUF(node, i) -= bytes10; - - - //Add checksum contribution of the new fifo_entry - node->local_fingerprint += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, xids, keyp, keylen, valp, vallen); - //Add bytes_in_buf contribution of the new fifo_entry - u_int32_t bytes11 = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids); - node->u.n.n_bytes_in_buffers += bytes11; - BNC_NBYTESINBUF(node, i) += bytes11; - - //Enqueue new fifo entry - r = toku_fifo_enq(fifo11, keyp, keylen, valp, vallen, type, xids); - assert(r==0); - xids_destroy(&xids); - - ); - toku_fifo10_free(&fifo10); - BNC_BUFFER(node,i) = fifo11; - fifo11 = NULL; - } -} - -// Structure of brtnode is same for versions 10, 11. The only difference is in the -// contents of the leafentries and the messages. For this reason, the outer structure -// of the brtnode is left in place (*brtnode_10 is reused.) 
-static int -upgrade_brtnode_10_11 (BRTNODE *brtnode_10, BRTNODE *brtnode_11) { - if ((*brtnode_10)->height>0) - upgrade_brtnode_nonleaf_10_11(*brtnode_10); - else - upgrade_brtnode_leaf_10_11(*brtnode_10); - *brtnode_11 = *brtnode_10; - *brtnode_10 = NULL; - (*brtnode_11)->layout_version = BRT_LAYOUT_VERSION_11; - (*brtnode_11)->layout_version_original = BRT_LAYOUT_VERSION_10; - (*brtnode_11)->layout_version_read_from_disk = BRT_LAYOUT_VERSION_10; - (*brtnode_11)->dirty = 1; - return 0; -} - diff --git a/newbrt/backwards_10.h b/newbrt/backwards_10.h deleted file mode 100644 index b0a3a70f4fd..00000000000 --- a/newbrt/backwards_10.h +++ /dev/null @@ -1,33 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "$Id$" -#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved." -#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." - -#ifndef BACKWARD_10_H -#define BACKWARD_10_H - -int le10_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result); -int le10_both (TXNID xid, u_int32_t cklen, void* ckval, u_int32_t cdlen, void* cdval, u_int32_t pdlen, void* pdval, - u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result); -int le10_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, - u_int32_t *resultsize, u_int32_t *memsize, LEAFENTRY *result); -int le10_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval, u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result); - -enum le_state { LE_COMMITTED=1, // A committed pair. - LE_BOTH, // A committed pair and a provisional pair. 
- LE_PROVDEL, // A committed pair that has been provisionally deleted - LE_PROVPAIR }; // No committed value, but a provisional pair. - -static inline enum le_state get_le_state(LEAFENTRY le) { - return (enum le_state)*(unsigned char *)le; -} -#include "ule.h" -//Exposed ule functions for the purpose of upgrading -void toku_upgrade_ule_init_empty_ule(ULE ule, u_int32_t keylen, void * keyp); -void toku_upgrade_ule_remove_innermost_uxr(ULE ule); -void toku_upgrade_ule_push_insert_uxr(ULE ule, TXNID xid, u_int32_t vallen, void * valp); -void toku_upgrade_ule_push_delete_uxr(ULE ule, TXNID xid); -//Exposed brt functions for the purpose of upgrading -void toku_calculate_leaf_stats(BRTNODE node); - -#endif diff --git a/newbrt/backwards_11.c b/newbrt/backwards_11.c new file mode 100644 index 00000000000..61775119eb0 --- /dev/null +++ b/newbrt/backwards_11.c @@ -0,0 +1,771 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "$Id: brt-serialize.c 18555 2010-03-18 01:20:07Z yfogel $" +#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc. All rights reserved." +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." + +#include "includes.h" + +#define PRINT 0 + + +static u_int32_t x1764_memory_11 (const void *buf, int len) +{ + const u_int64_t *lbuf=buf; + u_int64_t c=0; + while (len>=8) { + c = c*17 + *lbuf; + if (PRINT) printf("%d: c=%016" PRIx64 " sum=%016" PRIx64 "\n", __LINE__, *lbuf, c); + lbuf++; + len-=8; + } + if (len>0) { + const u_int8_t *cbuf=(u_int8_t*)lbuf; + int i; + u_int64_t input=0; + for (i=0; i>32); +} + +// Given a version 11 header, create a version 12 header. +// If new memory is needed for the new header, allocate it here and free the memory of the old version header. 
+static int +upgrade_brtheader_11_12(int fd, struct brt_header **brth_11, struct brt_header ** brth_12) { + int r = 0; + assert((*brth_11)->layout_version == BRT_LAYOUT_VERSION_11); + *brth_12 = *brth_11; + *brth_11 = NULL; + (*brth_12)->layout_version = BRT_LAYOUT_VERSION_12; + toku_list_init(&(*brth_12)->checkpoint_before_commit_link); + (void) toku_sync_fetch_and_increment_uint64(&upgrade_status.header); + { //Re-write descriptor to fix checksum (does not get done automatically). + DISKOFF offset; + DESCRIPTOR d = &(*brth_12)->descriptor; + //4 for checksum + toku_realloc_descriptor_on_disk((*brth_12)->blocktable, toku_serialize_descriptor_size(d)+4, &offset, (*brth_12)); + r = toku_serialize_descriptor_contents_to_fd(fd, d, offset); + } + return r; +} + +// Structure of brtnode is same for versions 11, 12. The only difference is in the +// disk format and layout version. +static int +upgrade_brtnode_11_12 (BRTNODE *brtnode_11, BRTNODE *brtnode_12) { + *brtnode_12 = *brtnode_11; + *brtnode_11 = NULL; + + BRTNODE brt = *brtnode_12; + brt->layout_version = BRT_LAYOUT_VERSION_12; + brt->dirty = 1; + if (brt->height) { + (void) toku_sync_fetch_and_increment_uint64(&upgrade_status.nonleaf); + } + else { + (void) toku_sync_fetch_and_increment_uint64(&upgrade_status.leaf); + } + //x1764 calculation (fingerprint) has changed between 11 and 12. + //Update all local fields based on x1764, verify several others. + toku_verify_or_set_counts(brt, TRUE); + return 0; +} + + +static u_int32_t +toku_serialize_descriptor_size_11(DESCRIPTOR desc) { + //Checksum NOT included in this. Checksum only exists in header's version. 
+ u_int32_t size = 4+ //version + 4; //size + size += desc->dbt.size; + return size; +} + + +static unsigned int toku_brtnode_pivot_key_len_11 (BRTNODE node, struct kv_pair *pk) { + if (node->flags & TOKU_DB_DUPSORT) { + return kv_pair_keylen(pk) + kv_pair_vallen(pk); + } else { + return kv_pair_keylen(pk); + } +} + + + +enum { uncompressed_magic_len_11 = (8 // tokuleaf or tokunode + +4 // layout version + +4 // layout version original + ) +}; + +// uncompressed header offsets +enum { + uncompressed_magic_offset_11 = 0, + uncompressed_version_offset_11 = 8, +}; + +// compression header sub block sizes +struct sub_block_sizes { + u_int32_t compressed_size; // real compressed size + u_int32_t uncompressed_size; + u_int32_t compressed_size_bound; // estimated compressed size +}; + +// target sub-block sizs and max number of sub-blocks per block. +static const int target_sub_block_size_11 = 512*1024; +static const int max_sub_blocks_11 = 8; + +// round up n +static inline int roundup2(int n, int alignment) { + return (n+alignment-1)&~(alignment-1); +} + + +// get the size of the compression header +static size_t get_compression_header_size(int UU(layout_version), int n) { + return sizeof (u_int32_t) + (n * 2 * sizeof (u_int32_t)); +} + + + +// get the sum of the sub block uncompressed sizes +static size_t get_sum_uncompressed_size_11(int n, struct sub_block_sizes sizes[]) { + int i; + size_t uncompressed_size = 0; + for (i=0; iu.n.totalchildkeylens=0; + u_int32_t subtree_fingerprint = rbuf_int(rb); + u_int32_t check_subtree_fingerprint = 0; + result->u.n.n_children = rbuf_int(rb); + MALLOC_N(result->u.n.n_children+1, result->u.n.childinfos); + MALLOC_N(result->u.n.n_children, result->u.n.childkeys); + //printf("n_children=%d\n", result->n_children); + assert(result->u.n.n_children>=0); + for (i=0; iu.n.n_children; i++) { + u_int32_t childfp = rbuf_int(rb); + BNC_SUBTREE_FINGERPRINT(result, i)= childfp; + check_subtree_fingerprint += childfp; + struct subtree_estimates 
*se = &(BNC_SUBTREE_ESTIMATES(result, i)); + se->nkeys = rbuf_ulonglong(rb); + se->ndata = rbuf_ulonglong(rb); + se->dsize = rbuf_ulonglong(rb); + se->exact = (BOOL) (rbuf_char(rb) != 0); + } + for (i=0; iu.n.n_children-1; i++) { + if (result->flags & TOKU_DB_DUPSORT) { + bytevec keyptr, dataptr; + unsigned int keylen, datalen; + rbuf_bytes(rb, &keyptr, &keylen); + rbuf_bytes(rb, &dataptr, &datalen); + result->u.n.childkeys[i] = kv_pair_malloc(keyptr, keylen, dataptr, datalen); + } else { + bytevec childkeyptr; + unsigned int cklen; + rbuf_bytes(rb, &childkeyptr, &cklen); /* Returns a pointer into the rbuf. */ + result->u.n.childkeys[i] = kv_pair_malloc((void*)childkeyptr, cklen, 0, 0); + } + //printf(" key %d length=%d data=%s\n", i, result->childkeylens[i], result->childkeys[i]); + result->u.n.totalchildkeylens+=toku_brtnode_pivot_key_len_11(result, result->u.n.childkeys[i]); + } + for (i=0; iu.n.n_children; i++) { + BNC_BLOCKNUM(result,i) = rbuf_blocknum(rb); + BNC_HAVE_FULLHASH(result, i) = FALSE; + BNC_NBYTESINBUF(result,i) = 0; + //printf("Child %d at %lld\n", i, result->children[i]); + } + result->u.n.n_bytes_in_buffers = 0; + for (i=0; iu.n.n_children; i++) { + r=toku_fifo_create(&BNC_BUFFER(result,i)); + if (r!=0) { + int j; + if (0) { died_1: j=result->u.n.n_bytes_in_buffers; } + for (j=0; ju.n.n_children; cnum++) { + int n_in_this_hash = rbuf_int(rb); + //printf("%d in hash\n", n_in_hash); + for (i=0; irand4fingerprint * toku_calc_fingerprint_cmd(type, xids, key, keylen, val, vallen); + //printf("Found %s,%s\n", (char*)key, (char*)val); + { + r=toku_fifo_enq(BNC_BUFFER(result, cnum), key, keylen, val, vallen, type, xids); /* Copies the data into the hash table. 
*/ + if (r!=0) { goto died_1; } + } + diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids); + result->u.n.n_bytes_in_buffers += diff; + BNC_NBYTESINBUF(result,cnum) += diff; + //printf("Inserted\n"); + xids_destroy(&xids); + } + } + if (check_local_fingerprint != result->local_fingerprint) { + fprintf(stderr, "%s:%d local fingerprint is wrong (found %8x calcualted %8x\n", __FILE__, __LINE__, result->local_fingerprint, check_local_fingerprint); + return toku_db_badformat(); + } + if (check_subtree_fingerprint+check_local_fingerprint != subtree_fingerprint) { + fprintf(stderr, "%s:%d subtree fingerprint is wrong\n", __FILE__, __LINE__); + return toku_db_badformat(); + } + } + (void)rbuf_int(rb); //Ignore the crc (already verified). + if (rb->ndone != rb->size) { //Verify we read exactly the entire block. + r = toku_db_badformat(); goto died_1; + } + return 0; +} + +static int +deserialize_brtnode_leaf_from_rbuf_11 (BRTNODE result, bytevec magic, struct rbuf *rb) { + int r; + int i; + + if (memcmp(magic, "tokuleaf", 8)!=0) { + r = toku_db_badformat(); + return r; + } + + result->u.l.leaf_stats.nkeys = rbuf_ulonglong(rb); + result->u.l.leaf_stats.ndata = rbuf_ulonglong(rb); + result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb); + result->u.l.leaf_stats.exact = TRUE; + int n_in_buf = rbuf_int(rb); + result->u.l.n_bytes_in_buffer = 0; + result->u.l.seqinsert = 0; + + //printf("%s:%d r PMA= %p\n", __FILE__, __LINE__, result->u.l.buffer); + toku_mempool_init(&result->u.l.buffer_mempool, rb->buf, rb->size); + + u_int32_t actual_sum = 0; + u_int32_t start_of_data = rb->ndone; + OMTVALUE *MALLOC_N(n_in_buf, array); + for (i=0; ibuf[rb->ndone]); + u_int32_t disksize = leafentry_disksize(le); + rb->ndone += disksize; + assert(rb->ndone<=rb->size); + + array[i]=(OMTVALUE)le; + actual_sum += x1764_memory_11(le, disksize); + } + toku_trace("fill array"); + u_int32_t end_of_data = rb->ndone; + result->u.l.n_bytes_in_buffer += 
end_of_data-start_of_data + n_in_buf*OMT_ITEM_OVERHEAD; + actual_sum *= result->rand4fingerprint; + r = toku_omt_create_steal_sorted_array(&result->u.l.buffer, &array, n_in_buf, n_in_buf); + toku_trace("create omt"); + if (r!=0) { + toku_free(array); + r = toku_db_badformat(); + if (0) { died_1: toku_omt_destroy(&result->u.l.buffer); } + return r; + } + assert(array==NULL); + + result->u.l.buffer_mempool.frag_size = start_of_data; + result->u.l.buffer_mempool.free_offset = end_of_data; + + if (r!=0) goto died_1; + if (actual_sum!=result->local_fingerprint) { + //fprintf(stderr, "%s:%d Corrupted checksum stored=%08x rand=%08x actual=%08x height=%d n_keys=%d\n", __FILE__, __LINE__, result->rand4fingerprint, result->local_fingerprint, actual_sum, result->height, n_in_buf); + r = toku_db_badformat(); + goto died_1; + } else { + //fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height); + } + + //toku_verify_counts_11(result); + + (void)rbuf_int(rb); //Ignore the crc (already verified). + if (rb->ndone != rb->size) { //Verify we read exactly the entire block. + r = toku_db_badformat(); goto died_1; + } + + r = toku_leaflock_borrow(result->u.l.leaflock_pool, &result->u.l.leaflock); + if (r!=0) goto died_1; + rb->buf = NULL; //Buffer was used for node's mempool. 
+ return 0; +} + + +static int +deserialize_brtnode_from_rbuf_11 (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb) { + TAGMALLOC(BRTNODE, result); + int r; + if (result==0) { + r=errno; + if (0) { died0: toku_free(result); } + return r; + } + result->ever_been_written = 1; + + //printf("Deserializing %lld datasize=%d\n", off, datasize); + bytevec magic; + rbuf_literal_bytes(rb, &magic, 8); + result->layout_version = rbuf_int(rb); + assert(result->layout_version == BRT_LAYOUT_VERSION_11); + result->layout_version_original = rbuf_int(rb); + result->layout_version_read_from_disk = result->layout_version; + { + //Restrict scope for now since we do not support upgrades. + DESCRIPTOR_S desc; + //desc.dbt.data is TEMPORARY. Will be unusable when the rc buffer is freed. + deserialize_descriptor_from_rbuf_11(rb, &desc, TRUE); + //Just throw away. + } + result->nodesize = rbuf_int(rb); + + result->thisnodename = blocknum; + result->flags = rbuf_int(rb); + result->height = rbuf_int(rb); + result->rand4fingerprint = rbuf_int(rb); + result->local_fingerprint = rbuf_int(rb); +// printf("%s:%d read %08x\n", __FILE__, __LINE__, result->local_fingerprint); + result->dirty = 0; + result->fullhash = fullhash; + //printf("height==%d\n", result->height); + + if (result->height>0) + r = deserialize_brtnode_nonleaf_from_rbuf_11(result, magic, rb); + else { + result->u.l.leaflock_pool = toku_cachefile_leaflock_pool(h->cf); + r = deserialize_brtnode_leaf_from_rbuf_11(result, magic, rb); + } + if (r!=0) goto died0; + + //printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children); + if (result->height>0) { + // For height==0 we used the buf inside the OMT + toku_free(rb->buf); + rb->buf = NULL; + } + toku_trace("deserial done"); + *brtnode = result; + //toku_verify_counts_11(result); + return 0; +} + +static int +verify_decompressed_brtnode_checksum (struct rbuf *rb) { + int r = 0; + + if (rb->size 
>= 4) { + uint32_t verify_size = rb->size - 4; //Not counting the checksum + + toku_trace("x1764 start"); + uint32_t crc = x1764_memory_11(rb->buf, verify_size); + toku_trace("x1764"); + + uint32_t *crcp = (uint32_t*)(((uint8_t*)rb->buf) + verify_size); + uint32_t storedcrc = toku_dtoh32(*crcp); + if (crc!=storedcrc) { + printf("Bad CRC\n"); + printf("%s:%d crc=%08x stored=%08x\n", __FILE__, __LINE__, crc, storedcrc); + r = toku_db_badformat(); + } + } + else r = toku_db_badformat(); + return r; +} + +#define PAR_DECOMPRESS 1 + +#if PAR_DECOMPRESS + +#include "workset.h" + +struct decompress_work_11 { + struct work base; + void *compress_ptr; + void *uncompress_ptr; + u_int32_t compress_size; + u_int32_t uncompress_size; +}; + +// initialize the decompression work +static void +decompress_work_init_11(struct decompress_work_11 *dw, + void *compress_ptr, u_int32_t compress_size, + void *uncompress_ptr, u_int32_t uncompress_size) { + dw->compress_ptr = compress_ptr; + dw->compress_size = compress_size; + dw->uncompress_ptr = uncompress_ptr; + dw->uncompress_size = uncompress_size; +} + +// decompress one block +static void +decompress_block(struct decompress_work_11 *dw) { + if (0) printf("%s:%d %x %p\n", __FUNCTION__, __LINE__, (int) toku_pthread_self(), dw); + uLongf destlen = dw->uncompress_size; + int r = uncompress(dw->uncompress_ptr, &destlen, dw->compress_ptr, dw->compress_size); + assert(destlen == dw->uncompress_size); + assert(r==Z_OK); +} + +// decompress blocks until there is no more work to do +static void * +decompress_worker_11(void *arg) { + struct workset *ws = (struct workset *) arg; + while (1) { + struct decompress_work_11 *dw = (struct decompress_work_11 *) workset_get(ws); + if (dw == NULL) + break; + decompress_block(dw); + } + return arg; +} + +#else + +#define DO_DECOMPRESS_WORKER 0 + +struct decompress_work_11 { + toku_pthread_t id; + void *compress_ptr; + void *uncompress_ptr; + u_int32_t compress_size; + u_int32_t uncompress_size; +}; + 
+// initialize the decompression work +static void init_decompress_work(struct decompress_work_11 *w, + void *compress_ptr, u_int32_t compress_size, + void *uncompress_ptr, u_int32_t uncompress_size) { + memset(&w->id, 0, sizeof(w->id)); + w->compress_ptr = compress_ptr; w->compress_size = compress_size; + w->uncompress_ptr = uncompress_ptr; w->uncompress_size = uncompress_size; +} + +// do the decompression work +static void do_decompress_work(struct decompress_work_11 *w) { + uLongf destlen = w->uncompress_size; + int r = uncompress(w->uncompress_ptr, &destlen, + w->compress_ptr, w->compress_size); + assert(destlen==w->uncompress_size); + assert(r==Z_OK); +} + +#if DO_DECOMPRESS_WORKER + +static void *decompress_worker_11(void *); + +static void start_decompress_work(struct decompress_work_11 *w) { + int r = toku_pthread_create(&w->id, NULL, decompress_worker_11, w); assert(r == 0); +} + +static void wait_decompress_work(struct decompress_work_11 *w) { + void *ret; + int r = toku_pthread_join(w->id, &ret); assert(r == 0); +} + +static void *decompress_worker_11(void *arg) { + struct decompress_work_11 *w = (struct decompress_work_11 *) arg; + do_decompress_work(w); + return arg; +} + +#endif + +#endif + +static int +decompress_brtnode_from_raw_block_into_rbuf_11(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { + int r; + int i; + // get the number of compressed sub blocks + int n_sub_blocks; + int compression_header_offset; + { + n_sub_blocks = toku_dtoh32(*(u_int32_t*)(&raw_block[uncompressed_magic_len_11])); + compression_header_offset = uncompressed_magic_len_11 + 4; + } + assert(0 < n_sub_blocks); + + // verify the sizes of the compressed sub blocks + if (0 && n_sub_blocks != 1) printf("%s:%d %d\n", __FUNCTION__, __LINE__, n_sub_blocks); + + struct sub_block_sizes sub_block_sizes[n_sub_blocks]; + for (i=0; i(1<<30)) { r = toku_db_badformat(); return r; } + u_int32_t uncompressed_size = 
toku_dtoh32(*(u_int32_t*)(&raw_block[compression_header_offset+8*i+4])); + if (0) printf("Block %" PRId64 " Compressed size = %u, uncompressed size=%u\n", blocknum.b, compressed_size, uncompressed_size); + if (uncompressed_size<=0 || uncompressed_size>(1<<30)) { r = toku_db_badformat(); return r; } + + sub_block_sizes[i].compressed_size = compressed_size; + sub_block_sizes[i].uncompressed_size = uncompressed_size; + } + + unsigned char *compressed_data = raw_block + uncompressed_magic_len_11 + get_compression_header_size(BRT_LAYOUT_VERSION_11, n_sub_blocks); + + size_t uncompressed_size = get_sum_uncompressed_size_11(n_sub_blocks, sub_block_sizes); + rb->size= uncompressed_magic_len_11 + uncompressed_size; + assert(rb->size>0); + + rb->buf=toku_xmalloc(rb->size); + + // construct the uncompressed block from the header and compressed sub blocks + memcpy(rb->buf, raw_block, uncompressed_magic_len_11); + +#if PAR_DECOMPRESS + // compute the number of additional threads needed for decompressing this node + int T = num_cores; // T = min(#cores, #blocks) - 1 + if (T > n_sub_blocks) + T = n_sub_blocks; + if (T > 0) + T = T - 1; // threads in addition to the running thread + + // init the decompression work set + struct workset ws; + workset_init(&ws); + + // initialize the decompression work and add to the work set + unsigned char *uncompressed_data = rb->buf+uncompressed_magic_len_11; + struct decompress_work_11 decompress_work_11[n_sub_blocks]; + workset_lock(&ws); + for (i = 0; i < n_sub_blocks; i++) { + decompress_work_init_11(&decompress_work_11[i], compressed_data, sub_block_sizes[i].compressed_size, uncompressed_data, sub_block_sizes[i].uncompressed_size); + uncompressed_data += sub_block_sizes[i].uncompressed_size; + compressed_data += sub_block_sizes[i].compressed_size; + workset_put_locked(&ws, &decompress_work_11[i].base); + } + workset_unlock(&ws); + + // decompress the sub-blocks + if (0) printf("%s:%d Cores=%d Blocks=%d T=%d\n", __FUNCTION__, __LINE__, 
num_cores, n_sub_blocks, T); + toku_pthread_t tids[T]; + threadset_create(tids, &T, decompress_worker_11, &ws); + decompress_worker_11(&ws); + + // cleanup + threadset_join(tids, T); + workset_destroy(&ws); + +#else + // decompress the sub blocks + unsigned char *uncompressed_data = rb->buf+uncompressed_magic_len_11; + struct decompress_work_11 decompress_work_11[n_sub_blocks]; + + for (i=0; i0) { +#if DO_DECOMPRESS_WORKER + start_decompress_work(&decompress_work_11[i]); +#else + do_decompress_work(&decompress_work_11[i]); +#endif + } + uncompressed_data += sub_block_sizes[i].uncompressed_size; + compressed_data += sub_block_sizes[i].compressed_size; + } + do_decompress_work(&decompress_work_11[0]); +#if DO_DECOMPRESS_WORKER + for (i=1; ibuf[uncompressed_magic_len_11], rb->buf[uncompressed_magic_len_11+1], + rb->buf[uncompressed_magic_len_11+2], rb->buf[uncompressed_magic_len_11+3]); + + rb->ndone=0; + + r = verify_decompressed_brtnode_checksum(rb); + return r; +} + + + + + + + +// ################ + + +static void +deserialize_descriptor_from_rbuf_11(struct rbuf *rb, DESCRIPTOR desc, BOOL temporary) { + desc->version = rbuf_int(rb); + u_int32_t size; + bytevec data; + rbuf_bytes(rb, &data, &size); + bytevec data_copy = data;; + if (size>0) { + if (!temporary) { + data_copy = toku_memdup(data, size); //Cannot keep the reference from rbuf. Must copy. 
+ assert(data_copy); + } + } + else { + assert(size==0); + data_copy = NULL; + } + toku_fill_dbt(&desc->dbt, data_copy, size); + if (desc->version==0) assert(desc->dbt.size==0); +} + +static void +deserialize_descriptor_from_11(int fd, struct brt_header *h, DESCRIPTOR desc) { + DISKOFF offset; + DISKOFF size; + toku_get_descriptor_offset_size(h->blocktable, &offset, &size); + memset(desc, 0, sizeof(*desc)); + if (size > 0) { + assert(size>=4); //4 for checksum + { + unsigned char *XMALLOC_N(size, dbuf); + { + lock_for_pwrite(); + ssize_t r = pread(fd, dbuf, size, offset); + assert(r==size); + unlock_for_pwrite(); + } + { + // check the checksum + u_int32_t x1764 = x1764_memory_11(dbuf, size-4); + //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk); + u_int32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4)); + assert(x1764 == stored_x1764); + } + { + struct rbuf rb = {.buf = dbuf, .size = size, .ndone = 0}; + //Not temporary; must have a toku_memdup'd copy. + deserialize_descriptor_from_rbuf_11(&rb, desc, FALSE); + } + assert(toku_serialize_descriptor_size_11(desc)+4 == size); + toku_free(dbuf); + } + } +} + + +// We only deserialize brt header once and then share everything with all the brts. +static int +deserialize_brtheader_11 (int fd, struct rbuf *rb, struct brt_header **brth) { + // We already know: + // we have an rbuf representing the header. + // The checksum has been validated + + //Steal rbuf (used to simplify merge, reduce diff size, and keep old code) + struct rbuf rc = *rb; + memset(rb, 0, sizeof(*rb)); + + //Verification of initial elements. 
+ { + //Check magic number + bytevec magic; + rbuf_literal_bytes(&rc, &magic, 8); + assert(memcmp(magic,"tokudata",8)==0); + } + + + struct brt_header *CALLOC(h); + if (h==0) return errno; + int ret=-1; + if (0) { died1: toku_free(h); return ret; } + h->type = BRTHEADER_CURRENT; + h->checkpoint_header = NULL; + h->dirty=0; + h->panic = 0; + h->panic_string = 0; + toku_list_init(&h->live_brts); + toku_list_init(&h->zombie_brts); + //version MUST be in network order on disk regardless of disk order + h->layout_version = rbuf_network_int(&rc); + //TODO: #1924 + assert(h->layout_version==BRT_LAYOUT_VERSION_11); + + //Size MUST be in network order regardless of disk order. + u_int32_t size = rbuf_network_int(&rc); + assert(size==rc.size); + + bytevec tmp_byte_order_check; + rbuf_literal_bytes(&rc, &tmp_byte_order_check, 8); //Must not translate byte order + int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check; + assert(byte_order_stored == toku_byte_order_host); + + h->checkpoint_count = rbuf_ulonglong(&rc); + h->checkpoint_lsn = rbuf_lsn(&rc); + h->nodesize = rbuf_int(&rc); + DISKOFF translation_address_on_disk = rbuf_diskoff(&rc); + DISKOFF translation_size_on_disk = rbuf_diskoff(&rc); + assert(translation_address_on_disk>0); + assert(translation_size_on_disk>0); + + // printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, h->translated_blocknum_limit, h->block_translation_address_on_disk); + //Load translation table + { + lock_for_pwrite(); + unsigned char *XMALLOC_N(translation_size_on_disk, tbuf); + { + // This cast is messed up in 32-bits if the block translation table is ever more than 4GB. But in that case, the translation table itself won't fit in main memory. + ssize_t r = pread(fd, tbuf, translation_size_on_disk, translation_address_on_disk); + assert(r==translation_size_on_disk); + } + unlock_for_pwrite(); + // Create table and read in data. 
+ toku_blocktable_create_from_buffer(&h->blocktable, + translation_address_on_disk, + translation_size_on_disk, + tbuf, + TRUE); + toku_free(tbuf); + } + + h->root = rbuf_blocknum(&rc); + h->root_hash.valid = FALSE; + h->flags = rbuf_int(&rc); + deserialize_descriptor_from_11(fd, h, &h->descriptor); + h->layout_version_original = rbuf_int(&rc); + (void)rbuf_int(&rc); //Read in checksum and ignore (already verified). + if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;} + toku_free(rc.buf); + rc.buf = NULL; + *brth = h; + return 0; +} + + + + + + + diff --git a/newbrt/backwards_11.h b/newbrt/backwards_11.h new file mode 100644 index 00000000000..cce2a156998 --- /dev/null +++ b/newbrt/backwards_11.h @@ -0,0 +1,18 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "$Id$" +#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved." +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." 
+ +#ifndef BACKWARD_11_H +#define BACKWARD_11_H + +static int upgrade_brtheader_11_12 (int fd, struct brt_header **brth_11, struct brt_header **brth_12); +static int upgrade_brtnode_11_12 (BRTNODE *brtnode_11, BRTNODE *brtnode_12); + +static int deserialize_brtheader_11 (int fd, struct rbuf *rb, struct brt_header **brth); + +static int decompress_brtnode_from_raw_block_into_rbuf_11(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum); +static int deserialize_brtnode_from_rbuf_11 (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb); + +#endif + diff --git a/newbrt/block_table.c b/newbrt/block_table.c index fddd8857663..e03c5edc1c4 100644 --- a/newbrt/block_table.c +++ b/newbrt/block_table.c @@ -84,11 +84,13 @@ static void brtheader_set_dirty(struct brt_header *h, BOOL for_checkpoint){ assert(h->blocktable->is_locked); assert(h->type == BRTHEADER_CURRENT); - h->dirty = 1; if (for_checkpoint) { assert(h->checkpoint_header->type == BRTHEADER_CHECKPOINT_INPROGRESS); h->checkpoint_header->dirty = 1; } + else { + h->dirty = 1; + } } //fd is protected (must be holding fdlock) @@ -131,6 +133,22 @@ copy_translation(struct translation * dst, struct translation * src, enum transl dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = diskoff_unused; } +int64_t +toku_block_get_blocks_in_use_unlocked(BLOCK_TABLE bt) { + BLOCKNUM b; + struct translation *t = &bt->current; + int64_t num_blocks = 0; + { + //Reserved blocknums do not get upgraded; They are part of the header. 
+ for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) { + if (t->block_translation[b.b].size != size_is_free) { + num_blocks++; + } + } + } + return num_blocks; +} + static void maybe_optimize_translation(struct translation *t) { //Reduce 'smallest_never_used_blocknum.b' (completely free blocknums instead of just @@ -727,7 +745,14 @@ static void translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize DISKOFF location_on_disk, //Location of translation_buffer u_int64_t size_on_disk, - unsigned char * translation_buffer) { // buffer with serialized translation + unsigned char * translation_buffer +#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_11 + , BOOL invert_checksum +#else +#error The above code block is obsolete +#endif + + ) { // buffer with serialized translation assert(location_on_disk!=0); t->type = TRANSLATION_CHECKPOINTED; { @@ -736,6 +761,13 @@ translation_deserialize_from_buffer(struct translation *t, // destination int u_int64_t offset = size_on_disk - 4; //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk); u_int32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset)); +#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_11 + if (invert_checksum) { + x1764 = ~x1764; + } +#else +#error The above code block is obsolete +#endif assert(x1764 == stored_x1764); } struct rbuf rt; @@ -783,9 +815,10 @@ void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, //Location of translation_buffer DISKOFF size_on_disk, - unsigned char *translation_buffer) { + unsigned char *translation_buffer, + BOOL invert_checksum) { BLOCK_TABLE bt = blocktable_create_internal(); - translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer); + translation_deserialize_from_buffer(&bt->checkpointed, 
location_on_disk, size_on_disk, translation_buffer, invert_checksum); blocktable_note_translation(bt->block_allocator, &bt->checkpointed); // we just filled in checkpointed, now copy it to current. copy_translation(&bt->current, &bt->checkpointed, TRANSLATION_CURRENT); diff --git a/newbrt/block_table.h b/newbrt/block_table.h index 113adb2e91a..6b9c953e77e 100644 --- a/newbrt/block_table.h +++ b/newbrt/block_table.h @@ -21,7 +21,7 @@ struct block_translation_pair { }; void toku_blocktable_create_new(BLOCK_TABLE *btp); -void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer); +void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer, BOOL invert_checksum); void toku_blocktable_destroy(BLOCK_TABLE *btp); void toku_brtheader_lock(struct brt_header *h); @@ -73,6 +73,8 @@ void toku_block_table_get_fragmentation_unlocked(BLOCK_TABLE bt, TOKU_DB_FRAGMEN //Requires: blocktable lock is held. //Requires: report->file_size_bytes is already filled in. +int64_t toku_block_get_blocks_in_use_unlocked(BLOCK_TABLE bt); + //Unmovable reserved first, then reallocable. // We reserve one blocknum for the translation table itself. enum {RESERVED_BLOCKNUM_NULL =0, diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index ea3a10774c7..de0c28c3df0 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -88,7 +88,6 @@ typedef struct brtnode *BRTNODE; /* Internal nodes. 
*/ struct brtnode { enum typ_tag tag; - struct descriptor *desc; unsigned int nodesize; int ever_been_written; unsigned int flags; @@ -170,11 +169,12 @@ struct brt_header { int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging) int layout_version_read_from_disk; // transient, not serialized to disk BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been) + uint64_t num_blocks_to_upgrade; // Number of blocks still not newest version. When we release layout 13 we may need to turn this to an array. unsigned int nodesize; BLOCKNUM root; // roots of the dictionary struct remembered_hash root_hash; // hash of the root offset. unsigned int flags; - struct descriptor descriptor; + DESCRIPTOR_S descriptor; u_int64_t root_put_counter; // the generation number of the brt @@ -200,8 +200,7 @@ struct brt { unsigned int flags; BOOL did_set_flags; BOOL did_set_descriptor; - struct descriptor temp_descriptor; - toku_dbt_upgradef dbt_userformat_upgrade; + DESCRIPTOR_S temp_descriptor; int (*compare_fun)(DB*,const DBT*,const DBT*); int (*dup_compare)(DB*,const DBT*,const DBT*); DB *db; // To pass to the compare fun, and close once transactions are done. @@ -230,14 +229,14 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? 
*/ int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); -void toku_verify_counts(BRTNODE); +void toku_verify_or_set_counts(BRTNODE, BOOL); int toku_serialize_brt_header_size (struct brt_header *h); int toku_serialize_brt_header_to (int fd, struct brt_header *h); int toku_serialize_brt_header_to_wbuf (struct wbuf *, struct brt_header *h, int64_t address_translation, int64_t size_translation); int toku_deserialize_brtheader_from (int fd, struct brt_header **brth); -int toku_serialize_descriptor_contents_to_fd(int fd, const struct descriptor *desc, DISKOFF offset); -void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const struct descriptor *desc); +int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset); +void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc); void toku_brtnode_free (BRTNODE *node); @@ -347,10 +346,10 @@ enum brt_layout_version_e { BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression. BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one. - BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE' + BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks BRT_NEXT_VERSION, // the version after the current version BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line. 
- BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION // Minimum version supported without transparent upgrade + BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported }; void toku_brtheader_free (struct brt_header *h); @@ -364,6 +363,15 @@ int toku_db_badformat(void); int toku_brt_remove_on_commit(TOKUTXN child, DBT* iname_dbt_p); int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p); + +typedef struct brt_upgrade_status { + u_int64_t header; + u_int64_t nonleaf; + u_int64_t leaf; +} BRT_UPGRADE_STATUS_S, *BRT_UPGRADE_STATUS; + +void toku_brt_get_upgrade_status(BRT_UPGRADE_STATUS); + C_END #endif diff --git a/newbrt/brt-serialize.c b/newbrt/brt-serialize.c index f9344be3545..6e4ef7038d0 100644 --- a/newbrt/brt-serialize.c +++ b/newbrt/brt-serialize.c @@ -4,14 +4,21 @@ #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #include "includes.h" +#include "toku_atomic.h" -#include "backwards_10.h" +#include "backwards_11.h" // NOTE: The backwards compatability functions are in a file that is included at the END of this file. 
-static int deserialize_brtheader_10 (int fd, struct rbuf *rb, struct brt_header **brth); -static int upgrade_brtheader_10_11 (struct brt_header **brth_10, struct brt_header **brth_11); -static int decompress_brtnode_from_raw_block_into_rbuf_10(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum); -static int deserialize_brtnode_from_rbuf_10 (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb); -static int upgrade_brtnode_10_11 (BRTNODE *brtnode_10, BRTNODE *brtnode_11); + + +static BRT_UPGRADE_STATUS_S upgrade_status; // accountability, used in backwards_x.c + +void +toku_brt_get_upgrade_status (BRT_UPGRADE_STATUS s) { + *s = upgrade_status; +} + + + // performance tracing #define DO_TOKU_TRACE 0 @@ -172,8 +179,7 @@ enum { 4+ // layout_version 4), // layout_version_original - extended_node_header_overhead = (0+ // descriptor (variable, not counted here) - 4+ // nodesize + extended_node_header_overhead = (4+ // nodesize 4+ // flags 4+ // height 4+ // random for fingerprint @@ -194,7 +200,6 @@ addupsize (OMTVALUE lev, u_int32_t UU(idx), void *vp) { static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) { unsigned int size = node_header_overhead + extended_node_header_overhead; - size += toku_serialize_descriptor_size(node->desc); if (node->height > 0) { unsigned int hsize=0; unsigned int csize=0; @@ -236,7 +241,6 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) { unsigned int result = node_header_overhead + extended_node_header_overhead; assert(sizeof(toku_off_t)==8); - result += toku_serialize_descriptor_size(node->desc); if (node->height > 0) { result += 4; /* subtree fingerpirnt */ result += 4; /* n_children */ @@ -277,9 +281,6 @@ serialize_node_header(BRTNODE node, struct wbuf *wbuf) { wbuf_nocrc_int(wbuf, node->layout_version); wbuf_nocrc_int(wbuf, node->layout_version_original); - // serialize the descriptor - toku_serialize_descriptor_contents_to_wbuf(wbuf, node->desc); - //printf("%s:%d 
%lld.calculated_size=%d\n", __FILE__, __LINE__, off, calculated_size); wbuf_nocrc_uint(wbuf, node->nodesize); wbuf_nocrc_uint(wbuf, node->flags); @@ -518,8 +519,6 @@ toku_serialize_brtnode_to_memory (BRTNODE node, int UU(n_workitems), int UU(n_th int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint) { - assert(node->desc == &h->descriptor); - size_t n_to_write; char *compressed_buf; { @@ -550,7 +549,7 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h return 0; } -static void deserialize_descriptor_from_rbuf(struct rbuf *rb, struct descriptor *desc, BOOL temporary); +static void deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, BOOL temporary); #include "workset.h" @@ -843,7 +842,6 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b if (0) { died0: toku_free(result); } return r; } - result->desc = &h->descriptor; result->ever_been_written = 1; //printf("Deserializing %lld datasize=%d\n", off, datasize); @@ -853,13 +851,6 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b assert(result->layout_version == BRT_LAYOUT_VERSION); result->layout_version_original = rbuf_int(rb); result->layout_version_read_from_disk = result->layout_version; - { - //Restrict scope for now since we do not support upgrades. - struct descriptor desc; - //desc.dbt.data is TEMPORARY. Will be unusable when the rc buffer is freed. - deserialize_descriptor_from_rbuf(rb, &desc, TRUE); - assert(desc.version == result->desc->version); //We do not yet support upgrading the dbts. 
- } result->nodesize = rbuf_int(rb); result->thisnodename = blocknum; @@ -892,10 +883,8 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b } static int -decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { +decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) { toku_trace("decompress"); - int r; - // get the number of compressed sub blocks int n_sub_blocks; n_sub_blocks = toku_dtoh32(*(u_int32_t*)(&raw_block[node_header_overhead])); @@ -903,6 +892,15 @@ decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKN // verify the number of sub blocks assert(0 <= n_sub_blocks && n_sub_blocks <= max_sub_blocks); + { // verify the header checksum + u_int32_t header_length = node_header_overhead + sub_block_header_size(n_sub_blocks); + assert(header_length <= raw_block_size); + u_int32_t xsum = x1764_memory(raw_block, header_length); + u_int32_t stored_xsum = toku_dtoh32(*(u_int32_t *)(raw_block + header_length)); + assert(xsum == stored_xsum); + } + int r; + // deserialize the sub block header struct sub_block sub_block[n_sub_blocks]; u_int32_t *sub_block_header = (u_int32_t *) &raw_block[node_header_overhead+4]; @@ -954,14 +952,14 @@ decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, struct rbuf *rb, BLOCKN } static int -decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, struct rbuf *rb, BLOCKNUM blocknum) { +decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) { int r; switch (version) { - case BRT_LAYOUT_VERSION_10: - r = decompress_brtnode_from_raw_block_into_rbuf_10(raw_block, rb, blocknum); + case BRT_LAYOUT_VERSION_11: + r = decompress_brtnode_from_raw_block_into_rbuf_11(raw_block, rb, blocknum); break; case BRT_LAYOUT_VERSION: - r = 
decompress_from_raw_block_into_rbuf(raw_block, rb, blocknum); + r = decompress_from_raw_block_into_rbuf(raw_block, raw_block_size, rb, blocknum); break; default: assert(FALSE); @@ -972,26 +970,32 @@ decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_b static int deserialize_brtnode_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb) { int r = 0; - BRTNODE brtnode_10 = NULL; BRTNODE brtnode_11 = NULL; + BRTNODE brtnode_12 = NULL; int upgrade = 0; switch (version) { - case BRT_LAYOUT_VERSION_10: + case BRT_LAYOUT_VERSION_11: if (!upgrade) - r = deserialize_brtnode_from_rbuf_10(blocknum, fullhash, &brtnode_10, h, rb); + r = deserialize_brtnode_from_rbuf_11(blocknum, fullhash, &brtnode_11, h, rb); upgrade++; if (r==0) - r = upgrade_brtnode_10_11(&brtnode_10, &brtnode_11); + r = upgrade_brtnode_11_12(&brtnode_11, &brtnode_12); //Fall through on purpose. case BRT_LAYOUT_VERSION: if (!upgrade) - r = deserialize_brtnode_from_rbuf(blocknum, fullhash, &brtnode_11, h, rb); + r = deserialize_brtnode_from_rbuf(blocknum, fullhash, &brtnode_12, h, rb); if (r==0) { - assert(brtnode_11); - *brtnode = brtnode_11; + assert(brtnode_12); + *brtnode = brtnode_12; + } + if (upgrade && r == 0) { + toku_brtheader_lock(h); + assert(h->num_blocks_to_upgrade>0); + h->num_blocks_to_upgrade--; + toku_brtheader_unlock(h); + (*brtnode)->dirty = 1; } - if (upgrade && r == 0) (*brtnode)->dirty = 1; break; // this is the only break default: assert(FALSE); @@ -1037,15 +1041,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, } } - // verify the header checksum - u_int32_t n_sub_blocks = toku_dtoh32(*(u_int32_t *)(raw_block + node_header_overhead)); - u_int32_t header_length = node_header_overhead + sub_block_header_size(n_sub_blocks); - assert(header_length <= size); - u_int32_t xsum = x1764_memory(raw_block, header_length); - u_int32_t stored_xsum = 
toku_dtoh32(*(u_int32_t *)(raw_block + header_length)); - assert(xsum == stored_xsum); - - r = decompress_from_raw_block_into_rbuf_versioned(layout_version, raw_block, rb, blocknum); + r = decompress_from_raw_block_into_rbuf_versioned(layout_version, raw_block, size, rb, blocknum); if (r!=0) goto cleanup; *layout_version_p = layout_version; @@ -1097,8 +1093,8 @@ toku_maybe_upgrade_brt(BRT t) { // possibly do some work to complete the version int version = t->h->layout_version_read_from_disk; if (!t->h->upgrade_brt_performed) { switch (version) { - case BRT_LAYOUT_VERSION_10: - r = toku_brt_broadcast_commit_all(t); + case BRT_LAYOUT_VERSION_11: + r = 0; //Fall through on purpose. case BRT_LAYOUT_VERSION: if (r == 0) { @@ -1144,7 +1140,8 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) { return 0; } -void toku_verify_counts (BRTNODE node) { +void +toku_verify_or_set_counts (BRTNODE node, BOOL set_fingerprints) { /*foo*/ if (node->height==0) { assert(node->u.l.buffer); @@ -1155,6 +1152,9 @@ void toku_verify_counts (BRTNODE node) { assert(sum_info.msum == node->u.l.buffer_mempool.free_offset - node->u.l.buffer_mempool.frag_size); u_int32_t fps = node->rand4fingerprint * sum_info.fp; + if (set_fingerprints) { + node->local_fingerprint = fps; + } assert(fps==node->local_fingerprint); } else { unsigned int sum = 0; @@ -1162,6 +1162,17 @@ void toku_verify_counts (BRTNODE node) { sum += BNC_NBYTESINBUF(node,i); // We don't rally care of the later buffers have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized. // But for now the code always initializes the later tables so they are 0. 
+ uint32_t fp = 0; + int i; + for (i=0; i<node->u.n.n_children; i++) + FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xid, + { + fp += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen); + }); + if (set_fingerprints) { + node->local_fingerprint = fp; + } + assert(fp==node->local_fingerprint); assert(sum==node->u.n.n_bytes_in_buffers); } } @@ -1171,12 +1182,12 @@ serialize_brt_header_min_size (u_int32_t version) { u_int32_t size = 0; switch(version) { case BRT_LAYOUT_VERSION_12: - case BRT_LAYOUT_VERSION_11: - size += 4; // original_version + size += 8; // Number of blocks in old version. // fall through to add up bytes in previous version - case BRT_LAYOUT_VERSION_10: + case BRT_LAYOUT_VERSION_11: size += (+8 // "tokudata" +4 // version + +4 // original_version +4 // size +8 // byte order verification +8 // checkpoint_count @@ -1221,6 +1232,7 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h, wbuf_BLOCKNUM(wbuf, h->root); wbuf_int(wbuf, h->flags); wbuf_int(wbuf, h->layout_version_original); + wbuf_ulonglong(wbuf, h->num_blocks_to_upgrade); u_int32_t checksum = x1764_finish(&wbuf->checksum); wbuf_int(wbuf, checksum); assert(wbuf->ndone == wbuf->size); @@ -1287,7 +1299,7 @@ int toku_serialize_brt_header_to (int fd, struct brt_header *h) { } u_int32_t -toku_serialize_descriptor_size(const struct descriptor *desc) { +toku_serialize_descriptor_size(const DESCRIPTOR desc) { //Checksum NOT included in this. Checksum only exists in header's version.
u_int32_t size = 4+ //version 4; //size @@ -1296,7 +1308,7 @@ toku_serialize_descriptor_size(const struct descriptor *desc) { } void -toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const struct descriptor *desc) { +toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc) { if (desc->version==0) assert(desc->dbt.size==0); wbuf_int(wb, desc->version); wbuf_bytes(wb, desc->dbt.data, desc->dbt.size); @@ -1306,7 +1318,7 @@ toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const struct descrip //descriptor. //Descriptors are NOT written during the header checkpoint process. int -toku_serialize_descriptor_contents_to_fd(int fd, const struct descriptor *desc, DISKOFF offset) { +toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset) { int r = 0; // make the checksum int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum @@ -1330,7 +1342,7 @@ toku_serialize_descriptor_contents_to_fd(int fd, const struct descriptor *desc, } static void -deserialize_descriptor_from_rbuf(struct rbuf *rb, struct descriptor *desc, BOOL temporary) { +deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, BOOL temporary) { desc->version = rbuf_int(rb); u_int32_t size; bytevec data; @@ -1351,7 +1363,7 @@ deserialize_descriptor_from_rbuf(struct rbuf *rb, struct descriptor *desc, BOOL } static void -deserialize_descriptor_from(int fd, struct brt_header *h, struct descriptor *desc) { +deserialize_descriptor_from(int fd, struct brt_header *h, DESCRIPTOR desc) { DISKOFF offset; DISKOFF size; toku_get_descriptor_offset_size(h->blocktable, &offset, &size); @@ -1454,7 +1466,8 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) { toku_blocktable_create_from_buffer(&h->blocktable, translation_address_on_disk, translation_size_on_disk, - tbuf); + tbuf, + FALSE /*not version 11 or older */ ); toku_free(tbuf); } @@ -1463,6 +1476,7 @@ deserialize_brtheader (int fd, struct rbuf 
*rb, struct brt_header **brth) { h->flags = rbuf_int(&rc); deserialize_descriptor_from(fd, h, &h->descriptor); h->layout_version_original = rbuf_int(&rc); + h->num_blocks_to_upgrade = rbuf_ulonglong(&rc); (void)rbuf_int(&rc); //Read in checksum and ignore (already verified). if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;} toku_free(rc.buf); @@ -1473,31 +1487,36 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) { -//TODO: When version 12 exists, add case for version 11 that looks like version 10 case, -// but calls deserialize_brtheader_11() and upgrade_11_12() +//TODO: When version 13 exists, add case for version 12 that looks like version 10 case, +// but calls deserialize_brtheader_12() and upgrade_12_13() static int deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **brth, u_int32_t version) { int rval; - struct brt_header *brth_10 = NULL; struct brt_header *brth_11 = NULL; + struct brt_header *brth_12 = NULL; int upgrade = 0; switch(version) { - case BRT_LAYOUT_VERSION_10: + case BRT_LAYOUT_VERSION_11: if (!upgrade) - rval = deserialize_brtheader_10(fd, rb, &brth_10); + rval = deserialize_brtheader_11(fd, rb, &brth_11); upgrade++; if (rval == 0) - rval = upgrade_brtheader_10_11(&brth_10, &brth_11); + rval = upgrade_brtheader_11_12(fd, &brth_11, &brth_12); //Fall through on purpose. 
case BRT_LAYOUT_VERSION: if (!upgrade) - rval = deserialize_brtheader (fd, rb, &brth_11); + rval = deserialize_brtheader (fd, rb, &brth_12); if (rval == 0) { - assert(brth_11); - *brth = brth_11; + assert(brth_12); + *brth = brth_12; } - if (upgrade && rval == 0) (*brth)->dirty = 1; + if (upgrade && rval == 0) { + toku_brtheader_lock(*brth); + (*brth)->num_blocks_to_upgrade = toku_block_get_blocks_in_use_unlocked((*brth)->blocktable); + (*brth)->dirty = 1; + toku_brtheader_unlock(*brth); + } break; // this is the only break default: assert(FALSE); @@ -1582,6 +1601,13 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf * //Verify checksum u_int32_t calculated_x1764 = x1764_memory(rb->buf, rb->size-4); u_int32_t stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4)); +#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_11 + if (version<=BRT_LAYOUT_VERSION_11) { + calculated_x1764 = ~calculated_x1764; + } +#else +#error The above code block is obsolete +#endif if (calculated_x1764!=stored_x1764) r = TOKUDB_DICTIONARY_NO_HEADER; //Header useless } if (r==0) { @@ -1869,6 +1895,7 @@ static int deserialize_rollback_log_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *log, struct brt_header *h, struct rbuf *rb) { + //Upgrade is not necessary really here. Rollback log nodes do not survive version changes. 
int r = 0; ROLLBACK_LOG_NODE rollback_log_node = NULL; @@ -1923,5 +1950,5 @@ cleanup: // NOTE: Backwards compatibility functions are in the included .c file(s): -#include "backwards_10.c" +#include "backwards_11.c" diff --git a/newbrt/brt-test-helpers.c b/newbrt/brt-test-helpers.c index 73b2aff9e31..b5d41a16728 100644 --- a/newbrt/brt-test-helpers.c +++ b/newbrt/brt-test-helpers.c @@ -73,7 +73,7 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt); if (r!=0) return r; BRTNODE node=node_v; - toku_verify_counts(node); + toku_verify_or_set_counts(node, FALSE); assert(node->height==0); size_t lesize, disksize; @@ -114,7 +114,7 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke node->dirty=1; *subtree_fingerprint = node->local_fingerprint; - toku_verify_counts(node); + toku_verify_or_set_counts(node, FALSE); r = toku_unpin_brtnode(brt, node_v); return r; diff --git a/newbrt/brt-verify.c b/newbrt/brt-verify.c index d2c4d04c00e..bba69b27a66 100644 --- a/newbrt/brt-verify.c +++ b/newbrt/brt-verify.c @@ -31,7 +31,7 @@ static void verify_local_fingerprint (BRTNODE node) { }); assert(fp==node->local_fingerprint); } else { - toku_verify_counts(node); + toku_verify_or_set_counts(node, FALSE); } } diff --git a/newbrt/brt.c b/newbrt/brt.c index 49d5128b3c4..07bec4e317b 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -227,7 +227,7 @@ int toku_brt_debug_mode = 0; //#define SLOW #ifdef SLOW -#define VERIFY_NODE(t,n) (toku_verify_counts(n), toku_verify_estimates(t,n)) +#define VERIFY_NODE(t,n) (toku_verify_or_set_counts(n, FALSE), toku_verify_estimates(t,n)) #else #define VERIFY_NODE(t,n) ((void)0) #endif @@ -648,7 +648,6 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, size_ // Effect: Fill in N as an empty brtnode. 
{ n->tag = TYP_BRTNODE; - n->desc = &t->h->descriptor; n->nodesize = t->h->nodesize; n->flags = t->flags; n->thisnodename = nodename; @@ -3009,6 +3008,7 @@ brt_init_header_partial (BRT t) { if (t->h->cf!=NULL) assert(t->h->cf == t->cf); t->h->cf = t->cf; t->h->nodesize=t->nodesize; + t->h->num_blocks_to_upgrade = 0; compute_and_fill_remembered_hash(t); @@ -3152,6 +3152,44 @@ verify_builtin_comparisons_consistent(BRT t, u_int32_t flags) { return 0; } + +//if r==0, then frees/takes over descriptor_dbt.data +int +toku_maybe_upgrade_descriptor(BRT t, DESCRIPTOR d, BOOL do_log, TOKUTXN txn) { + int r = 0; + //txn is only for access to logger + if (t->h->descriptor.version!=d->version || + t->h->descriptor.dbt.size!=d->dbt.size || + memcmp(t->h->descriptor.dbt.data, d->dbt.data, d->dbt.size)) { + if (d->version <= t->h->descriptor.version) { + //Changing descriptor requires upping the version. + r = EINVAL; + goto cleanup; + } + if (do_log) { + //If we didn't log fcreate (which contains descriptor) + //we need to log descriptor now. + r = toku_logger_log_descriptor(txn, toku_cachefile_filenum(t->cf), d); + if (r!=0) goto cleanup; + } + DISKOFF offset; + //4 for checksum + toku_realloc_descriptor_on_disk(t->h->blocktable, toku_serialize_descriptor_size(d)+4, &offset, t->h); + { + int fd = toku_cachefile_get_and_pin_fd (t->cf); + r = toku_serialize_descriptor_contents_to_fd(fd, d, offset); + toku_cachefile_unpin_fd(t->cf); + } + if (r!=0) goto cleanup; + if (t->h->descriptor.dbt.data) toku_free(t->h->descriptor.dbt.data); + t->h->descriptor = *d; + } + else toku_free(d->dbt.data); + d->dbt.data = NULL; +cleanup: + return r; +} + // This is the actual open, used for various purposes, such as normal use, recovery, and redirect. 
// fname_in_env is the iname, relative to the env_dir (data_dir is already in iname as prefix) static int @@ -3172,7 +3210,6 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET assert(is_create || !only_create); t->db = db; - BOOL log_fopen = FALSE; // set true if we're opening a pre-existing file BOOL did_create = FALSE; FILENUM reserved_filenum = use_filenum; { @@ -3208,8 +3245,6 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET fname_in_env, use_reserved_filenum||did_create, reserved_filenum, did_create); if (r != 0) goto died1; - if (!did_create) - log_fopen = TRUE; //Log of fopen must be delayed till flags are available } if (r!=0) { died_after_open: @@ -3254,12 +3289,22 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET } } - int use_reserved_dict_id = use_dictionary_id.dictid != DICTIONARY_ID_NONE.dictid; if (!was_already_open) { - if (log_fopen) { //Only log the fopen that OPENs the file. If it was already open, don't log. + if (!did_create) { //Only log the fopen that OPENs the file. If it was already open, don't log. 
r = toku_logger_log_fopen(txn, fname_in_env, toku_cachefile_filenum(t->cf), t->flags); if (r!=0) goto died_after_read_and_pin; } + } + if (t->did_set_descriptor) { + r = toku_maybe_upgrade_descriptor(t, &t->temp_descriptor, !did_create, txn); + if (r!=0) { + toku_free(t->temp_descriptor.dbt.data); + goto died_after_read_and_pin; + } + t->did_set_descriptor = FALSE; + } + int use_reserved_dict_id = use_dictionary_id.dictid != DICTIONARY_ID_NONE.dictid; + if (!was_already_open) { DICTIONARY_ID dict_id; if (use_reserved_dict_id) dict_id = use_dictionary_id; @@ -3275,40 +3320,6 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET assert(t->h); assert(t->h->dict_id.dictid != DICTIONARY_ID_NONE.dictid); assert(t->h->dict_id.dictid < dict_id_serial); - if (t->did_set_descriptor) { - if (t->h->descriptor.version!=t->temp_descriptor.version || - t->h->descriptor.dbt.size!=t->temp_descriptor.dbt.size || - memcmp(t->h->descriptor.dbt.data, t->temp_descriptor.dbt.data, t->temp_descriptor.dbt.size)) { - if (t->temp_descriptor.version <= t->h->descriptor.version) { - //Changing descriptor requires upping the version. - r = EINVAL; - goto died_after_read_and_pin; - } - toku_brtheader_lock(t->h); - if (!toku_list_empty(&t->h->live_brts) || !toku_list_empty(&t->h->zombie_brts)) { - //Disallow changing if exists two brts with the same header (counting this one) - //The upgrade would be impossible/very hard! 
- r = EINVAL; - toku_brtheader_unlock(t->h); - goto died_after_read_and_pin; - } - toku_brtheader_unlock(t->h); - DISKOFF offset; - //4 for checksum - toku_realloc_descriptor_on_disk(t->h->blocktable, toku_serialize_descriptor_size(&t->temp_descriptor)+4, &offset, t->h); - { - int fd = toku_cachefile_get_and_pin_fd (t->cf); - r = toku_serialize_descriptor_contents_to_fd(fd, &t->temp_descriptor, offset); - toku_cachefile_unpin_fd(t->cf); - } - if (r!=0) goto died_after_read_and_pin; - if (t->h->descriptor.dbt.data) toku_free(t->h->descriptor.dbt.data); - t->h->descriptor = t->temp_descriptor; - } - else toku_free(t->temp_descriptor.dbt.data); - t->temp_descriptor.dbt.data = NULL; - t->did_set_descriptor = FALSE; - } r = toku_maybe_upgrade_brt(t); // possibly do some work to complete the version upgrade of brt if (r!=0) goto died_after_read_and_pin; @@ -3316,7 +3327,7 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET // brtheader_note_brt_open must be after all functions that can fail. r = brtheader_note_brt_open(t); if (r!=0) goto died_after_read_and_pin; - if (t->db) t->db->descriptor = &t->h->descriptor.dbt; + if (t->db) t->db->descriptor = &t->h->descriptor; if (txn_created) { assert(txn); assert(t->h->txnid_that_created_or_locked_when_empty == TXNID_NONE); @@ -3353,15 +3364,6 @@ toku_brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, C return r; } -static int -abort_on_upgrade(DB* UU(pdb), - u_int32_t UU(old_version), const DBT *UU(old_descriptor), const DBT *UU(old_key), const DBT *UU(old_val), - u_int32_t UU(new_version), const DBT *UU(new_descriptor), const DBT *UU(new_key), const DBT *UU(new_val)) { - assert(FALSE); //Must not upgrade. - return ENOSYS; -} - - // Open a brt for use by redirect. The new brt must have the same dict_id as the old_brt passed in. (FILENUM is assigned by the brt_open() function.) 
static int brt_open_for_redirect(BRT *new_brtp, const char *fname_in_env, TOKUTXN txn, BRT old_brt) { @@ -3380,7 +3382,7 @@ brt_open_for_redirect(BRT *new_brtp, const char *fname_in_env, TOKUTXN txn, BRT r = toku_brt_set_nodesize(t, old_brt->nodesize); assert(r==0); if (old_h->descriptor.version>0) { - r = toku_brt_set_descriptor(t, old_h->descriptor.version, &old_h->descriptor.dbt, abort_on_upgrade); + r = toku_brt_set_descriptor(t, old_h->descriptor.version, &old_h->descriptor.dbt); assert(r==0); } CACHETABLE ct = toku_cachefile_get_cachetable(old_brt->cf); @@ -3503,7 +3505,7 @@ dictionary_redirect_internal(const char *dst_fname_in_env, struct brt_header *sr //Do not need to swap descriptors pointers. //Done by brt_open_for_redirect - assert(dst_brt->db->descriptor == &dst_brt->h->descriptor.dbt); + assert(dst_brt->db->descriptor == &dst_brt->h->descriptor); //Set db->i->brt to new brt brt_redirect_db(dst_brt, src_brt); @@ -4032,11 +4034,10 @@ int toku_brt_create(BRT *brt_ptr) { } int -toku_brt_set_descriptor (BRT t, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) { +toku_brt_set_descriptor (BRT t, u_int32_t version, const DBT* descriptor) { int r; if (t->did_set_descriptor) r = EINVAL; else if (version==0) r = EINVAL; //0 is reserved for default (no descriptor). - else if (dbt_userformat_upgrade==NULL) r = EINVAL; //Must have an upgrade function. 
else { void *copy = toku_memdup(descriptor->data, descriptor->size); if (!copy) r = ENOMEM; @@ -4044,8 +4045,6 @@ toku_brt_set_descriptor (BRT t, u_int32_t version, const DBT* descriptor, toku_d t->temp_descriptor.version = version; assert(!t->temp_descriptor.dbt.data); toku_fill_dbt(&t->temp_descriptor.dbt, copy, descriptor->size); - assert(!t->dbt_userformat_upgrade); - t->dbt_userformat_upgrade = dbt_userformat_upgrade; t->did_set_descriptor = TRUE; r = 0; } @@ -5739,14 +5738,6 @@ int toku_brt_set_panic(BRT brt, int panic, char *panic_string) { return toku_brt_header_set_panic(brt->h, panic, panic_string); } -//Wrapper functions for upgrading from version 10. -#include "backwards_10.h" -void -toku_calculate_leaf_stats (BRTNODE node) { - assert(node->height == 0); - node->u.l.leaf_stats = calc_leaf_stats(node); -} - #if 0 int toku_logger_save_rollback_fdelete (TOKUTXN txn, u_int8_t file_was_open, FILENUM filenum, BYTESTRING iname) { diff --git a/newbrt/brt.h b/newbrt/brt.h index 7534ce6c23c..752b8173b02 100644 --- a/newbrt/brt.h +++ b/newbrt/brt.h @@ -32,16 +32,17 @@ typedef int(*BRT_GET_CALLBACK_FUNCTION)(ITEMLEN, bytevec, ITEMLEN, bytevec, void typedef int(*BRT_GET_STRADDLE_CALLBACK_FUNCTION)(ITEMLEN, bytevec, ITEMLEN, bytevec, ITEMLEN, bytevec, ITEMLEN, bytevec, void*); int toku_open_brt (const char *fname, int is_create, BRT *, int nodesize, CACHETABLE, TOKUTXN, int(*)(DB*,const DBT*,const DBT*), DB*); +int toku_maybe_upgrade_descriptor(BRT t, DESCRIPTOR d, BOOL do_log, TOKUTXN txn); int toku_dictionary_redirect (const char *dst_fname_in_env, BRT old_brt, TOKUTXN txn); // See the brt.c file for what this toku_redirect_brt does int toku_dictionary_redirect_abort(struct brt_header *old_h, struct brt_header *new_h, TOKUTXN txn); -u_int32_t toku_serialize_descriptor_size(const struct descriptor *desc); +u_int32_t toku_serialize_descriptor_size(const DESCRIPTOR desc); int toku_brt_create(BRT *); int toku_brt_set_flags(BRT, unsigned int flags); -int 
toku_brt_set_descriptor (BRT t, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade); +int toku_brt_set_descriptor (BRT t, u_int32_t version, const DBT* descriptor); int toku_brt_get_flags(BRT, unsigned int *flags); int toku_brt_set_nodesize(BRT, unsigned int nodesize); int toku_brt_get_nodesize(BRT, unsigned int *nodesize); diff --git a/newbrt/brtloader-internal.h b/newbrt/brtloader-internal.h index 3df67543276..6049757e183 100644 --- a/newbrt/brtloader-internal.h +++ b/newbrt/brtloader-internal.h @@ -121,7 +121,7 @@ struct brtloader_s { DB *src_db; int N; DB **dbs; // N of these - const struct descriptor **descriptors; // N of these. + DESCRIPTOR *descriptors; // N of these. const char **new_fnames_in_env; // N of these. The file names that the final data will be written to (relative to env). uint64_t *extracted_datasizes; // N of these. @@ -170,7 +170,7 @@ u_int64_t toku_brt_loader_get_n_rows(BRTLOADER bl); // The data passed into a fractal_thread via pthread_create. struct fractal_thread_args { BRTLOADER bl; - const struct descriptor *descriptor; + const DESCRIPTOR descriptor; int fd; // write the brt into tfd. int progress_allocation; QUEUE q; @@ -195,14 +195,14 @@ int mergesort_row_array (struct row rows[/*n*/], int n, int which_db, DB *dest_d CILK_END -//int write_file_to_dbfile (int outfile, FIDX infile, BRTLOADER bl, const struct descriptor *descriptor, int progress_allocation); +//int write_file_to_dbfile (int outfile, FIDX infile, BRTLOADER bl, const DESCRIPTOR descriptor, int progress_allocation); int toku_merge_some_files_using_dbufio (const BOOL to_q, FIDX dest_data, QUEUE q, int n_sources, DBUFIO_FILESET bfs, FIDX srcs_fidxs[/*n_sources*/], BRTLOADER bl, int which_db, DB *dest_db, brt_compare_func compare, int progress_allocation); int brt_loader_sort_and_write_rows (struct rowset *rows, struct merge_fileset *fs, BRTLOADER bl, int which_db, DB *dest_db, brt_compare_func); // This is probably only for testing. 
int toku_loader_write_brt_from_q_in_C (BRTLOADER bl, - const struct descriptor *descriptor, + const DESCRIPTOR descriptor, int fd, // write to here int progress_allocation, QUEUE q, @@ -210,7 +210,7 @@ int toku_loader_write_brt_from_q_in_C (BRTLOADER bl, int brt_loader_mergesort_row_array (struct row rows[/*n*/], int n, int which_db, DB *dest_db, brt_compare_func, BRTLOADER, struct rowset *); -int brt_loader_write_file_to_dbfile (int outfile, FIDX infile, BRTLOADER bl, const struct descriptor *descriptor, int progress_allocation); +int brt_loader_write_file_to_dbfile (int outfile, FIDX infile, BRTLOADER bl, const DESCRIPTOR descriptor, int progress_allocation); int brtloader_init_file_infos (struct file_infos *fi); void brtloader_fi_destroy (struct file_infos *fi, BOOL is_error); @@ -223,7 +223,7 @@ int toku_brt_loader_internal_init (/* out */ BRTLOADER *blp, generate_row_for_put_func g, DB *src_db, int N, DB*dbs[/*N*/], - const struct descriptor *descriptors[/*N*/], + const DESCRIPTOR descriptors[/*N*/], const char *new_fnames_in_env[/*N*/], brt_compare_func bt_compare_functions[/*N*/], const char *temp_file_template, diff --git a/newbrt/brtloader.c b/newbrt/brtloader.c index 9691ebb5a90..85184c8d34a 100644 --- a/newbrt/brtloader.c +++ b/newbrt/brtloader.c @@ -385,7 +385,7 @@ int toku_brt_loader_internal_init (/* out */ BRTLOADER *blp, generate_row_for_put_func g, DB *src_db, int N, DB*dbs[/*N*/], - const struct descriptor *descriptors[/*N*/], + const DESCRIPTOR descriptors[/*N*/], const char *new_fnames_in_env[/*N*/], brt_compare_func bt_compare_functions[/*N*/], const char *temp_file_template, @@ -484,7 +484,7 @@ int toku_brt_loader_open (/* out */ BRTLOADER *blp, generate_row_for_put_func g, DB *src_db, int N, DB*dbs[/*N*/], - const struct descriptor *descriptors[/*N*/], + const DESCRIPTOR descriptors[/*N*/], const char *new_fnames_in_env[/*N*/], brt_compare_func bt_compare_functions[/*N*/], const char *temp_file_template, @@ -2051,7 +2051,7 @@ static inline 
long int loader_random(void) { return r; } -static struct leaf_buf *start_leaf (struct dbout *out, const struct descriptor *desc, int64_t lblocknum) { +static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc), int64_t lblocknum) { invariant(lblocknum < out->n_translations_limit); struct leaf_buf *XMALLOC(lbuf); lbuf->blocknum = lblocknum; @@ -2063,10 +2063,6 @@ static struct leaf_buf *start_leaf (struct dbout *out, const struct descriptor * putbuf_int32(&lbuf->dbuf, layout_version); putbuf_int32(&lbuf->dbuf, layout_version); // layout_version original - putbuf_int32(&lbuf->dbuf, desc->version); // desc version - putbuf_int32(&lbuf->dbuf, desc->dbt.size); // desc size - putbuf_bytes(&lbuf->dbuf, desc->dbt.data, desc->dbt.size); - putbuf_int32(&lbuf->dbuf, nodesize); putbuf_int32(&lbuf->dbuf, flags); putbuf_int32(&lbuf->dbuf, height); @@ -2089,7 +2085,7 @@ static struct leaf_buf *start_leaf (struct dbout *out, const struct descriptor * CILK_BEGIN static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progress_allocation, BRTLOADER bl); -static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const struct descriptor *descriptor); +static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const DESCRIPTOR descriptor); CILK_END static void add_pair_to_leafnode (struct leaf_buf *lbuf, unsigned char *key, int keylen, unsigned char *val, int vallen); static int write_translation_table (struct dbout *out, long long *off_of_translation_p); @@ -2110,7 +2106,7 @@ static void drain_writer_q(QUEUE q) { CILK_BEGIN static int toku_loader_write_brt_from_q (BRTLOADER bl, - const struct descriptor *descriptor, + const DESCRIPTOR descriptor, int fd, // write to here int progress_allocation, QUEUE q, @@ -2359,7 +2355,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, CILK_END int toku_loader_write_brt_from_q_in_C (BRTLOADER bl, - const 
struct descriptor *descriptor, + const DESCRIPTOR descriptor, int fd, // write to here int progress_allocation, QUEUE q, @@ -2390,7 +2386,7 @@ static int loader_do_i (BRTLOADER bl, int which_db, DB *dest_db, brt_compare_func compare, - const struct descriptor *descriptor, + const DESCRIPTOR descriptor, const char *new_fname, int progress_allocation // how much progress do I need to add into bl->progress by the end.. ) @@ -2768,7 +2764,7 @@ static int write_header (struct dbout *out, long long translation_location_on_di struct brt_header h; memset(&h, 0, sizeof h); h.layout_version = BRT_LAYOUT_VERSION; h.checkpoint_count = 1; - h.checkpoint_lsn = load_lsn; // (max_uint_long means that this doesn't need any kind of recovery + h.checkpoint_lsn = load_lsn; h.nodesize = nodesize; h.root = root_blocknum_on_disk; h.flags = 0; @@ -2898,14 +2894,14 @@ CILK_BEGIN static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknum_of_new_node, int n_children, DBT *pivots, /* must free this array, as well as the things it points t */ - struct subtree_info *subtree_info, int height, const struct descriptor *desc) + struct subtree_info *subtree_info, int height, const DESCRIPTOR UU(desc)) { + //Nodes do not currently touch descriptors invariant(height>0); int result = 0; BRTNODE XMALLOC(node); - node->desc =(struct descriptor *)desc; node->nodesize = nodesize; node->thisnodename = make_blocknum(blocknum_of_new_node); node->layout_version = BRT_LAYOUT_VERSION; @@ -2991,7 +2987,7 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu brt_loader_set_panic(bl, result, TRUE); } -static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const struct descriptor *descriptor) { +static int write_nonleaves (BRTLOADER bl, FIDX pivots_fidx, struct dbout *out, struct subtrees_info *sts, const DESCRIPTOR descriptor) { int result = 0; int height = 1; @@ -3113,7 +3109,7 @@ CILK_END #if 0 // C function 
for testing write_file_to_dbfile -int brt_loader_write_file_to_dbfile (int outfile, FIDX infile, BRTLOADER bl, const struct descriptor *descriptor, int progress_allocation) { +int brt_loader_write_file_to_dbfile (int outfile, FIDX infile, BRTLOADER bl, const DESCRIPTOR descriptor, int progress_allocation) { #if defined(__cilkplusplus) return cilk::run(write_file_to_dbfile, outfile, infile, bl, descriptor, progress_allocation); #else diff --git a/newbrt/brtloader.h b/newbrt/brtloader.h index 8e44ee41f4e..a0f025d5847 100644 --- a/newbrt/brtloader.h +++ b/newbrt/brtloader.h @@ -24,7 +24,7 @@ int toku_brt_loader_open (BRTLOADER *bl, DB *src_db, int N, DB *dbs[/*N*/], - const struct descriptor *descriptors[/*N*/], + const DESCRIPTOR descriptors[/*N*/], const char * new_fnames_in_env[/*N*/], brt_compare_func bt_compare_functions[/*N*/], const char *temp_file_template, diff --git a/newbrt/brttypes.h b/newbrt/brttypes.h index b17e4842139..8b0b875d79b 100644 --- a/newbrt/brttypes.h +++ b/newbrt/brttypes.h @@ -22,11 +22,6 @@ typedef struct brt *BRT; struct brt_header; struct wbuf; -typedef struct descriptor { - u_int32_t version; - DBT dbt; -} *DESCRIPTOR, DESCRIPTOR_S; - typedef unsigned int ITEMLEN; typedef const void *bytevec; //typedef const void *bytevec; diff --git a/newbrt/cachetable.c b/newbrt/cachetable.c index 4ba6d9a7a61..465f7a660df 100644 --- a/newbrt/cachetable.c +++ b/newbrt/cachetable.c @@ -321,7 +321,7 @@ void toku_cachetable_release_reserved_memory(CACHETABLE ct, uint64_t reserved_me } void -toku_cachetable_set_env_dir(CACHETABLE ct, char *env_dir) { +toku_cachetable_set_env_dir(CACHETABLE ct, const char *env_dir) { assert(!ct->set_env_dir); toku_free(ct->env_dir); ct->env_dir = toku_xstrdup(env_dir); diff --git a/newbrt/cachetable.h b/newbrt/cachetable.h index 7e040e193c8..f08967f5260 100644 --- a/newbrt/cachetable.h +++ b/newbrt/cachetable.h @@ -330,7 +330,7 @@ void toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS s); LEAFLOCK_POOL 
toku_cachefile_leaflock_pool(CACHEFILE cf); -void toku_cachetable_set_env_dir(CACHETABLE ct, char *env_dir); +void toku_cachetable_set_env_dir(CACHETABLE ct, const char *env_dir); char * toku_construct_full_name(int count, ...); char * toku_cachetable_get_fname_in_cwd(CACHETABLE ct, const char * fname_in_env); diff --git a/newbrt/log.h b/newbrt/log.h index 346181cd053..b4933a28b39 100644 --- a/newbrt/log.h +++ b/newbrt/log.h @@ -38,6 +38,10 @@ static inline void toku_free_FILENUMS(FILENUMS val) { toku_free(val.filenums); } void toku_set_lsn_increment (uint64_t incr) __attribute__((__visibility__("default"))); +int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir); +uint64_t toku_log_upgrade_get_footprint(void); + + #if defined(__cplusplus) || defined(__cilkplusplus) }; #endif diff --git a/newbrt/log_upgrade.c b/newbrt/log_upgrade.c new file mode 100644 index 00000000000..0351ebe9782 --- /dev/null +++ b/newbrt/log_upgrade.c @@ -0,0 +1,426 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "$Id$" +#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved." +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." + +#include "includes.h" +#include "log_header.h" +#include "checkpoint.h" + +static uint64_t footprint = 0; // for debug and accountability +static uint64_t footprint_previous_upgrade = 0; // for debug and accountability + +uint64_t +toku_log_upgrade_get_footprint(void) { + return footprint + (100000 * footprint_previous_upgrade); +} + +#define FOOTPRINT(x) footprint=footprint_start+(x*footprint_increment) +#define FOOTPRINTSETUP(increment) uint64_t footprint_start=footprint; uint64_t footprint_increment=increment; + +// The lock file is used to detect a failed upgrade. 
It is created at the start +// of the upgrade procedure and deleted at the end of the upgrade procedure. If +// it exists at startup, then there was a crash during an upgrade, and the previous +// upgrade attempt must be undone. +static const char upgrade_lock_file_suffix[] = "/__tokudb_upgrade_dont_delete_me"; +static const char upgrade_commit_file_suffix[] = "/__tokudb_upgrade_commit_dont_delete_me"; + +//This will be the base information needed. +//Future 'upgrade in progress' files that need more information +//should store it AFTER the prefix checksum, and have its own checksum. +static const int upgrade_lock_prefix_size = 8 // magic ("tokuupgr") + +4 // version upgrading to + +4 // upgrading from version + +4 // size of suffix (data following prefix checksum) + +4; // prefix checksum + + +static int +verify_clean_shutdown_of_log_version_current(const char *log_dir, LSN * last_lsn) { + int rval = DB_RUNRECOVERY; + TOKULOGCURSOR logcursor = NULL; + int r; + FOOTPRINTSETUP(100); + + FOOTPRINT(1); + + r = toku_logcursor_create(&logcursor, log_dir); + assert(r == 0); + struct log_entry *le = NULL; + r = toku_logcursor_last(logcursor, &le); + if (r == 0) { + FOOTPRINT(2); + if (le->cmd==LT_shutdown) { + LSN lsn = le->u.shutdown.lsn; + if (last_lsn) + *last_lsn = lsn; + rval = 0; + } + } + r = toku_logcursor_destroy(&logcursor); + assert(r == 0); + return rval; +} + + +static int +verify_clean_shutdown_of_log_version_1(const char *log_dir, LSN * last_lsn) { + FOOTPRINTSETUP(100); + + FOOTPRINT(1); + //TODO: Remove this hack: + //Base this function on + // - (above)verify_clean_shutdown_of_log_version_current + // - (3.1)tokudb_needs_recovery + // - do breadth/depth first search to find out which functions have to be copied over from 3.1 + // - Put copied functions in .. 
backwards_log_1.[ch] + LSN lsn = {.lsn = 1LLU << 40}; + if (last_lsn) + *last_lsn = lsn; + log_dir = log_dir; + + return 0; +} + + +static int +verify_clean_shutdown_of_log_version(const char *log_dir, uint32_t version, LSN *last_lsn) { + // return 0 if clean shutdown, DB_RUNRECOVERY if not clean shutdown + // examine logfile at logfilenum and possibly logfilenum-1 + int r = 0; + FOOTPRINTSETUP(100); + + if (version == TOKU_LOG_VERSION_1) { + FOOTPRINT(1); + r = verify_clean_shutdown_of_log_version_1(log_dir, last_lsn); + } + else { + FOOTPRINT(2); + assert(version == TOKU_LOG_VERSION); + r = verify_clean_shutdown_of_log_version_current(log_dir, last_lsn); + } + return r; +} + + + + +//Cross the Rubicon (POINT OF NO RETURN) +static int +convert_logs_and_fsync(const char *log_dir, const char *env_dir, uint32_t from_version, uint32_t to_version) { + int r; + FOOTPRINTSETUP(100); + + r = verify_clean_shutdown_of_log_version(log_dir, to_version, NULL); + assert(r==0); + r = toku_delete_all_logs_of_version(log_dir, from_version); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + if (to_version==TOKU_LOG_VERSION_1) { + //Undo an upgrade from version 1. + //Delete rollback cachefile if it exists. + FOOTPRINT(1); + + int rollback_len = strlen(log_dir) + sizeof(ROLLBACK_CACHEFILE_NAME) +1; //1 for '/' + char rollback_fname[rollback_len]; + + { + int l = snprintf(rollback_fname, sizeof(rollback_fname), + "%s/%s", env_dir, ROLLBACK_CACHEFILE_NAME); + assert(l+1 == (signed)(sizeof(rollback_fname))); + } + r = unlink(rollback_fname); + assert(r==0 || errno==ENOENT); + if (r==0) { + r = toku_fsync_dir_by_name_without_accounting(env_dir); + assert(r==0); + } + } + return r; +} + +//After this function completes: +// If any log files exist they are all of the same version. +// There is no lock file. +// There is no commit file. 
+static int +cleanup_previous_upgrade_attempt(const char *env_dir, const char *log_dir, + const char *upgrade_lock_fname, + const char *upgrade_commit_fname) { + int r = 0; + int lock_fd; + int commit_fd; + unsigned char prefix[upgrade_lock_prefix_size]; + FOOTPRINTSETUP(1000); + + commit_fd = open(upgrade_commit_fname, O_RDONLY|O_BINARY, S_IRWXU); + if (commit_fd<0) { + assert(errno==ENOENT); + } + lock_fd = open(upgrade_lock_fname, O_RDONLY|O_BINARY, S_IRWXU); + if (lock_fd<0) { + assert(errno == ENOENT); + //Nothing to clean up (lock file does not exist). + } + else { //Lock file exists. Will commit or abort the upgrade. + FOOTPRINT(1); + int64_t n = pread(lock_fd, prefix, upgrade_lock_prefix_size, 0); + assert(n>=0 && n <= upgrade_lock_prefix_size); + struct rbuf rb; + rb.size = upgrade_lock_prefix_size; + rb.buf = prefix; + rb.ndone = 0; + if (n == upgrade_lock_prefix_size) { + FOOTPRINT(2); + //Check magic number + bytevec magic; + rbuf_literal_bytes(&rb, &magic, 8); + assert(memcmp(magic,"tokuupgr",8)==0); + uint32_t to_version = rbuf_network_int(&rb); + uint32_t from_version = rbuf_network_int(&rb); + uint32_t suffix_length = rbuf_int(&rb); + uint32_t stored_x1764 = rbuf_int(&rb); + uint32_t calculated_x1764 = x1764_memory(rb.buf, rb.size-4); + assert(calculated_x1764 == stored_x1764); + //Now that checksum matches, verify data. + + assert(to_version == TOKU_LOG_VERSION); //Only upgrading directly to newest log version. + assert(from_version < TOKU_LOG_VERSION); //Otherwise it isn't an upgrade. + assert(from_version >= TOKU_LOG_MIN_SUPPORTED_VERSION); //TODO: make this an error case once we have 3 log versions + assert(suffix_length == 0); //TODO: Future versions may change this. 
+ if (commit_fd>=0) { //Commit the upgrade + footprint_previous_upgrade = 1; + FOOTPRINT(3); + r = convert_logs_and_fsync(log_dir, env_dir, from_version, to_version); + assert(r==0); + } + else { //Abort the upgrade + footprint_previous_upgrade = 2; + FOOTPRINT(4); + r = convert_logs_and_fsync(log_dir, env_dir, to_version, from_version); + assert(r==0); + } + } + else { // We never finished writing lock file: commit file cannot exist yet. + // We are aborting the upgrade, but because the previous attempt never got past + // writing the lock file, nothing needs to be undone. + assert(commit_fd<0); + } + { //delete lock file + r = close(lock_fd); + assert(r==0); + r = unlink(upgrade_lock_fname); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + } + } + if (commit_fd>=0) { //delete commit file + r = close(commit_fd); + assert(r==0); + r = unlink(upgrade_commit_fname); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + } + return r; +} + + +static int +write_commit_file_and_fsync(const char *log_dir, const char * upgrade_commit_fname) { + int fd; + fd = open(upgrade_commit_fname, O_RDWR|O_BINARY|O_CREAT|O_EXCL, S_IRWXU); + assert(fd>=0); + + int r; + r = toku_file_fsync_without_accounting(fd); + assert(r==0); + r = close(fd); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + return r; +} + +static int +write_lock_file_and_fsync(const char *log_dir, const char * upgrade_lock_fname, uint32_t from_version) { + int fd; + fd = open(upgrade_lock_fname, O_RDWR|O_BINARY|O_CREAT|O_EXCL, S_IRWXU); + assert(fd>=0); + + char buf[upgrade_lock_prefix_size]; + struct wbuf wb; + const int suffix_size = 0; + wbuf_init(&wb, buf, upgrade_lock_prefix_size); + { //Serialize to wbuf + wbuf_literal_bytes(&wb, "tokuupgr", 8); //magic + wbuf_network_int(&wb, TOKU_LOG_VERSION); //to version + wbuf_network_int(&wb, from_version); //from version + wbuf_int(&wb, 
suffix_size); //Suffix Length + u_int32_t checksum = x1764_finish(&wb.checksum); + wbuf_int(&wb, checksum); //checksum + assert(wb.ndone == wb.size); + } + toku_os_full_pwrite(fd, wb.buf, wb.size, 0); + { + //Serialize suffix to wbuf and then disk (if exist) + //There is no suffix as of TOKU_LOG_VERSION_2 + } + int r; + r = toku_file_fsync_without_accounting(fd); + assert(r==0); + r = close(fd); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + return r; +} + +// from_version is version of lognumber_newest, which contains last_lsn +static int +upgrade_log(const char *env_dir, const char *log_dir, + const char * upgrade_lock_fname, const char * upgrade_commit_fname, + LSN last_lsn, + uint32_t from_version) { // the real deal + int r; + FOOTPRINTSETUP(1000); + + r = write_lock_file_and_fsync(log_dir, upgrade_lock_fname, from_version); + assert(r==0); + + LSN initial_lsn = last_lsn; + initial_lsn.lsn++; + CACHETABLE ct; + TOKULOGGER logger; + { //Create temporary environment + r = toku_create_cachetable(&ct, 1<<25, initial_lsn, NULL); + assert(r == 0); + toku_cachetable_set_env_dir(ct, env_dir); + r = toku_logger_create(&logger); + assert(r == 0); + toku_logger_write_log_files(logger, FALSE); //Prevent initial creation of log file + toku_logger_set_cachetable(logger, ct); + r = toku_logger_open(log_dir, logger); + assert(r==0); + r = toku_logger_restart(logger, initial_lsn); //Turn log writing on and create first log file with initial lsn + assert(r==0); + FOOTPRINT(1); + } + if (from_version == TOKU_LOG_VERSION_1) { + { //Create rollback cachefile + r = toku_logger_open_rollback(logger, ct, TRUE); + assert(r==0); + } + { //Checkpoint + r = toku_checkpoint(ct, logger, NULL, NULL, NULL, NULL); + assert(r == 0); + } + { //Close rollback cachefile + r = toku_logger_close_rollback(logger, FALSE); + assert(r==0); + } + FOOTPRINT(2); + } + { //Checkpoint + r = toku_checkpoint(ct, logger, NULL, NULL, NULL, NULL); //fsyncs log dir + 
assert(r == 0); + FOOTPRINT(3); + } + { //Close cachetable and logger + r = toku_logger_shutdown(logger); + assert(r==0); + r = toku_cachetable_close(&ct); + assert(r==0); + r = toku_logger_close(&logger); + assert(r==0); + FOOTPRINT(4); + } + { //Write commit file + r = write_commit_file_and_fsync(log_dir, upgrade_commit_fname); + assert(r==0); + } + { // Cross the Rubicon here: + // Delete all old logs: POINT OF NO RETURN + r = convert_logs_and_fsync(log_dir, env_dir, from_version, TOKU_LOG_VERSION); + assert(r==0); + FOOTPRINT(5); + } + { //Delete upgrade lock file and ensure directory is fsynced + r = unlink(upgrade_lock_fname); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + } + { //Delete upgrade commit file and ensure directory is fsynced + r = unlink(upgrade_commit_fname); + assert(r==0); + r = toku_fsync_dir_by_name_without_accounting(log_dir); + assert(r==0); + } + FOOTPRINT(6); + return 0; +} + +int +toku_maybe_upgrade_log(const char *env_dir, const char *log_dir) { + int r; + int lockfd = -1; + FOOTPRINTSETUP(10000); + + r = toku_recover_lock(log_dir, &lockfd); + if (r == 0) { + assert(log_dir); + assert(env_dir); + char upgrade_lock_fname[strlen(log_dir) + sizeof(upgrade_lock_file_suffix)]; + { //Generate full fname + int l = snprintf(upgrade_lock_fname, sizeof(upgrade_lock_fname), + "%s%s", log_dir, upgrade_lock_file_suffix); + assert(l+1 == (ssize_t)(sizeof(upgrade_lock_fname))); + } + char upgrade_commit_fname[strlen(log_dir) + sizeof(upgrade_commit_file_suffix)]; + { //Generate full fname + int l = snprintf(upgrade_commit_fname, sizeof(upgrade_commit_fname), + "%s%s", log_dir, upgrade_commit_file_suffix); + assert(l+1 == (ssize_t)(sizeof(upgrade_commit_fname))); + } + + r = cleanup_previous_upgrade_attempt(env_dir, log_dir, + upgrade_lock_fname, upgrade_commit_fname); + if (r==0) { + uint32_t version_of_logs_on_disk; + BOOL found_any_logs; + r = toku_get_version_of_logs_on_disk(log_dir, &found_any_logs, 
&version_of_logs_on_disk); + if (r==0) { + if (!found_any_logs) + r = 0; //No logs means no logs to upgrade. + else if (version_of_logs_on_disk > TOKU_LOG_VERSION) + r = TOKUDB_DICTIONARY_TOO_NEW; + else if (version_of_logs_on_disk < TOKU_LOG_MIN_SUPPORTED_VERSION) + r = TOKUDB_DICTIONARY_TOO_OLD; + else if (version_of_logs_on_disk == TOKU_LOG_VERSION) + r = 0; //Logs are up to date + else { + FOOTPRINT(1); + LSN last_lsn; + r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn); + if (r==0) { + FOOTPRINT(2); + r = upgrade_log(env_dir, log_dir, + upgrade_lock_fname, upgrade_commit_fname, + last_lsn, version_of_logs_on_disk); + } + } + } + } + { + //Clean up + int rc; + rc = toku_recover_unlock(lockfd); + if (r==0) r = rc; + } + } + return r; +} + diff --git a/newbrt/logfilemgr.c b/newbrt/logfilemgr.c index 99f5c421ab9..eb1e671b21d 100644 --- a/newbrt/logfilemgr.c +++ b/newbrt/logfilemgr.c @@ -80,8 +80,10 @@ int toku_logfilemgr_init(TOKULOGFILEMGR lfm, const char *log_dir) { } // find the index basename = strrchr(logfiles[i], '/') + 1; - r = sscanf(basename, "log%lld.tokulog", &index); - assert(r==1); // found index + int version; + r = sscanf(basename, "log%lld.tokulog%d", &index, &version); + assert(r==2); // found index and version + assert(version==TOKU_LOG_VERSION); lf_info->index = index; // find last LSN r = toku_logcursor_create_for_file(&cursor, log_dir, basename); diff --git a/newbrt/logformat.c b/newbrt/logformat.c index 9c920f49bc5..1c1f2dd1ab9 100644 --- a/newbrt/logformat.c +++ b/newbrt/logformat.c @@ -135,6 +135,11 @@ const struct logtype logtypes[] = { {"u_int32_t", "treeflags", 0}, NULLFIELD}}, //TODO: #2037 Add dname + {"fdescriptor", 'd', FA{{"FILENUM", "filenum", 0}, + {"u_int32_t", "descriptor_version", 0}, + {"BYTESTRING", "descriptor", 0}, + NULLFIELD}}, + //TODO: #2037 Add dname {"fclose", 'e', FA{{"BYTESTRING", "iname", 0}, {"FILENUM", "filenum", 0}, NULLFIELD}}, @@ -176,6 +181,8 @@ const struct logtype 
logtypes[] = { {"comment", 'T', FA{{"u_int64_t", "timestamp", 0}, {"BYTESTRING", "comment", 0}, NULLFIELD}}, + {"shutdown", 'Q', FA{{"u_int64_t", "timestamp", 0}, + NULLFIELD}}, {"load", 'l', FA{{"TXNID", "xid", 0}, {"BYTESTRING", "old_iname", 0}, {"BYTESTRING", "new_iname", 0}, diff --git a/newbrt/logger.c b/newbrt/logger.c index fcd62a322e0..637485cd8ae 100644 --- a/newbrt/logger.c +++ b/newbrt/logger.c @@ -13,14 +13,40 @@ static int delete_logfile(TOKULOGGER logger, long long index); static void grab_output(TOKULOGGER logger, LSN *fsynced_lsn); static void release_output(TOKULOGGER logger, LSN fsynced_lsn); +static BOOL is_a_logfile_any_version (const char *name, uint64_t *number_result, uint32_t *version_of_log) { + BOOL rval = TRUE; + uint64_t result; + int n; + int r; + uint32_t version; + r = sscanf(name, "log%"SCNu64".tokulog%"SCNu32"%n", &result, &version, &n); + if (r!=2 || name[n]!='\0' || version <= TOKU_LOG_VERSION_1) { + //Version 1 does NOT append 'version' to end of '.tokulog' + version = TOKU_LOG_VERSION_1; + r = sscanf(name, "log%"SCNu64".tokulog%n", &result, &n); + if (r!=1 || name[n]!='\0') { + rval = FALSE; + } + } + if (rval) { + *number_result = result; + *version_of_log = version; + } + + return rval; +} + // added for #2424, improved for #2521 static BOOL is_a_logfile (const char *name, long long *number_result) { - unsigned long long result; - int n; - int r = sscanf(name, "log%llu.tokulog%n", &result, &n); - if (r!=1 || name[n]!=0) return FALSE; - *number_result = result; - return TRUE; + BOOL rval; + uint64_t result; + uint32_t version; + rval = is_a_logfile_any_version(name, &result, &version); + if (rval && version != TOKU_LOG_VERSION) + rval = FALSE; + if (rval) + *number_result = result; + return rval; } @@ -234,8 +260,8 @@ int toku_logger_shutdown(TOKULOGGER logger) { int r = 0; if (logger->is_open) { if (toku_omt_size(logger->live_txns) == 0) { - BYTESTRING comment = { strlen("shutdown"), "shutdown" }; - int r2 = 
toku_log_comment(logger, NULL, TRUE, 0, comment); + time_t tnow = time(NULL); + int r2 = toku_log_shutdown(logger, NULL, TRUE, tnow); if (!r) r = r2; } } @@ -575,7 +601,7 @@ static int open_logfile (TOKULOGGER logger) { int fnamelen = strlen(logger->directory)+50; char fname[fnamelen]; - snprintf(fname, fnamelen, "%s/log%012lld.tokulog", logger->directory, logger->next_log_file_number); + snprintf(fname, fnamelen, "%s/log%012lld.tokulog%d", logger->directory, logger->next_log_file_number, TOKU_LOG_VERSION); long long index = logger->next_log_file_number; if (logger->write_log_files) { logger->fd = open(fname, O_CREAT+O_WRONLY+O_TRUNC+O_EXCL+O_BINARY, S_IRWXU); @@ -608,7 +634,7 @@ static int delete_logfile(TOKULOGGER logger, long long index) { int fnamelen = strlen(logger->directory)+50; char fname[fnamelen]; - snprintf(fname, fnamelen, "%s/log%012lld.tokulog", logger->directory, index); + snprintf(fname, fnamelen, "%s/log%012lld.tokulog%d", logger->directory, index, TOKU_LOG_VERSION); int r = remove(fname); return r; } @@ -786,6 +812,14 @@ int toku_logger_log_fdelete (TOKUTXN txn, const char *fname) { return r; } +// fname is the iname +int toku_logger_log_descriptor (TOKUTXN txn, FILENUM filenum, DESCRIPTOR descriptor_p) { + if (txn==0) return 0; + if (txn->logger->is_panicked) return EINVAL; + BYTESTRING bs_descriptor = { .len=descriptor_p->dbt.size, .data = descriptor_p->dbt.data }; + int r = toku_log_fdescriptor (txn->logger, (LSN*)0, 1, filenum, descriptor_p->version, bs_descriptor); + return r; +} @@ -1258,3 +1292,82 @@ toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s) { s->swap_ctr = 0; } } + +//Used for upgrade +int +toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint32_t *version_found) { + BOOL found = FALSE; + uint32_t single_version = 0; + int r = 0; + + struct dirent *de; + DIR *d=opendir(log_dir); + if (d==NULL) { + r = errno; + } + else { + // Examine every file in the directory and assert that all log files are 
of the same version (single_version). + while ((de=readdir(d))) { + uint32_t this_log_version; + uint64_t this_log_number; + BOOL is_log = is_a_logfile_any_version(de->d_name, &this_log_number, &this_log_version); + if (is_log) { + if (found) + assert(single_version == this_log_version); + found = TRUE; + single_version = this_log_version; + } + } + } + { + int r2 = closedir(d); + if (r==0) r = r2; + } + if (r==0) { + *found_any_logs = found; + if (found) + *version_found = single_version; + } + return r; +} + +//Used for upgrade +int +toku_delete_all_logs_of_version(const char *log_dir, uint32_t version_to_delete) { + int r = 0; + + struct dirent *de; + DIR *d=opendir(log_dir); + if (d==NULL) { + r = errno; + } + else { + // Examine every file in the directory and if it is a log of the given version, delete it + while ((de=readdir(d))) { + uint32_t this_log_version; + uint64_t this_log_number; + BOOL is_log = is_a_logfile_any_version(de->d_name, &this_log_number, &this_log_version); + if (is_log && this_log_version == version_to_delete) { + char log_full_name[strlen(log_dir) + strlen(de->d_name) + 2]; //'\0' and '/' + { //Generate full fname + int l = snprintf(log_full_name, sizeof(log_full_name), + "%s/%s", log_dir, de->d_name); + assert(l+1 == (ssize_t)(sizeof(log_full_name))); + } + + r = unlink(log_full_name); + if (r!=0) { + r = errno; + assert(r); + break; + } + } + } + } + { + int r2 = closedir(d); + if (r==0) r = r2; + } + return r; +} + diff --git a/newbrt/logger.h b/newbrt/logger.h index a6bb072b40e..0c3ad1213c7 100644 --- a/newbrt/logger.h +++ b/newbrt/logger.h @@ -14,6 +14,7 @@ enum { TOKU_LOG_VERSION_2 = 2, TOKU_LOG_NEXT_VERSION, // the version after the current version TOKU_LOG_VERSION = TOKU_LOG_NEXT_VERSION-1, // A hack so I don't have to change this line. 
+ TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2 }; #define ROLLBACK_CACHEFILE_NAME "tokudb.rollback" @@ -56,6 +57,7 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN oldest_open_lsn); int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, FILENUM filenum, u_int32_t mode, u_int32_t flags, DESCRIPTOR descriptor_p); int toku_logger_log_fdelete (TOKUTXN txn, const char *fname); int toku_logger_log_fopen (TOKUTXN txn, const char * fname, FILENUM filenum, uint32_t treeflags); +int toku_logger_log_descriptor (TOKUTXN txn, FILENUM filenum, DESCRIPTOR descriptor_p); int toku_fread_u_int8_t (FILE *f, u_int8_t *v, struct x1764 *mm, u_int32_t *len); int toku_fread_u_int32_t_nocrclen (FILE *f, u_int32_t *v); @@ -166,6 +168,9 @@ typedef struct logger_status { void toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s); +int toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint32_t *version_found); +int toku_delete_all_logs_of_version(const char *log_dir, uint32_t version_to_delete); + #if defined(__cplusplus) || defined(__cilkplusplus) }; #endif diff --git a/newbrt/recover.c b/newbrt/recover.c index ff556b6ff48..36dc257077a 100644 --- a/newbrt/recover.c +++ b/newbrt/recover.c @@ -7,11 +7,13 @@ #include "log_header.h" #include "checkpoint.h" +static const char recovery_lock_file[] = "/__tokudb_recoverylock_dont_delete_me"; + int tokudb_recovery_trace = 0; // turn on recovery tracing, default off. 
//#define DO_VERIFY_COUNTS #ifdef DO_VERIFY_COUNTS -#define VERIFY_COUNTS(n) toku_verify_counts(n) +#define VERIFY_COUNTS(n) toku_verify_or_set_counts(n, FALSE) #else #define VERIFY_COUNTS(n) ((void)0) #endif @@ -235,14 +237,6 @@ static void recover_yield(voidfp f, void *fpthunk, void *UU(yieldthunk)) { if (f) f(fpthunk); } -static int -abort_on_upgrade(DB* UU(pdb), - u_int32_t UU(old_version), const DBT *UU(old_descriptor), const DBT *UU(old_key), const DBT *UU(old_val), - u_int32_t UU(new_version), const DBT *UU(new_descriptor), const DBT *UU(new_key), const DBT *UU(new_val)) { - assert(FALSE); //Must not upgrade. - return ENOSYS; -} - // Open the file if it is not already open. If it is already open, then do nothing. static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create, int mode, BYTESTRING *bs_iname, FILENUM filenum, u_int32_t treeflags, u_int32_t descriptor_version, BYTESTRING* descriptor, TOKUTXN txn) { @@ -269,7 +263,7 @@ static int internal_recover_fopen_or_fcreate (RECOVER_ENV renv, BOOL must_create if (descriptor_version > 0) { DBT descriptor_dbt; toku_fill_dbt(&descriptor_dbt, descriptor->data, descriptor->len); - r = toku_brt_set_descriptor(brt, descriptor_version, &descriptor_dbt, abort_on_upgrade); + r = toku_brt_set_descriptor(brt, descriptor_version, &descriptor_dbt); if (r!=0) goto close_brt; } r = toku_brt_open_recovery(brt, iname, must_create, must_create, renv->ct, txn, fake_db, filenum); @@ -674,6 +668,32 @@ static int toku_recover_backward_fopen (struct logtype_fopen *UU(l), RECOVER_ENV return 0; } +static int toku_recover_fdescriptor (struct logtype_fdescriptor *l, RECOVER_ENV renv) { + int r; + struct file_map_tuple *tuple = NULL; + r = file_map_find(&renv->fmap, l->filenum, &tuple); + if (r==0) { + //Maybe do the descriptor (lsn filter) + LSN treelsn = toku_brt_checkpoint_lsn(tuple->brt); + if (l->lsn.lsn > treelsn.lsn) { + //Upgrade descriptor. 
+ assert(tuple->brt->h->descriptor.version < l->descriptor_version); //Must be doing an upgrade. + DESCRIPTOR_S d; + d.version = l->descriptor_version; + toku_fill_dbt(&d.dbt, toku_xmemdup(l->descriptor.data, l->descriptor.len), l->descriptor.len); + r = toku_maybe_upgrade_descriptor(tuple->brt, &d, FALSE, NULL); + assert(r==0); + } + } + return 0; +} + +static int toku_recover_backward_fdescriptor (struct logtype_fdescriptor *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + + // if file referred to in l is open, close it static int toku_recover_fclose (struct logtype_fclose *l, RECOVER_ENV renv) { struct file_map_tuple *tuple = NULL; @@ -949,6 +969,16 @@ static int toku_recover_backward_comment (struct logtype_comment *UU(l), RECOVER return 0; } +static int toku_recover_shutdown (struct logtype_shutdown *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + +static int toku_recover_backward_shutdown (struct logtype_shutdown *UU(l), RECOVER_ENV UU(renv)) { + // nothing + return 0; +} + static int toku_recover_load(struct logtype_load *UU(l), RECOVER_ENV UU(renv)) { int r; TOKUTXN txn = NULL; @@ -992,7 +1022,7 @@ int tokudb_needs_recovery(const char *log_dir, BOOL ignore_log_empty) { if (r != 0) { needs_recovery = TRUE; goto exit; } - if (le->cmd == LT_comment) { + if (le->cmd==LT_shutdown || le->cmd==LT_comment) { r = toku_logcursor_prev(logcursor, &le); if (r != 0) { needs_recovery = TRUE; goto exit; @@ -1261,15 +1291,14 @@ static int do_recovery(RECOVER_ENV renv, const char *env_dir, const char *log_di return rr; } -static int recover_lock(const char *lock_dir, int *lockfd) { +int +toku_recover_lock(const char *lock_dir, int *lockfd) { if (!lock_dir) return ENOENT; - - const char fname[] = "/__tokudb_recoverylock_dont_delete_me"; int namelen=strlen(lock_dir); - char lockfname[namelen+sizeof(fname)]; + char lockfname[namelen+sizeof(recovery_lock_file)]; - int l = snprintf(lockfname, sizeof(lockfname), "%s%s", lock_dir, fname); + int l = 
snprintf(lockfname, sizeof(lockfname), "%s%s", lock_dir, recovery_lock_file); assert(l+1 == (signed)(sizeof(lockfname))); *lockfd = toku_os_lock_file(lockfname); if (*lockfd < 0) { @@ -1280,13 +1309,16 @@ static int recover_lock(const char *lock_dir, int *lockfd) { return 0; } -static int recover_unlock(int lockfd) { +int +toku_recover_unlock(int lockfd) { int r = toku_os_unlock_file(lockfd); if (r != 0) return errno; return 0; } + + int tokudb_recover(const char *env_dir, const char *log_dir, brt_compare_func bt_compare, brt_compare_func dup_compare, @@ -1296,7 +1328,7 @@ int tokudb_recover(const char *env_dir, const char *log_dir, int r; int lockfd = -1; - r = recover_lock(log_dir, &lockfd); + r = toku_recover_lock(log_dir, &lockfd); if (r != 0) return r; @@ -1314,7 +1346,7 @@ int tokudb_recover(const char *env_dir, const char *log_dir, recover_env_cleanup(&renv, (BOOL)(rr == 0)); } - r = recover_unlock(lockfd); + r = toku_recover_unlock(lockfd); if (r != 0) return r; diff --git a/newbrt/recover.h b/newbrt/recover.h index 6d11aa577ee..9e248819137 100644 --- a/newbrt/recover.h +++ b/newbrt/recover.h @@ -41,6 +41,11 @@ void toku_recover_set_callback2 (void (*)(void*), void*); extern int tokudb_recovery_trace; +int toku_recover_lock (const char *lock_dir, int *lockfd); + +int toku_recover_unlock(int lockfd); + + #if defined(__cplusplus) || defined(__cilkplusplus) }; #endif diff --git a/newbrt/tests/brt-serialize-test.c b/newbrt/tests/brt-serialize-test.c index b414b420583..5cfb2e69ad0 100644 --- a/newbrt/tests/brt-serialize-test.c +++ b/newbrt/tests/brt-serialize-test.c @@ -100,7 +100,6 @@ test_serialize_nonleaf(void) { assert(size == 100); } - sn.desc = &brt->h->descriptor; r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); assert(r==0); diff --git a/newbrt/tests/brtloader-test-bad-generate.c b/newbrt/tests/brtloader-test-bad-generate.c index 2780435f4b3..0a9f6eee84e 100644 --- a/newbrt/tests/brtloader-test-bad-generate.c +++ 
b/newbrt/tests/brtloader-test-bad-generate.c @@ -76,7 +76,7 @@ static void test_extractor(int nrows, int nrowsets, BOOL expect_fail) { // open the brtloader. this runs the extractor. const int N = 1; DB *dbs[N]; - const struct descriptor *descriptors[N]; + DESCRIPTOR descriptors[N]; const char *fnames[N]; brt_compare_func compares[N]; for (int i = 0; i < N; i++) { diff --git a/newbrt/tests/brtloader-test-extractor-errors.c b/newbrt/tests/brtloader-test-extractor-errors.c index 6600e145d9e..39726c1d90a 100644 --- a/newbrt/tests/brtloader-test-extractor-errors.c +++ b/newbrt/tests/brtloader-test-extractor-errors.c @@ -88,7 +88,7 @@ static void test_extractor(int nrows, int nrowsets, BOOL expect_fail, const char // open the brtloader. this runs the extractor. const int N = 1; DB *dbs[N]; - const struct descriptor *descriptors[N]; + DESCRIPTOR descriptors[N]; const char *fnames[N]; brt_compare_func compares[N]; for (int i = 0; i < N; i++) { diff --git a/newbrt/tests/brtloader-test-extractor.c b/newbrt/tests/brtloader-test-extractor.c index 6ff55a75f49..a08fe8f6805 100644 --- a/newbrt/tests/brtloader-test-extractor.c +++ b/newbrt/tests/brtloader-test-extractor.c @@ -295,7 +295,7 @@ static void test_extractor(int nrows, int nrowsets, const char *testdir) { // open the brtloader. this runs the extractor. 
const int N = 1; DB *dbs[N]; - const struct descriptor *descriptors[N]; + DESCRIPTOR descriptors[N]; const char *fnames[N]; brt_compare_func compares[N]; for (int i = 0; i < N; i++) { diff --git a/newbrt/tests/brtloader-test-merge-files-dbufio.c b/newbrt/tests/brtloader-test-merge-files-dbufio.c index 41148aade9a..758dc9e8a3e 100644 --- a/newbrt/tests/brtloader-test-merge-files-dbufio.c +++ b/newbrt/tests/brtloader-test-merge-files-dbufio.c @@ -292,7 +292,7 @@ static void test (const char *directory, BOOL is_error) { BRTLOADER bl; DB **XMALLOC_N(N_DEST_DBS, dbs); - const struct descriptor **XMALLOC_N(N_DEST_DBS, descriptors); + DESCRIPTOR *XMALLOC_N(N_DEST_DBS, descriptors); const char **XMALLOC_N(N_DEST_DBS, new_fnames_in_env); for (int i=0; i=0); diff --git a/newbrt/tests/brtloader-test-writer.c b/newbrt/tests/brtloader-test-writer.c index 5064d427db2..36e4023b869 100644 --- a/newbrt/tests/brtloader-test-writer.c +++ b/newbrt/tests/brtloader-test-writer.c @@ -152,7 +152,7 @@ static void test_write_dbfile (char *template, int n, char *output_name) { r = queue_destroy(q); assert(r==0); - struct descriptor desc = {.version = 1, .dbt = (DBT){.size = 4, .data="abcd"}}; + DESCRIPTOR_S desc = {.version = 1, .dbt = (DBT){.size = 4, .data="abcd"}}; int fd = open(output_name, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd>=0); diff --git a/newbrt/tests/brtloader-test.c b/newbrt/tests/brtloader-test.c index 8dc61be7d97..e7a5cbbdf55 100644 --- a/newbrt/tests/brtloader-test.c +++ b/newbrt/tests/brtloader-test.c @@ -316,7 +316,7 @@ static void verify_dbfile(int n, int sorted_keys[], const char *sorted_vals[], c assert(fs.n_temp_files==0); - struct descriptor desc = {.version = 1, .dbt = (DBT){.size = 4, .data="abcd"}}; + DESCRIPTOR_S desc = {.version = 1, .dbt = (DBT){.size = 4, .data="abcd"}}; int fd = open(output_name, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd>=0); diff --git a/newbrt/tests/log-test.c b/newbrt/tests/log-test.c index 
8b50c97ed33..e31c45cba5c 100644 --- a/newbrt/tests/log-test.c +++ b/newbrt/tests/log-test.c @@ -13,6 +13,7 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { int r; long long lognum; + char logname[PATH_MAX]; r = system(rmrf); CKERR(r); r = toku_os_mkdir(dname, S_IRWXU); assert(r==0); @@ -20,18 +21,21 @@ test_main (int argc __attribute__((__unused__)), assert(r==0 && lognum==0LL); mode_t mode = S_IRWXU + S_IRWXG + S_IRWXO; - r = open(dname "/log01.tokulog", O_WRONLY + O_CREAT + O_BINARY, mode); assert(r>=0); + sprintf(logname, dname "/log01.tokulog%d", TOKU_LOG_VERSION); + r = open(logname, O_WRONLY + O_CREAT + O_BINARY, mode); assert(r>=0); r = close(r); assert(r==0); r = toku_logger_find_next_unused_log_file(dname,&lognum); assert(r==0 && lognum==2LL); - r = open(dname "/log123456789012345.tokulog", O_WRONLY + O_CREAT + O_BINARY, mode); assert(r>=0); + sprintf(logname, dname "/log123456789012345.tokulog%d", TOKU_LOG_VERSION); + r = open(logname, O_WRONLY + O_CREAT + O_BINARY, mode); assert(r>=0); r = close(r); assert(r==0); r = toku_logger_find_next_unused_log_file(dname,&lognum); assert(r==0 && lognum==123456789012346LL); - r = open(dname "/log3.tokulog", O_WRONLY + O_CREAT + O_BINARY, mode); assert(r>=0); + sprintf(logname, dname "/log3.tokulog%d", TOKU_LOG_VERSION); + r = open(logname, O_WRONLY + O_CREAT + O_BINARY, mode); assert(r>=0); r = close(r); assert(r==0); r = toku_logger_find_next_unused_log_file(dname,&lognum); assert(r==0 && lognum==123456789012346LL); diff --git a/newbrt/tests/log-test4.c b/newbrt/tests/log-test4.c index f9ffa50d82f..00c87fed492 100644 --- a/newbrt/tests/log-test4.c +++ b/newbrt/tests/log-test4.c @@ -14,6 +14,7 @@ int test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { int r; + char logname[PATH_MAX]; r = system(rmrf); CKERR(r); r = toku_os_mkdir(dname, S_IRWXU); assert(r==0); @@ -34,7 +35,8 @@ test_main (int argc 
__attribute__((__unused__)), r = toku_logger_close(&logger); assert(r == 0); { toku_struct_stat statbuf; - r = toku_stat(dname "/log000000000000.tokulog", &statbuf); + sprintf(logname, dname "/log000000000000.tokulog%d", TOKU_LOG_VERSION); + r = toku_stat(logname, &statbuf); assert(r==0); assert(statbuf.st_size==12+5); } diff --git a/newbrt/tests/log-test6.c b/newbrt/tests/log-test6.c index 5189df9805c..30343992fb6 100644 --- a/newbrt/tests/log-test6.c +++ b/newbrt/tests/log-test6.c @@ -56,8 +56,10 @@ test_main (int argc __attribute__((__unused__)), assert(r == 0); { + char logname[PATH_MAX]; toku_struct_stat statbuf; - r = toku_stat(dname "/log000000000000.tokulog", &statbuf); + sprintf(logname, dname "/log000000000000.tokulog%d", TOKU_LOG_VERSION); + r = toku_stat(logname, &statbuf); assert(r==0); assert(statbuf.st_size<=LSIZE); } diff --git a/newbrt/tests/logcursor-bad-checksum.c b/newbrt/tests/logcursor-bad-checksum.c index 1750bb544ba..ba05c10e271 100644 --- a/newbrt/tests/logcursor-bad-checksum.c +++ b/newbrt/tests/logcursor-bad-checksum.c @@ -13,8 +13,10 @@ static void corrupt_the_checksum(void) { // change the LSN in the first log entry of log 0. this will cause an checksum error. 
+ char logname[PATH_MAX]; int r; - FILE *f = fopen(dname "/" "log000000000000.tokulog", "r+b"); assert(f); + sprintf(logname, dname "/" "log000000000000.tokulog%d", TOKU_LOG_VERSION); + FILE *f = fopen(logname, "r+b"); assert(f); r = fseek(f, 025, SEEK_SET); assert(r == 0); char c = 100; size_t n = fwrite(&c, sizeof c, 1, f); assert(n == sizeof c); diff --git a/newbrt/tests/logcursor-empty-logfile-3.c b/newbrt/tests/logcursor-empty-logfile-3.c index 992c544da9c..93461cdfbe3 100644 --- a/newbrt/tests/logcursor-empty-logfile-3.c +++ b/newbrt/tests/logcursor-empty-logfile-3.c @@ -59,7 +59,7 @@ test_main (int argc, const char *argv[]) { r = toku_logger_find_next_unused_log_file(dname, &nexti); assert(r == 0); char mt_fname[128]; - snprintf(mt_fname, 128, "%s/log%012lld.tokulog", dname, nexti); + snprintf(mt_fname, 128, "%s/log%012lld.tokulog%d", dname, nexti, TOKU_LOG_VERSION); int mt_fd = open(mt_fname, O_CREAT+O_WRONLY+O_TRUNC+O_EXCL+O_BINARY, S_IRWXU); assert(mt_fd != -1); r = close(mt_fd); @@ -89,7 +89,7 @@ test_main (int argc, const char *argv[]) { r = toku_logger_find_next_unused_log_file(dname, &nexti); assert(r == 0); char mt_fname[128]; - snprintf(mt_fname, 128, "%s/log%012lld.tokulog", dname, nexti); + snprintf(mt_fname, 128, "%s/log%012lld.tokulog%d", dname, nexti, TOKU_LOG_VERSION); int mt_fd = open(mt_fname, O_CREAT+O_WRONLY+O_TRUNC+O_EXCL+O_BINARY, S_IRWXU); assert(mt_fd != -1); r = close(mt_fd); diff --git a/newbrt/tests/recovery-bad-last-entry.c b/newbrt/tests/recovery-bad-last-entry.c index f04c7dd6bfb..bbed20f55fd 100644 --- a/newbrt/tests/recovery-bad-last-entry.c +++ b/newbrt/tests/recovery-bad-last-entry.c @@ -46,7 +46,7 @@ run_test(void) { r = close(devnul); assert(r==0); char fname[256]; - sprintf(fname, "%s/%s", TESTDIR, "log000000000000.tokulog"); + sprintf(fname, "%s/%s%d", TESTDIR, "log000000000000.tokulog", TOKU_LOG_VERSION); r = toku_stat(fname, &st); assert(r==0); if ( st.st_size - trim > magic_begin_end_checkpoint_sz ) { diff --git 
a/newbrt/tests/recovery-lsn-error-during-forward-scan.c b/newbrt/tests/recovery-lsn-error-during-forward-scan.c index 568c070301d..d4d2ede6e9d 100644 --- a/newbrt/tests/recovery-lsn-error-during-forward-scan.c +++ b/newbrt/tests/recovery-lsn-error-during-forward-scan.c @@ -8,7 +8,9 @@ static void recover_callback_at_turnaround(void *UU(arg)) { // change the LSN in the first log entry of log 2. this will cause an LSN error during the forward scan. int r; - FILE *f = fopen("log000000000002.tokulog", "r+b"); assert(f); + char logname[PATH_MAX]; + sprintf(logname, "log000000000002.tokulog%d", TOKU_LOG_VERSION); + FILE *f = fopen(logname, "r+b"); assert(f); r = fseek(f, 025, SEEK_SET); assert(r == 0); char c = 100; size_t n = fwrite(&c, sizeof c, 1, f); assert(n == sizeof c); diff --git a/newbrt/tests/test-leafentry10.c b/newbrt/tests/test-leafentry10.c deleted file mode 100644 index 87516c0d9c9..00000000000 --- a/newbrt/tests/test-leafentry10.c +++ /dev/null @@ -1,121 +0,0 @@ -#include -#include - -#include "test.h" -#include "brttypes.h" -#include "includes.h" -#include "backwards_10.h" - -static char -int32_get_char(u_int32_t i, int which) { - char *c = (char*)&i; - return c[which]; -} - -#define UINT32TOCHAR(i) int32_get_char(i, 0), int32_get_char(i, 1), int32_get_char(i, 2), int32_get_char(i, 3) -#define UINT64TOCHAR(i) UINT32TOCHAR(i>>32), UINT32TOCHAR(i&0xffffffff) - - -static void test_leafentry_1 (void) { - LEAFENTRY l; - int r; - u_int32_t msize, dsize; - r = le10_committed(4, "abc", 3, "xy", &msize, &dsize, &l); - assert(r==0); - char expect[] = {LE_COMMITTED, - UINT32TOCHAR(4), - 'a', 'b', 'c', 0, - UINT32TOCHAR(3), - 'x', 'y', 0}; - assert(sizeof(expect)==msize); - assert(msize==dsize); - assert(memcmp(l, expect, msize)==0); - toku_free(l); -} - -static void test_leafentry_2 (void) { - LEAFENTRY l; - int r; - u_int32_t msize, dsize; - r = le10_both(0x0123456789abcdef0LL, 3, "ab", 4, "xyz", 5, "lmno", &msize, &dsize, &l); - assert(r==0); - char expect[] = 
{LE_BOTH, - UINT64TOCHAR(0x0123456789abcdef0LL), - UINT32TOCHAR(3), 'a', 'b', 0, - UINT32TOCHAR(4), 'x', 'y', 'z', 0, - UINT32TOCHAR(5), 'l', 'm', 'n', 'o', 0}; - assert(sizeof(expect)==msize); - assert(msize==dsize); - assert(memcmp(l, expect, msize)==0); - toku_free(l); -} - -static void test_leafentry_3 (void) { - LEAFENTRY l; - int r; - u_int32_t msize, dsize; - r = le10_provdel(0x0123456789abcdef0LL, 3, "ab", 5, "lmno", &msize, &dsize, &l); - assert(r==0); - char expect[] = {LE_PROVDEL, - UINT64TOCHAR(0x0123456789abcdef0LL), - UINT32TOCHAR(3), 'a', 'b', 0, - UINT32TOCHAR(5), 'l', 'm', 'n', 'o', 0}; - assert(sizeof(expect)==msize); - assert(msize==dsize); - assert(memcmp(l, expect, msize)==0); - toku_free(l); -} - -static void test_leafentry_4 (void) { - LEAFENTRY l; - int r; - u_int32_t msize, dsize; - r = le10_provpair(0x0123456789abcdef0LL, 3, "ab", 5, "lmno", &msize, &dsize, &l); - assert(r==0); - char expect[] = {LE_PROVPAIR, - UINT64TOCHAR(0x0123456789abcdef0LL), - UINT32TOCHAR(3), 'a', 'b', 0, - UINT32TOCHAR(5), 'l', 'm', 'n', 'o', 0}; - assert(sizeof(expect)==msize); - assert(msize==dsize); - assert(memcmp(l, expect, msize)==0); - toku_free(l); -} - -char zeros[1026]; - -#define n5zeros 0,0,0,0,0 -#define n10zeros n5zeros,n5zeros -#define n25zeros n5zeros,n10zeros,n10zeros -#define n75zeros n25zeros,n25zeros,n25zeros -#define n125zeros n75zeros,n25zeros,n25zeros -#define n150zeros n75zeros,n75zeros -#define n300zeros n150zeros,n150zeros -#define n301zeros 0,n300zeros -#define n1025zeros n300zeros,n300zeros,n300zeros,n125zeros -static void test_leafentry_3long (void) { - char expect_3long[] = {LE_PROVDEL, - UINT64TOCHAR(0x0123456789abcdef0LL), - UINT32TOCHAR(301), n301zeros, - UINT32TOCHAR(1025), n1025zeros}; - - LEAFENTRY l; - int r; - u_int32_t msize, dsize; - r = le10_provdel(0x0123456789abcdef0LL, 301, zeros, 1025, zeros, &msize, &dsize, &l); - assert(r==0); - assert(sizeof(expect_3long)==msize); - assert(msize==dsize); - assert(memcmp(l, 
expect_3long, msize)==0); - toku_free(l); -} - -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { - test_leafentry_1(); - test_leafentry_2(); - test_leafentry_3(); - test_leafentry_4(); - test_leafentry_3long(); - return 0; -} diff --git a/newbrt/tests/test_logcursor.c b/newbrt/tests/test_logcursor.c index 096cff0302b..4ba790d8baf 100644 --- a/newbrt/tests/test_logcursor.c +++ b/newbrt/tests/test_logcursor.c @@ -127,7 +127,9 @@ int test_0 (void) { // test per-file version int test_1 () { int r=0; - char logfile[100] = "log000000000000.tokulog"; + char logfile[PATH_MAX]; + sprintf(logfile, "log000000000000.tokulog%d", TOKU_LOG_VERSION); + struct toku_logcursor *cursor; struct log_entry *entry; diff --git a/newbrt/ule.c b/newbrt/ule.c index 6f81cda22bf..a6beca7a40a 100644 --- a/newbrt/ule.c +++ b/newbrt/ule.c @@ -1641,25 +1641,3 @@ bool transaction_open(TXNID xid) { #endif -// Wrapper code to support backwards compatibility with version 10 (until we don't want it). -// These wrappers should be removed if/when we remove support for version 10 leafentries. 
-#include "backwards_10.h" -void -toku_upgrade_ule_init_empty_ule(ULE ule, u_int32_t keylen, void * keyp) { - ule_init_empty_ule(ule, keylen, keyp); -} -void -toku_upgrade_ule_remove_innermost_uxr(ULE ule) { - ule_remove_innermost_uxr(ule); -} -void -toku_upgrade_ule_push_insert_uxr(ULE ule, TXNID xid, u_int32_t vallen, void * valp) { - ule_push_insert_uxr(ule, xid, vallen, valp); -} -void -toku_upgrade_ule_push_delete_uxr(ULE ule, TXNID xid) { - ule_push_delete_uxr(ule, xid); -} - - - diff --git a/src/loader.c b/src/loader.c index 011ff46809e..322b736c8fb 100644 --- a/src/loader.c +++ b/src/loader.c @@ -212,7 +212,7 @@ int toku_loader_create_loader(DB_ENV *env, } else { char **XMALLOC_N(N, new_inames_in_env); - const struct descriptor **XMALLOC_N(N, descriptors); + DESCRIPTOR *XMALLOC_N(N, descriptors); for (int i=0; ii->brt->h->descriptor; } diff --git a/src/tests/Makefile b/src/tests/Makefile index aa4ae681e43..05f5ad69a16 100644 --- a/src/tests/Makefile +++ b/src/tests/Makefile @@ -33,7 +33,12 @@ endif SRCS = $(sort $(wildcard *.c)) RECOVER_SRCS = $(wildcard recover-*.c) LOADER_SRCS = $(wildcard loader-*.c) -NONSTANDARD_SRCS=$(RECOVER_SRCS) $(LOADER_SRCS) +TRANSPARENT_UPGRADE_SRCS = $(wildcard upgrade-*.c) +NONSTANDARD_SRCS= \ + $(RECOVER_SRCS) \ + $(LOADER_SRCS) \ + $(TRANSPARENT_UPGRADE_SRCS) \ +#end #Tests that don't compile in windows. 
SHould WINDOWS_NOT_PORTED_TESTS = \ @@ -104,6 +109,7 @@ BDB_DONTRUN_TESTS = \ loader-tpch-load \ manyfiles \ powerfail \ + preload-3.1-db \ progress \ recover-2483 \ recover-compare-db \ @@ -117,6 +123,8 @@ BDB_DONTRUN_TESTS = \ recover-put-multiple-fdelete-some \ recover-split-checkpoint \ recover-tablelock \ + recover-upgrade-db-descriptor-multihandle \ + recover-upgrade-db-descriptor \ recovery_fileops_stress \ recovery_fileops_unit \ recovery_stress \ @@ -153,6 +161,10 @@ BDB_DONTRUN_TESTS = \ test_txn_nested4 \ test_txn_nested5 \ transactional_fileops \ + upgrade-test-1 \ + upgrade-test-2 \ + upgrade-test-3 \ + upgrade-test-4 \ zombie_db \ #\ ends prev line diff --git a/src/tests/checkpoint_test.h b/src/tests/checkpoint_test.h index 795c93c13b6..09609393b5c 100644 --- a/src/tests/checkpoint_test.h +++ b/src/tests/checkpoint_test.h @@ -159,7 +159,7 @@ db_startup(DICTIONARY d, DB_TXN *open_txn) { { DBT desc; dbt_init(&desc, "foo", sizeof("foo")); - r = db->set_descriptor(db, 1, &desc, abort_on_upgrade); + r = db->set_descriptor(db, 1, &desc); CKERR(r); } { diff --git a/src/tests/env_startup.c b/src/tests/env_startup.c index e953d8c6751..aa4b23ce752 100644 --- a/src/tests/env_startup.c +++ b/src/tests/env_startup.c @@ -88,7 +88,7 @@ delete_directory(void) { static void delete_log(void) { char cmd[1024]; - sprintf(cmd, "rm -rf %s%s%s", ENVDIR, "/", "*.tokulog"); + sprintf(cmd, "rm -rf %s%s%s", ENVDIR, "/", "*.tokulog*"); int r = system(cmd); CKERR(r); } diff --git a/src/tests/loader-cleanup-test.c b/src/tests/loader-cleanup-test.c index d3616236a0c..91227a6fd9f 100644 --- a/src/tests/loader-cleanup-test.c +++ b/src/tests/loader-cleanup-test.c @@ -754,7 +754,7 @@ static void run_test(enum test_type t, int trigger) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, 
DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/loader-dup-test.c b/src/tests/loader-dup-test.c index 9c7302aeaf1..4aebf09816c 100644 --- a/src/tests/loader-dup-test.c +++ b/src/tests/loader-dup-test.c @@ -322,7 +322,7 @@ static void run_test(void) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/loader-no-puts.c b/src/tests/loader-no-puts.c index e74d2fe7287..fb482dc7028 100644 --- a/src/tests/loader-no-puts.c +++ b/src/tests/loader-no-puts.c @@ -127,7 +127,7 @@ static void run_test(void) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/loader-reference-test.c b/src/tests/loader-reference-test.c index c42e3bcc35f..6493f0402ad 100644 --- a/src/tests/loader-reference-test.c +++ b/src/tests/loader-reference-test.c @@ -132,7 +132,7 @@ static void run_test(void) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/loader-stress-test.c b/src/tests/loader-stress-test.c index ef068d51947..f8d32b43f21 100644 --- a/src/tests/loader-stress-test.c +++ b/src/tests/loader-stress-test.c @@ -361,7 +361,7 @@ static void run_test(void) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; 
snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/loader-tpch-load.c b/src/tests/loader-tpch-load.c index a416233e49c..3acf7d0161a 100644 --- a/src/tests/loader-tpch-load.c +++ b/src/tests/loader-tpch-load.c @@ -395,7 +395,7 @@ static int run_test(void) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/preload-3.1-db.c b/src/tests/preload-3.1-db.c new file mode 100644 index 00000000000..41c8869c6df --- /dev/null +++ b/src/tests/preload-3.1-db.c @@ -0,0 +1,202 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved." +#ident "$Id: loader-stress-test.c 20470 2010-05-20 18:30:04Z bkuszmaul $" + +#define kv_pair_funcs 1 // pull in kv_pair generators from test.h + +#include "test.h" +#include "toku_pthread.h" +#include "toku_atomic.h" +#include +#include +#include "ydb-internal.h" + +#include "test_kv_gen.h" +/* + */ + +DB_ENV *env; +enum {MAX_NAME=128}; +enum {ROWS_PER_TRANSACTION=10000}; +int NUM_DBS=5; +int NUM_ROWS=100000; +int CHECK_RESULTS=0; +enum { old_default_cachesize=1024 }; // MB +int CACHESIZE=old_default_cachesize; +int ALLOW_DUPS=0; + +static struct timeval starttime; +static double UU() elapsed_time (void) { + struct timeval now; + gettimeofday(&now, NULL); + return now.tv_sec - starttime.tv_sec + 1e-6*(now.tv_usec - starttime.tv_usec); +} + +static void preload_dbs(DB **dbs) +{ + gettimeofday(&starttime, NULL); + int r; + DB_TXN *txn; + uint32_t db_flags[MAX_DBS]; + uint32_t dbt_flags[MAX_DBS]; + uint32_t flags = DB_NOOVERWRITE; + flags = DB_YESOVERWRITE; + for(int i=0;itxn_begin(env, NULL, &txn, 0); CKERR(r); + for(int 
i=1;i<=ROWS_PER_TRANSACTION;i++) { + k = i + (x*ROWS_PER_TRANSACTION); + v = generate_val(k, 0); + dbt_init(&skey, &k, sizeof(unsigned int)); + dbt_init(&sval, &v, sizeof(unsigned int)); + + for(int db = 0;db < NUM_DBS;db++) { + put_multiple_generate(dbs[db], // dest_db + NULL, // src_db, ignored + &key, &val, // dest_key, dest_val + &skey, &sval, // src_key, src_val + NULL); // extra, ignored + + r = dbs[db]->put(dbs[db], txn, &key, &val, 0); CKERR(r); + if (key.flags == 0) { dbt_init_realloc(&key); } + if (val.flags == 0) { dbt_init_realloc(&val); } + } + } + r = txn->commit(txn, 0); CKERR(r); + if ( verbose ) {printf(".");fflush(stdout);} + } + if ( key.flags ) { toku_free(key.data); key.data = NULL; } + if ( val.flags ) { toku_free(val.data); key.data = NULL; } + + if ( CHECK_RESULTS) { + if ( verbose ) {printf("\nchecking");fflush(stdout);} + check_results(env, dbs, NUM_DBS, NUM_ROWS); + } + if ( verbose) {printf("\ndone\n");fflush(stdout);} +} + + +char *free_me = NULL; +char *env_dir = ENVDIR; // the default env_dir. 
+ +static void run_test(void) +{ + int r; + { + int len = strlen(env_dir) + 20; + char syscmd[len]; + r = snprintf(syscmd, len, "rm -rf %s", env_dir); + assert(rset_default_bt_compare(env, uint_dbt_cmp); CKERR(r); +// r = env->set_default_dup_compare(env, uint_dbt_cmp); CKERR(r); +// if ( verbose ) printf("CACHESIZE = %d MB\n", CACHESIZE); +// r = env->set_cachesize(env, CACHESIZE / 1024, (CACHESIZE % 1024)*1024*1024, 1); CKERR(r); +// CKERR(r); + int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE; + r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + r = env->checkpointing_set_period(env, 60); CKERR(r); + + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); + assert(dbs != NULL); + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + + generate_permute_tables(); + + // -------------------------- // + preload_dbs(dbs); + // -------------------------- // + + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + + if (verbose >= 2) + print_engine_status(env); + r = env->close(env, 0); CKERR(r); + toku_free(dbs); + + // reopen, then close environment to trim logfiles + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = env->close(env, 0); CKERR(r); +} + +// ------------ infrastructure ---------- +static void do_args(int argc, char * const argv[]); + +int test_main(int argc, char * const argv[]) { + do_args(argc, argv); + run_test(); + if (free_me) toku_free(free_me); + return 0; +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + + while (argc>0) { + if 
(strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: -h -c -d -r %s\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-d")==0) { + argc--; argv++; + NUM_DBS = atoi(argv[0]); + if ( NUM_DBS > MAX_DBS ) { + fprintf(stderr, "max value for -d field is %d\n", MAX_DBS); + resultcode=1; + goto do_usage; + } + } else if (strcmp(argv[0], "-r")==0) { + argc--; argv++; + NUM_ROWS = atoi(argv[0]); + } else if (strcmp(argv[0], "-c")==0) { + CHECK_RESULTS = 1; + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} diff --git a/src/tests/recover-compare-db-descriptor.c b/src/tests/recover-compare-db-descriptor.c index e6af9e3916e..3825d74889d 100644 --- a/src/tests/recover-compare-db-descriptor.c +++ b/src/tests/recover-compare-db-descriptor.c @@ -15,8 +15,8 @@ char *nameb="b.db"; static int my_compare(DB *UU(db), const DBT *a, const DBT *b) { assert(db); assert(db->descriptor); - assert(db->descriptor->size == sizeof(descriptor_contents)); - assert(memcmp(db->descriptor->data, descriptor_contents, sizeof(descriptor_contents)) == 0); + assert(db->descriptor->dbt.size == sizeof(descriptor_contents)); + assert(memcmp(db->descriptor->dbt.data, descriptor_contents, sizeof(descriptor_contents)) == 0); assert(a->size == b->size); return memcmp(a->data, b->data, a->size); @@ -29,7 +29,7 @@ set_descriptor(DB* db) { #if USE_TDB DBT descriptor; dbt_init(&descriptor, descriptor_contents, sizeof(descriptor_contents)); - int r = db->set_descriptor(db, 1, &descriptor, abort_on_upgrade); CKERR(r); + int r = db->set_descriptor(db, 1, &descriptor); CKERR(r); #endif } diff --git a/src/tests/recover-loader-test.c b/src/tests/recover-loader-test.c index 4e3f0318810..9816ef4cfbc 100644 --- a/src/tests/recover-loader-test.c +++ b/src/tests/recover-loader-test.c 
@@ -378,7 +378,7 @@ static void run_test(void) for(int i=0;iset_descriptor(dbs[i], 1, &desc, abort_on_upgrade); CKERR(r); + r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r); dbs[i]->app_private = &idx[i]; snprintf(name, sizeof(name), "db_%04x", i); r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/recover-lsn-filter-multiple.c b/src/tests/recover-lsn-filter-multiple.c index d0009c76f42..5e64d7d0899 100644 --- a/src/tests/recover-lsn-filter-multiple.c +++ b/src/tests/recover-lsn-filter-multiple.c @@ -14,38 +14,21 @@ static DBT dest_vals[num_dbs]; BOOL do_test=FALSE, do_recover=FALSE; -static int -crash_on_upgrade(DB* db, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val) { - db = db; - old_version = old_version; - old_descriptor = old_descriptor; - old_key = old_key; - old_val = old_val; - new_version = new_version; - new_descriptor = new_descriptor; - new_key = new_key; - new_val = new_val; - assert(FALSE); - return 0; -} - static int put_multiple_generate(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val, void *extra) { if (extra == NULL) { if (src_db) { assert(src_db->descriptor); - assert(src_db->descriptor->size == 4); - assert((*(uint32_t*)src_db->descriptor->data) == 0); + assert(src_db->descriptor->dbt.size == 4); + assert((*(uint32_t*)src_db->descriptor->dbt.data) == 0); } } else { assert(src_db == NULL); assert(extra==&namea); //Verifying extra gets set right. 
} - assert(dest_db->descriptor->size == 4); - uint32_t which = *(uint32_t*)dest_db->descriptor->data; + assert(dest_db->descriptor->dbt.size == 4); + uint32_t which = *(uint32_t*)dest_db->descriptor->dbt.data; assert(which < num_dbs); if (dest_key->data) toku_free(dest_key->data); @@ -88,9 +71,9 @@ static void run_test (void) { r = db_create(&dba, env, 0); CKERR(r); r = db_create(&dbb, env, 0); CKERR(r); which = 0; - r = dba->set_descriptor(dba, 1, &descriptor, crash_on_upgrade); CKERR(r); + r = dba->set_descriptor(dba, 1, &descriptor); CKERR(r); which = 1; - r = dbb->set_descriptor(dbb, 1, &descriptor, crash_on_upgrade); CKERR(r); + r = dbb->set_descriptor(dbb, 1, &descriptor); CKERR(r); r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); r = dbb->open(dbb, NULL, nameb, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/recover-missing-logfile.c b/src/tests/recover-missing-logfile.c index fb38e39b612..65f55bca82b 100644 --- a/src/tests/recover-missing-logfile.c +++ b/src/tests/recover-missing-logfile.c @@ -62,17 +62,17 @@ static void run_recover (void) { r = toku_os_mkdir(ENVDIR "/savedlogs", S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); - r = system("mv " ENVDIR "/*.tokulog " ENVDIR "/savedlogs/"); + r = system("mv " ENVDIR "/*.tokulog* " ENVDIR "/savedlogs/"); CKERR(r); r = db_env_create(&env, 0); CKERR(r); r = env->open(env, ENVDIR, envflags + DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR2(r, ENOENT); - r = system("rm -rf " ENVDIR "/*.tokulog"); + r = system("rm -rf " ENVDIR "/*.tokulog*"); CKERR(r); - r = system("mv " ENVDIR "/savedlogs/*.tokulog " ENVDIR "/"); + r = system("mv " ENVDIR "/savedlogs/*.tokulog* " ENVDIR "/"); CKERR(r); r = env->open(env, ENVDIR, envflags + DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); diff --git a/src/tests/recover-put-multiple-fdelete-all.c b/src/tests/recover-put-multiple-fdelete-all.c index 0b3296ebefc..f5147e0b783 100644 --- 
a/src/tests/recover-put-multiple-fdelete-all.c +++ b/src/tests/recover-put-multiple-fdelete-all.c @@ -14,29 +14,12 @@ static DBT dest_vals[num_dbs]; BOOL do_test=FALSE, do_recover=FALSE; -static int -crash_on_upgrade(DB* db, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val) { - db = db; - old_version = old_version; - old_descriptor = old_descriptor; - old_key = old_key; - old_val = old_val; - new_version = new_version; - new_descriptor = new_descriptor; - new_key = new_key; - new_val = new_val; - assert(FALSE); - return 0; -} - static int put_multiple_generate(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val, void *extra) { assert(src_db == NULL); assert(extra==&namea || extra==NULL); //Verifying extra gets set right. - assert(dest_db->descriptor->size == 4); - uint32_t which = *(uint32_t*)dest_db->descriptor->data; + assert(dest_db->descriptor->dbt.size == 4); + uint32_t which = *(uint32_t*)dest_db->descriptor->dbt.data; assert(which < num_dbs); if (dest_key->data) toku_free(dest_key->data); @@ -79,9 +62,9 @@ static void run_test (void) { r = db_create(&dba, env, 0); CKERR(r); r = db_create(&dbb, env, 0); CKERR(r); which = 0; - r = dba->set_descriptor(dba, 1, &descriptor, crash_on_upgrade); CKERR(r); + r = dba->set_descriptor(dba, 1, &descriptor); CKERR(r); which = 1; - r = dbb->set_descriptor(dbb, 1, &descriptor, crash_on_upgrade); CKERR(r); + r = dbb->set_descriptor(dbb, 1, &descriptor); CKERR(r); r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); r = dbb->open(dbb, NULL, nameb, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/recover-put-multiple-fdelete-some.c b/src/tests/recover-put-multiple-fdelete-some.c index 17299187068..8b5bd1ee3e2 100644 --- a/src/tests/recover-put-multiple-fdelete-some.c +++ 
b/src/tests/recover-put-multiple-fdelete-some.c @@ -14,38 +14,21 @@ static DBT dest_vals[num_dbs]; BOOL do_test=FALSE, do_recover=FALSE; -static int -crash_on_upgrade(DB* db, - u_int32_t old_version, const DBT *old_descriptor, const DBT *old_key, const DBT *old_val, - u_int32_t new_version, const DBT *new_descriptor, const DBT *new_key, const DBT *new_val) { - db = db; - old_version = old_version; - old_descriptor = old_descriptor; - old_key = old_key; - old_val = old_val; - new_version = new_version; - new_descriptor = new_descriptor; - new_key = new_key; - new_val = new_val; - assert(FALSE); - return 0; -} - static int put_multiple_generate(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val, void *extra) { if (extra == NULL) { if (src_db) { assert(src_db->descriptor); - assert(src_db->descriptor->size == 4); - assert((*(uint32_t*)src_db->descriptor->data) == 0); + assert(src_db->descriptor->dbt.size == 4); + assert((*(uint32_t*)src_db->descriptor->dbt.data) == 0); } } else { assert(src_db == NULL); assert(extra==&namea); //Verifying extra gets set right. 
} - assert(dest_db->descriptor->size == 4); - uint32_t which = *(uint32_t*)dest_db->descriptor->data; + assert(dest_db->descriptor->dbt.size == 4); + uint32_t which = *(uint32_t*)dest_db->descriptor->dbt.data; assert(which < num_dbs); if (dest_key->data) toku_free(dest_key->data); @@ -88,9 +71,9 @@ static void run_test (void) { r = db_create(&dba, env, 0); CKERR(r); r = db_create(&dbb, env, 0); CKERR(r); which = 0; - r = dba->set_descriptor(dba, 1, &descriptor, crash_on_upgrade); CKERR(r); + r = dba->set_descriptor(dba, 1, &descriptor); CKERR(r); which = 1; - r = dbb->set_descriptor(dbb, 1, &descriptor, crash_on_upgrade); CKERR(r); + r = dbb->set_descriptor(dbb, 1, &descriptor); CKERR(r); r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); r = dbb->open(dbb, NULL, nameb, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); diff --git a/src/tests/recover-upgrade-db-descriptor-multihandle.c b/src/tests/recover-upgrade-db-descriptor-multihandle.c new file mode 100644 index 00000000000..d3b4d4913fa --- /dev/null +++ b/src/tests/recover-upgrade-db-descriptor-multihandle.c @@ -0,0 +1,294 @@ +// verify that the comparison function get a valid db object pointer + +#include +#include "test.h" + + +char *descriptor_contents[] = { + "Spoon full of sugar", + "Bucket full of pants" +}; + +const int envflags = DB_INIT_MPOOL|DB_CREATE|DB_THREAD |DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN|DB_PRIVATE; +char *namea="a.db"; + +int verified = 0; +uint32_t forced_version = 2; + +#if USE_TDB + +static int my_compare(DB *UU(db), const DBT *a, const DBT *b) { + assert(db); + assert(db->descriptor); + uint32_t version = db->descriptor->version; + assert(version > 0); + assert(version == forced_version); + uint32_t which = version-1; + size_t len = strlen(descriptor_contents[which])+1; + + assert(db->descriptor->dbt.size == len); + assert(memcmp(db->descriptor->dbt.data, descriptor_contents[which], len) == 0); + + assert(a->size == b->size); + verified 
= 1; + return memcmp(a->data, b->data, a->size); +} + +#endif + +static void +set_descriptor(DB* db, int which) { +#if USE_TDB + DBT descriptor; + size_t len = strlen(descriptor_contents[which])+1; + dbt_init(&descriptor, descriptor_contents[which], len); + int r = db->set_descriptor(db, which+1, &descriptor); CKERR(r); +#endif +} + +static void +do_x1_shutdown (BOOL do_commit, BOOL do_abort) { + int r; + r = system("rm -rf " ENVDIR); CKERR(r); + r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = toku_os_mkdir(ENVDIR"/data", S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + DB_ENV *env; + DB *dba, *dbb; + r = db_env_create(&env, 0); CKERR(r); + r = env->set_data_dir(env, "data"); CKERR(r); +#if USE_TDB + r = env->set_default_bt_compare(env, my_compare); CKERR(r); +#endif + r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + r = db_create(&dba, env, 0); CKERR(r); + set_descriptor(dba, 0); + r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + + r = db_create(&dbb, env, 0); CKERR(r); + set_descriptor(dbb, 1); + r = dbb->open(dbb, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + { + DBT a={.data="a", .size=2}; + DBT b={.data="b", .size=2}; + r = dba->put(dba, txn, &a, &b, 0); CKERR(r); + r = dba->put(dba, txn, &b, &a, 0); CKERR(r); + r = dbb->put(dbb, txn, &b, &a, 0); CKERR(r); + } + //printf("opened\n"); + if (do_commit) { + r = txn->commit(txn, 0); CKERR(r); + } else if (do_abort) { + r = txn->abort(txn); CKERR(r); + + // force an fsync of the log + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + r = txn->commit(txn, 0); CKERR(r); + } + assert(verified); + //printf("shutdown\n"); + toku_hard_crash_on_purpose(); +} + +static void +do_x1_recover (BOOL did_commit) { + DB_ENV *env; + DB *dba; + int r; + r = system("rm -rf " ENVDIR"/data"); /* Delete dictionaries */ CKERR(r); + r = 
toku_os_mkdir(ENVDIR"/data", S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = db_env_create(&env, 0); CKERR(r); + r = env->set_data_dir(env, "data"); CKERR(r); +#if USE_TDB + r = env->set_default_bt_compare(env, my_compare); CKERR(r); +#endif + r = env->open(env, ENVDIR, envflags|DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = db_create(&dba, env, 0); CKERR(r); + r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + DBT aa={.size=0}, ab={.size=0}; + DB_TXN *txn; + DBC *ca; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + r = dba->cursor(dba, txn, &ca, 0); CKERR(r); + int ra = ca->c_get(ca, &aa, &ab, DB_FIRST); CKERR(r); + if (did_commit) { + assert(ra==0); + // verify key-value pairs + assert(aa.size==2); + assert(ab.size==2); + const char a[2] = "a"; + const char b[2] = "b"; + assert(memcmp(aa.data, &a, 2)==0); + assert(memcmp(ab.data, &b, 2)==0); + assert(memcmp(ab.data, &b, 2)==0); + assert(ca->c_get(ca, &aa, &ab, DB_NEXT) == 0); + assert(aa.size == 2 && ab.size == 2 && memcmp(aa.data, b, 2) == 0 && memcmp(ab.data, a, 2) == 0); + // make sure no other entries in DB + assert(ca->c_get(ca, &aa, &ab, DB_NEXT) == DB_NOTFOUND); + } else { + // It wasn't committed (it also wasn't aborted), but a checkpoint happened. 
+ assert(ra==DB_NOTFOUND); + } + r = ca->c_close(ca); CKERR(r); + r = txn->commit(txn, 0); CKERR(r); + r = dba->close(dba, 0); CKERR(r); + r = env->close(env, 0); CKERR(r); + assert(verified); + exit(0); +} + +static void +do_x1_recover_only (void) { + DB_ENV *env; + int r; + + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags|DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = env->close(env, 0); CKERR(r); + exit(0); +} + +static void +do_x1_no_recover (void) { + DB_ENV *env; + int r; + + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags & ~DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); + assert(r == DB_RUNRECOVERY); + r = env->close(env, 0); CKERR(r); + exit(0); +} + +const char *cmd; + +#if 0 + +static void +do_test_internal (BOOL commit) +{ + pid_t pid; + if (0 == (pid=fork())) { + int r=execl(cmd, verbose ? "-v" : "-q", commit ? "--commit" : "--abort", NULL); + assert(r==-1); + printf("execl failed: %d (%s)\n", errno, strerror(errno)); + assert(0); + } + { + int r; + int status; + r = waitpid(pid, &status, 0); + //printf("signaled=%d sig=%d\n", WIFSIGNALED(status), WTERMSIG(status)); + assert(WIFSIGNALED(status) && WTERMSIG(status)==SIGABRT); + } + // Now find out what happend + + if (0 == (pid = fork())) { + int r=execl(cmd, verbose ? "-v" : "-q", commit ? 
"--recover-committed" : "--recover-aborted", NULL); + assert(r==-1); + printf("execl failed: %d (%s)\n", errno, strerror(errno)); + assert(0); + } + { + int r; + int status; + r = waitpid(pid, &status, 0); + //printf("recovery exited=%d\n", WIFEXITED(status)); + assert(WIFEXITED(status) && WEXITSTATUS(status)==0); + } +} + +static void +do_test (void) { + do_test_internal(TRUE); + do_test_internal(FALSE); +} + +#endif + + +BOOL do_commit=FALSE, do_abort=FALSE, do_explicit_abort=FALSE, do_recover_committed=FALSE, do_recover_aborted=FALSE, do_recover_only=FALSE, do_no_recover = FALSE; + +static void +x1_parse_args (int argc, char * const argv[]) { + int resultcode; + cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-v") == 0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "--commit")==0 || strcmp(argv[0], "--test") == 0) { + do_commit=TRUE; + } else if (strcmp(argv[0], "--abort")==0) { + do_abort=TRUE; + } else if (strcmp(argv[0], "--explicit-abort")==0) { + do_explicit_abort=TRUE; + } else if (strcmp(argv[0], "--recover-committed")==0 || strcmp(argv[0], "--recover") == 0) { + do_recover_committed=TRUE; + } else if (strcmp(argv[0], "--recover-aborted")==0) { + do_recover_aborted=TRUE; + } else if (strcmp(argv[0], "--recover-only") == 0) { + do_recover_only=TRUE; + } else if (strcmp(argv[0], "--no-recover") == 0) { + do_no_recover=TRUE; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage:\n%s [-v|-q]* [-h] {--commit | --abort | --explicit-abort | --recover-committed | --recover-aborted } \n", cmd); + exit(resultcode); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } + { + int n_specified=0; + if (do_commit) n_specified++; + if (do_abort) n_specified++; + if (do_explicit_abort) n_specified++; + if (do_recover_committed) n_specified++; + if 
(do_recover_aborted) n_specified++; + if (do_recover_only) n_specified++; + if (do_no_recover) n_specified++; + if (n_specified>1) { + printf("Specify only one of --commit or --abort or --recover-committed or --recover-aborted\n"); + resultcode=1; + goto do_usage; + } + } +} + +int +test_main (int argc, char * const argv[]) +{ + x1_parse_args(argc, argv); + if (do_commit) { + do_x1_shutdown (TRUE, FALSE); + } else if (do_abort) { + do_x1_shutdown (FALSE, FALSE); + } else if (do_explicit_abort) { + do_x1_shutdown(FALSE, TRUE); + } else if (do_recover_committed) { + do_x1_recover(TRUE); + } else if (do_recover_aborted) { + do_x1_recover(FALSE); + } else if (do_recover_only) { + do_x1_recover_only(); + } else if (do_no_recover) { + do_x1_no_recover(); + } +#if 0 + else { + do_test(); + } +#endif + return 0; +} diff --git a/src/tests/recover-upgrade-db-descriptor.c b/src/tests/recover-upgrade-db-descriptor.c new file mode 100644 index 00000000000..ca8b488dd14 --- /dev/null +++ b/src/tests/recover-upgrade-db-descriptor.c @@ -0,0 +1,297 @@ +// verify that the comparison function get a valid db object pointer + +#include +#include "test.h" + + +char *descriptor_contents[] = { + "Spoon full of sugar", + "Bucket full of pants" +}; + +const int envflags = DB_INIT_MPOOL|DB_CREATE|DB_THREAD |DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN|DB_PRIVATE; +char *namea="a.db"; + +int verified = 0; +uint32_t forced_version = 2; + +#if USE_TDB + +static int my_compare(DB *UU(db), const DBT *a, const DBT *b) { + assert(db); + assert(db->descriptor); + uint32_t version = db->descriptor->version; + assert(version > 0); + assert(version == forced_version); + uint32_t which = version-1; + size_t len = strlen(descriptor_contents[which])+1; + + assert(db->descriptor->dbt.size == len); + assert(memcmp(db->descriptor->dbt.data, descriptor_contents[which], len) == 0); + + assert(a->size == b->size); + verified = 1; + return memcmp(a->data, b->data, a->size); +} + +#endif + +static void +set_descriptor(DB* 
db, int which) { +#if USE_TDB + DBT descriptor; + size_t len = strlen(descriptor_contents[which])+1; + dbt_init(&descriptor, descriptor_contents[which], len); + int r = db->set_descriptor(db, which+1, &descriptor); CKERR(r); +#endif +} + +static void +do_x1_shutdown (BOOL do_commit, BOOL do_abort) { + int r; + r = system("rm -rf " ENVDIR); CKERR(r); + r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = toku_os_mkdir(ENVDIR"/data", S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + DB_ENV *env; + DB *dba; + r = db_env_create(&env, 0); CKERR(r); + r = env->set_data_dir(env, "data"); CKERR(r); +#if USE_TDB + r = env->set_default_bt_compare(env, my_compare); CKERR(r); +#endif + r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + + r = db_create(&dba, env, 0); CKERR(r); + set_descriptor(dba, 0); + r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + r = dba->close(dba, 0); CKERR(r); + + r = db_create(&dba, env, 0); CKERR(r); + set_descriptor(dba, 1); + r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + + + + DB_TXN *txn; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + { + DBT a={.data="a", .size=2}; + DBT b={.data="b", .size=2}; + r = dba->put(dba, txn, &a, &b, 0); CKERR(r); + r = dba->put(dba, txn, &b, &a, 0); CKERR(r); + } + //printf("opened\n"); + if (do_commit) { + r = txn->commit(txn, 0); CKERR(r); + } else if (do_abort) { + r = txn->abort(txn); CKERR(r); + + // force an fsync of the log + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + r = txn->commit(txn, 0); CKERR(r); + } + //printf("shutdown\n"); + assert(verified); + toku_hard_crash_on_purpose(); +} + +static void +do_x1_recover (BOOL did_commit) { + DB_ENV *env; + DB *dba; + int r; + r = system("rm -rf " ENVDIR"/data"); /* Delete dictionaries */ CKERR(r); + r = toku_os_mkdir(ENVDIR"/data", S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = db_env_create(&env, 0); CKERR(r); + r = env->set_data_dir(env, 
"data"); CKERR(r); +#if USE_TDB + r = env->set_default_bt_compare(env, my_compare); CKERR(r); +#endif + r = env->open(env, ENVDIR, envflags|DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = db_create(&dba, env, 0); CKERR(r); + r = dba->open(dba, NULL, namea, NULL, DB_BTREE, DB_AUTO_COMMIT|DB_CREATE, 0666); CKERR(r); + DBT aa={.size=0}, ab={.size=0}; + DB_TXN *txn; + DBC *ca; + r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); + r = dba->cursor(dba, txn, &ca, 0); CKERR(r); + int ra = ca->c_get(ca, &aa, &ab, DB_FIRST); CKERR(r); + if (did_commit) { + assert(ra==0); + // verify key-value pairs + assert(aa.size==2); + assert(ab.size==2); + const char a[2] = "a"; + const char b[2] = "b"; + assert(memcmp(aa.data, &a, 2)==0); + assert(memcmp(ab.data, &b, 2)==0); + assert(memcmp(ab.data, &b, 2)==0); + assert(ca->c_get(ca, &aa, &ab, DB_NEXT) == 0); + assert(aa.size == 2 && ab.size == 2 && memcmp(aa.data, b, 2) == 0 && memcmp(ab.data, a, 2) == 0); + // make sure no other entries in DB + assert(ca->c_get(ca, &aa, &ab, DB_NEXT) == DB_NOTFOUND); + } else { + // It wasn't committed (it also wasn't aborted), but a checkpoint happened. 
+ assert(ra==DB_NOTFOUND); + } + r = ca->c_close(ca); CKERR(r); + r = txn->commit(txn, 0); CKERR(r); + r = dba->close(dba, 0); CKERR(r); + r = env->close(env, 0); CKERR(r); + assert(verified); + exit(0); +} + +static void +do_x1_recover_only (void) { + DB_ENV *env; + int r; + + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags|DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + r = env->close(env, 0); CKERR(r); + exit(0); +} + +static void +do_x1_no_recover (void) { + DB_ENV *env; + int r; + + r = db_env_create(&env, 0); CKERR(r); + r = env->open(env, ENVDIR, envflags & ~DB_RECOVER, S_IRWXU+S_IRWXG+S_IRWXO); + assert(r == DB_RUNRECOVERY); + r = env->close(env, 0); CKERR(r); + exit(0); +} + +const char *cmd; + +#if 0 + +static void +do_test_internal (BOOL commit) +{ + pid_t pid; + if (0 == (pid=fork())) { + int r=execl(cmd, verbose ? "-v" : "-q", commit ? "--commit" : "--abort", NULL); + assert(r==-1); + printf("execl failed: %d (%s)\n", errno, strerror(errno)); + assert(0); + } + { + int r; + int status; + r = waitpid(pid, &status, 0); + //printf("signaled=%d sig=%d\n", WIFSIGNALED(status), WTERMSIG(status)); + assert(WIFSIGNALED(status) && WTERMSIG(status)==SIGABRT); + } + // Now find out what happend + + if (0 == (pid = fork())) { + int r=execl(cmd, verbose ? "-v" : "-q", commit ? 
"--recover-committed" : "--recover-aborted", NULL); + assert(r==-1); + printf("execl failed: %d (%s)\n", errno, strerror(errno)); + assert(0); + } + { + int r; + int status; + r = waitpid(pid, &status, 0); + //printf("recovery exited=%d\n", WIFEXITED(status)); + assert(WIFEXITED(status) && WEXITSTATUS(status)==0); + } +} + +static void +do_test (void) { + do_test_internal(TRUE); + do_test_internal(FALSE); +} + +#endif + + +BOOL do_commit=FALSE, do_abort=FALSE, do_explicit_abort=FALSE, do_recover_committed=FALSE, do_recover_aborted=FALSE, do_recover_only=FALSE, do_no_recover = FALSE; + +static void +x1_parse_args (int argc, char * const argv[]) { + int resultcode; + cmd = argv[0]; + argc--; argv++; + while (argc>0) { + if (strcmp(argv[0], "-v") == 0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "--commit")==0 || strcmp(argv[0], "--test") == 0) { + do_commit=TRUE; + } else if (strcmp(argv[0], "--abort")==0) { + do_abort=TRUE; + } else if (strcmp(argv[0], "--explicit-abort")==0) { + do_explicit_abort=TRUE; + } else if (strcmp(argv[0], "--recover-committed")==0 || strcmp(argv[0], "--recover") == 0) { + do_recover_committed=TRUE; + } else if (strcmp(argv[0], "--recover-aborted")==0) { + do_recover_aborted=TRUE; + } else if (strcmp(argv[0], "--recover-only") == 0) { + do_recover_only=TRUE; + } else if (strcmp(argv[0], "--no-recover") == 0) { + do_no_recover=TRUE; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage:\n%s [-v|-q]* [-h] {--commit | --abort | --explicit-abort | --recover-committed | --recover-aborted } \n", cmd); + exit(resultcode); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } + { + int n_specified=0; + if (do_commit) n_specified++; + if (do_abort) n_specified++; + if (do_explicit_abort) n_specified++; + if (do_recover_committed) n_specified++; + if 
(do_recover_aborted) n_specified++; + if (do_recover_only) n_specified++; + if (do_no_recover) n_specified++; + if (n_specified>1) { + printf("Specify only one of --commit or --abort or --recover-committed or --recover-aborted\n"); + resultcode=1; + goto do_usage; + } + } +} + +int +test_main (int argc, char * const argv[]) +{ + x1_parse_args(argc, argv); + if (do_commit) { + do_x1_shutdown (TRUE, FALSE); + } else if (do_abort) { + do_x1_shutdown (FALSE, FALSE); + } else if (do_explicit_abort) { + do_x1_shutdown(FALSE, TRUE); + } else if (do_recover_committed) { + do_x1_recover(TRUE); + } else if (do_recover_aborted) { + do_x1_recover(FALSE); + } else if (do_recover_only) { + do_x1_recover_only(); + } else if (do_no_recover) { + do_x1_no_recover(); + } +#if 0 + else { + do_test(); + } +#endif + return 0; +} diff --git a/src/tests/test.h b/src/tests/test.h index b8ffea6fe7f..7d250fbf997 100644 --- a/src/tests/test.h +++ b/src/tests/test.h @@ -185,14 +185,6 @@ typedef enum __toku_bool { FALSE=0, TRUE=1} BOOL; #include -static int __attribute__((__unused__)) -abort_on_upgrade(DB* UU(pdb), - u_int32_t UU(old_version), const DBT *UU(old_descriptor), const DBT *UU(old_key), const DBT *UU(old_val), - u_int32_t UU(new_version), const DBT *UU(new_descriptor), const DBT *UU(new_key), const DBT *UU(new_val)) { - assert(FALSE); //Must not upgrade. 
- return ENOSYS; -} - unsigned int seed = 0xFEEDFACE; static u_int64_t __attribute__((__unused__)) diff --git a/src/tests/test_db_descriptor.c b/src/tests/test_db_descriptor.c index 344bf86e6aa..d24b9bc0c48 100644 --- a/src/tests/test_db_descriptor.c +++ b/src/tests/test_db_descriptor.c @@ -15,7 +15,7 @@ #define FNAME "foo.tokudb" char *name = NULL; -#define NUM 8 +#define NUM 3 #define MAX_LENGTH (1<<16) int order[NUM+1]; @@ -24,7 +24,8 @@ u_int8_t data[NUM][MAX_LENGTH]; DBT descriptors[NUM]; DB_ENV *env; -DB *db; +enum {NUM_DBS=2}; +DB *dbs[NUM_DBS]; DB_TXN *txn = NULL; DB_TXN *null_txn; int last_open_descriptor = -1; @@ -37,17 +38,27 @@ int manual_truncate = 0; static void verify_db_matches(void) { - const DBT * dbt = db->descriptor; + DB *db; + int which; + for (which = 0; which < NUM_DBS; which++) { + db = dbs[which]; + if (db) { + const DBT * dbt = &db->descriptor->dbt; - if (last_open_descriptor<0) { - assert(dbt->size == 0 && dbt->data == NULL); - } - else { - assert(last_open_descriptor < NUM); - assert(dbt->size == descriptors[last_open_descriptor].size); - assert(!memcmp(dbt->data, descriptors[last_open_descriptor].data, dbt->size)); - assert(dbt->data != descriptors[last_open_descriptor].data); + if (last_open_descriptor<0) { + assert(dbt->size == 0 && dbt->data == NULL); + assert(db->descriptor->version == 0); + } + else { + assert(last_open_descriptor < NUM); + assert(dbt->size == descriptors[last_open_descriptor].size); + assert(!memcmp(dbt->data, descriptors[last_open_descriptor].data, dbt->size)); + assert(dbt->data != descriptors[last_open_descriptor].data); + assert(db->descriptor->version == (uint32_t)last_open_descriptor+1); + } + } } + } static int @@ -59,30 +70,33 @@ verify_int_cmp (DB *dbp, const DBT *a, const DBT *b) { } static void -open_db(int descriptor) { +open_db(int descriptor, int which) { /* create the dup database file */ - assert(txn==NULL); + assert(dbs[which]==NULL); + DB *db; int r = db_create(&db, env, 0); CKERR(r); + 
dbs[which] = db; + r = db->set_bt_compare(db, verify_int_cmp); CKERR(r); assert(abort_type >=0 && abort_type <= 2); - if (abort_type==2) { + if (abort_type==2 && !txn) { r = env->txn_begin(env, null_txn, &txn, 0); CKERR(r); last_open_descriptor = -1; //DB was destroyed at end of last close, did not hang around. } if (descriptor >= 0) { assert(descriptor < NUM); - u_int32_t descriptor_version = 1; - r = db->set_descriptor(db, descriptor_version, &descriptors[descriptor], abort_on_upgrade); + u_int32_t descriptor_version = descriptor+1; + r = db->set_descriptor(db, descriptor_version, &descriptors[descriptor]); CKERR(r); last_open_descriptor = descriptor; } r = db->open(db, txn, FNAME, name, DB_BTREE, DB_CREATE, 0666); CKERR(r); verify_db_matches(); - if (abort_type!=2) { + if (abort_type!=2 && !txn) { r = env->txn_begin(env, null_txn, &txn, 0); CKERR(r); } @@ -95,6 +109,11 @@ open_db(int descriptor) { static void delete_db(void) { + int which; + for (which = 0; which < NUM_DBS; which++) { + assert(dbs[which] == NULL); + } + DB *db; int r = db_create(&db, env, 0); CKERR(r); r = db->remove(db, FNAME, name, 0); @@ -106,14 +125,26 @@ delete_db(void) { } static void -close_db(void) { +close_db(int which) { + assert(dbs[which]!=NULL); + DB *db = dbs[which]; + dbs[which] = NULL; + int r; + if (which==1) { + r = db->close(db, 0); + CKERR(r); + return; + } if (manual_truncate) { u_int32_t ignore_row_count; r = db->truncate(db, txn, &ignore_row_count, 0); CKERR(r); } if (abort_type>0) { + if (abort_type==2 && dbs[1]) { + close_db(1); + } r = db->close(db, 0); CKERR(r); r = txn->abort(txn); @@ -163,7 +194,17 @@ permute_order(void) { } static void -test_insert (int n) { +test_insert (int n, int which) { + if (which == -1) { + for (which = 0; which < NUM_DBS; which++) { + if (dbs[which]) { + test_insert(n, which); + } + } + return; + } + assert(dbs[which]!=NULL); + DB *db = dbs[which]; int i; static int last = 0; for (i=0; iclose(env, 0); } diff --git a/src/tests/test_kv_gen.h 
b/src/tests/test_kv_gen.h new file mode 100644 index 00000000000..a0032f743e5 --- /dev/null +++ b/src/tests/test_kv_gen.h @@ -0,0 +1,177 @@ + +#ifndef __TEST_KV_GEN_H +#define __TEST_KV_GEN_H + +#if defined(__cilkplusplus) || defined(__cplusplus) +extern "C" { +#endif + +#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." + +#include "test.h" +// +// Functions to create unique key/value pairs, row generators, checkers, ... for each of NUM_DBS +// + +// a is the bit-wise permute table. For DB[i], permute bits as described in a[i] using 'twiddle32' +// inv is the inverse bit-wise permute of a[]. To get the original value from a twiddled value, twiddle32 (again) with inv[] +enum {MAX_DBS=256}; +enum {MAGIC=311}; +static int aa[MAX_DBS][32] UU(); +static int inv[MAX_DBS][32] UU(); + +// rotate right and left functionsp +static inline unsigned int UU() +rotr32(const unsigned int x, const unsigned int num) { + const unsigned int n = num % 32; + return (x >> n) | ( x << (32 - n)); +} +static inline unsigned int UU() +rotl32(const unsigned int x, const unsigned int num) { + const unsigned int n = num % 32; + return (x << n) | ( x >> (32 - n)); +} + +static void UU() +generate_permute_tables(void) { + srandom(1); + int i, j, tmp; + for(int db=0;db> i ) & 1) << aa[db][i]; + } + return b; +} + +// permute bits of x based on inverse permute table bitmap +static unsigned int UU() +inv_twiddle32(unsigned int x, int db) +{ + unsigned int b = 0; + for(int i=0;i<32;i++) { + b |= (( x >> i ) & 1) << inv[db][i]; + } + return b; +} + +// generate val from key, index +static unsigned int UU() +generate_val(int key, int i) { + return rotl32((key + MAGIC), i); +} +static unsigned int UU() +pkey_for_val(int key, int i) { + return rotr32(key, i) - MAGIC; +} + +static void UU() +check_results(DB_ENV *env, DB **dbs, const int num_dbs, const int num_rows) +{ + for(int j=0;jtxn_begin(env, NULL, &txn, 0); + CKERR(r); + + DBC *cursor; + r = dbs[j]->cursor(dbs[j], txn, &cursor, 0); + 
CKERR(r); + for(int i=0;ic_get(cursor, &key, &val, DB_NEXT); + CKERR(r); + k = *(unsigned int*)key.data; + pkey_for_db_key = (j == 0) ? k : inv_twiddle32(k, j); + v = *(unsigned int*)val.data; + // test that we have the expected keys and values + if ((unsigned int)pkey_for_db_key != (unsigned int)pkey_for_val(v, j)) + printf(" DB[%d] key = %10u, val = %10u, pkey_for_db_key = %10u, pkey_for_val=%10d\n", j, v, k, pkey_for_db_key, pkey_for_val(v, j)); + assert((unsigned int)pkey_for_db_key == (unsigned int)pkey_for_val(v, j)); + dbt_init(&key, NULL, sizeof(unsigned int)); + dbt_init(&val, NULL, sizeof(unsigned int)); + } + if ( verbose ) {printf("."); fflush(stdout);} + r = cursor->c_close(cursor); + CKERR(r); + r = txn->commit(txn, 0); + CKERR(r); + } + if ( verbose ) {printf("ok");fflush(stdout);} +} + +static int UU() +put_multiple_generate(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val, void *extra) { + + src_db = src_db; + extra = extra; + + uint32_t which = *(uint32_t*)dest_db->app_private; + + if ( which == 0 ) { + if (dest_key->flags==DB_DBT_REALLOC) { + if (dest_key->data) toku_free(dest_key->data); + dest_key->flags = 0; + dest_key->ulen = 0; + } + if (dest_val->flags==DB_DBT_REALLOC) { + if (dest_val->data) toku_free(dest_val->data); + dest_val->flags = 0; + dest_val->ulen = 0; + } + dbt_init(dest_key, src_key->data, src_key->size); + dbt_init(dest_val, src_val->data, src_val->size); + } + else { + assert(dest_key->flags==DB_DBT_REALLOC); + if (dest_key->ulen < sizeof(unsigned int)) { + dest_key->data = toku_xrealloc(dest_key->data, sizeof(unsigned int)); + dest_key->ulen = sizeof(unsigned int); + } + assert(dest_val->flags==DB_DBT_REALLOC); + if (dest_val->ulen < sizeof(unsigned int)) { + dest_val->data = toku_xrealloc(dest_val->data, sizeof(unsigned int)); + dest_val->ulen = sizeof(unsigned int); + } + unsigned int *new_key = (unsigned int *)dest_key->data; + unsigned int *new_val = (unsigned int 
*)dest_val->data; + + *new_key = twiddle32(*(unsigned int*)src_key->data, which); + *new_val = generate_val(*(unsigned int*)src_key->data, which); + + dest_key->size = sizeof(unsigned int); + dest_val->size = sizeof(unsigned int); + //data is already set above + } + return 0; +} + +#if defined(__cilkplusplus) || defined(__cplusplus) +} +#endif + +#endif // __TEST_KV_GEN_H diff --git a/src/tests/upgrade-test-1.c b/src/tests/upgrade-test-1.c new file mode 100644 index 00000000000..f0b886e7ada --- /dev/null +++ b/src/tests/upgrade-test-1.c @@ -0,0 +1,164 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved." +#ident "$Id: loader-stress-test.c 20470 2010-05-20 18:30:04Z bkuszmaul $" + +#define kv_pair_funcs 1 // pull in kv_pair generators from test.h + +#include "test.h" +#include "toku_pthread.h" +#include "toku_atomic.h" +#include +#include +#include "ydb-internal.h" + +#include "test_kv_gen.h" + +/* + */ + +DB_ENV *env; +enum {MAX_NAME=128}; +int NUM_DBS=5; +int NUM_ROWS=100000; +int CHECK_RESULTS=0; +enum { old_default_cachesize=1024 }; // MB +int CACHESIZE=old_default_cachesize; + +char *db_v3_dir = "../../utils/dir.preload-3.1-db.c.tdb"; +char *db_v4_dir = "dir.preload-3.1-db.c.tdb"; +char *env_dir = ENVDIR; // the default env_dir. 
+ +int SRC_VERSION = 4; + +static void upgrade_test_1(DB **dbs) { + int r; + // open the DBS + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + + // read and verify all rows + { + if ( verbose ) {printf("checking");fflush(stdout);} + check_results(env, dbs, NUM_DBS, NUM_ROWS); + if ( verbose) {printf("\ndone\n");fflush(stdout);} + } + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } +} + +static void run_test(void) +{ + int r; + + char *src_db_dir; + if ( SRC_VERSION == 3 ) + src_db_dir = db_v3_dir; + else if ( SRC_VERSION == 4 ) + src_db_dir = db_v4_dir; + else { + fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); + assert(0); + } + + { + int len = 256; + char syscmd[len]; + r = snprintf(syscmd, len, "rm -rf %s", env_dir); + assert(ropen(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + r = env->checkpointing_set_period(env, 60); CKERR(r); + + DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); + assert(dbs != NULL); + + // -------------------------- + upgrade_test_1(dbs); + // -------------------------- + + if (verbose >= 2) + print_engine_status(env); + r = env->close(env, 0); CKERR(r); + toku_free(dbs); + +} + +// ------------ infrastructure ---------- +static void do_args(int argc, char * const argv[]); + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(); + return 0; +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + + while (argc>0) { + if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if 
(strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: -h -c -d -r %s\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-d")==0) { + argc--; argv++; + NUM_DBS = atoi(argv[0]); + if ( NUM_DBS > MAX_DBS ) { + fprintf(stderr, "max value for -d field is %d\n", MAX_DBS); + resultcode=1; + goto do_usage; + } + } else if (strcmp(argv[0], "-r")==0) { + argc--; argv++; + NUM_ROWS = atoi(argv[0]); + } else if (strcmp(argv[0], "-c")==0) { + CHECK_RESULTS = 1; + } else if (strcmp(argv[0], "-V")==0) { + argc--; argv++; + SRC_VERSION = atoi(argv[0]); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} diff --git a/src/tests/upgrade-test-2.c b/src/tests/upgrade-test-2.c new file mode 100644 index 00000000000..221c3b96e9c --- /dev/null +++ b/src/tests/upgrade-test-2.c @@ -0,0 +1,185 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved." +#ident "$Id: loader-stress-test.c 20470 2010-05-20 18:30:04Z bkuszmaul $" + +#include "test.h" +#include "toku_pthread.h" +#include "toku_atomic.h" +#include +#include +#include "ydb-internal.h" + +#include "test_kv_gen.h" + +/* + */ + +DB_ENV *env; +enum {MAX_NAME=128}; +int NUM_DBS=5; +int NUM_ROWS=100000; +int CHECK_RESULTS=0; +enum { old_default_cachesize=1024 }; // MB +int CACHESIZE=old_default_cachesize; + +char *db_v3_dir = "../../utils/dir.preload-3.1-db.c.tdb"; +char *db_v4_dir = "dir.preload-3.1-db.c.tdb"; +char *env_dir = ENVDIR; // the default env_dir. 
+ +int SRC_VERSION = 4; + +static void upgrade_test_2(DB **dbs) { + int r; + // open the DBS + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } + // open + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + + // read and verify all rows + { + if ( verbose ) {printf("checking");fflush(stdout);} + check_results(env, dbs, NUM_DBS, NUM_ROWS); + if ( verbose) {printf("\ndone\n");fflush(stdout);} + } + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } +} + +static void run_test(void) +{ + int r; + + char *src_db_dir; + if ( SRC_VERSION == 3 ) + src_db_dir = db_v3_dir; + else if ( SRC_VERSION == 4 ) + src_db_dir = db_v4_dir; + else { + fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); + assert(0); + } + + { + int len = 256; + char syscmd[len]; + r = snprintf(syscmd, len, "rm -rf %s", env_dir); + assert(ropen(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + r = env->checkpointing_set_period(env, 60); CKERR(r); + + DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); + assert(dbs != NULL); + + // -------------------------- + upgrade_test_2(dbs); + // -------------------------- + + if (verbose >= 2) + print_engine_status(env); + r = env->close(env, 0); CKERR(r); + toku_free(dbs); + +} + +// ------------ infrastructure ---------- +static 
void do_args(int argc, char * const argv[]); + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(); + return 0; +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + + while (argc>0) { + if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: -h -c -d -r %s\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-d")==0) { + argc--; argv++; + NUM_DBS = atoi(argv[0]); + if ( NUM_DBS > MAX_DBS ) { + fprintf(stderr, "max value for -d field is %d\n", MAX_DBS); + resultcode=1; + goto do_usage; + } + } else if (strcmp(argv[0], "-r")==0) { + argc--; argv++; + NUM_ROWS = atoi(argv[0]); + } else if (strcmp(argv[0], "-c")==0) { + CHECK_RESULTS = 1; + } else if (strcmp(argv[0], "-V")==0) { + argc--; argv++; + SRC_VERSION = atoi(argv[0]); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} diff --git a/src/tests/upgrade-test-3.c b/src/tests/upgrade-test-3.c new file mode 100644 index 00000000000..2b6743e6d7d --- /dev/null +++ b/src/tests/upgrade-test-3.c @@ -0,0 +1,187 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved." +#ident "$Id: loader-stress-test.c 20470 2010-05-20 18:30:04Z bkuszmaul $" + +#include "test.h" +#include "toku_pthread.h" +#include "toku_atomic.h" +#include +#include +#include "ydb-internal.h" + +#include "test_kv_gen.h" + +/* + */ + +DB_ENV *env; +enum {MAX_NAME=128}; +int NUM_DBS=5; +int NUM_ROWS=100000; +int CHECK_RESULTS=0; +enum { old_default_cachesize=1024 }; // MB +int CACHESIZE=old_default_cachesize; + +char *db_v3_dir = "../../utils/dir.preload-3.1-db.c.tdb"; +char *db_v4_dir = "dir.preload-3.1-db.c.tdb"; +char *env_dir = ENVDIR; // the default env_dir. 
+ +int SRC_VERSION = 4; + +static void upgrade_test_3(DB **dbs) { + int r; + // open the DBS + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + // insert some rows + printf("ToDo : insert rows\n"); + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } + // open + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + + // read and verify all rows + { + if ( verbose ) {printf("checking");fflush(stdout);} + check_results(env, dbs, NUM_DBS, NUM_ROWS); + if ( verbose) {printf("\ndone\n");fflush(stdout);} + } + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } +} + +static void run_test(void) +{ + int r; + + char *src_db_dir; + if ( SRC_VERSION == 3 ) + src_db_dir = db_v3_dir; + else if ( SRC_VERSION == 4 ) + src_db_dir = db_v4_dir; + else { + fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); + assert(0); + } + + { + int len = 256; + char syscmd[len]; + r = snprintf(syscmd, len, "rm -rf %s", env_dir); + assert(ropen(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + r = env->checkpointing_set_period(env, 60); CKERR(r); + + DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); + assert(dbs != NULL); + + // -------------------------- + upgrade_test_3(dbs); + // -------------------------- + + if (verbose >= 2) + print_engine_status(env); + r = env->close(env, 0); CKERR(r); + toku_free(dbs); + +} 
+ +// ------------ infrastructure ---------- +static void do_args(int argc, char * const argv[]); + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(); + return 0; +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + + while (argc>0) { + if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: -h -c -d -r %s\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-d")==0) { + argc--; argv++; + NUM_DBS = atoi(argv[0]); + if ( NUM_DBS > MAX_DBS ) { + fprintf(stderr, "max value for -d field is %d\n", MAX_DBS); + resultcode=1; + goto do_usage; + } + } else if (strcmp(argv[0], "-r")==0) { + argc--; argv++; + NUM_ROWS = atoi(argv[0]); + } else if (strcmp(argv[0], "-c")==0) { + CHECK_RESULTS = 1; + } else if (strcmp(argv[0], "-V")==0) { + argc--; argv++; + SRC_VERSION = atoi(argv[0]); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} diff --git a/src/tests/upgrade-test-4.c b/src/tests/upgrade-test-4.c new file mode 100644 index 00000000000..492aff60d8e --- /dev/null +++ b/src/tests/upgrade-test-4.c @@ -0,0 +1,222 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved." 
+#ident "$Id: loader-stress-test.c 20470 2010-05-20 18:30:04Z bkuszmaul $" + +#include "test.h" +#include "toku_pthread.h" +#include "toku_atomic.h" +#include +#include +#include "ydb-internal.h" + +#include "test_kv_gen.h" + +/* + */ + +DB_ENV *env; +enum {MAX_NAME=128}; +int NUM_DBS=5; +int NUM_ROWS=100000; +int CHECK_RESULTS=0; +enum { old_default_cachesize=1024 }; // MB +int CACHESIZE=old_default_cachesize; +enum {ROWS_PER_TRANSACTION=10000}; + +char *db_v3_dir = "../../utils/dir.preload-3.1-db.c.tdb"; +char *db_v4_dir = "dir.preload-3.1-db.c.tdb"; +char *env_dir = ENVDIR; // the default env_dir. + +int SRC_VERSION = 4; + +static void upgrade_test_4(DB **dbs) { + int r; + // open the DBS + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + // append some rows + DB_TXN *txn; + DBT skey, sval; + DBT key, val; + dbt_init_realloc(&key); + dbt_init_realloc(&val); + + unsigned int k, v; + if ( verbose ) { printf("appending");fflush(stdout); } + int outer_loop_num = ( NUM_ROWS <= ROWS_PER_TRANSACTION ) ? 
1 : (NUM_ROWS / ROWS_PER_TRANSACTION); + for(int x=0;xtxn_begin(env, NULL, &txn, 0); CKERR(r); + for(int i=1;i<=ROWS_PER_TRANSACTION;i++) { + k = i + (x*ROWS_PER_TRANSACTION) + NUM_ROWS; + v = generate_val(k, 0); + dbt_init(&skey, &k, sizeof(unsigned int)); + dbt_init(&sval, &v, sizeof(unsigned int)); + + for(int db = 0;db < NUM_DBS;db++) { + put_multiple_generate(dbs[db], // dest_db + NULL, // src_db, ignored + &key, &val, // + &skey, &sval, // src_key, src_val + NULL); // extra, ignored + + r = dbs[db]->put(dbs[db], txn, &key, &val, 0); CKERR(r); + if (key.flags == 0) { dbt_init_realloc(&key); } + if (val.flags == 0) { dbt_init_realloc(&val); } + } + } + r = txn->commit(txn, 0); CKERR(r); + if ( verbose ) {printf(".");fflush(stdout);} + } + if ( key.flags ) { toku_free(key.data); key.data = NULL; } + if ( val.flags ) { toku_free(val.data); key.data = NULL; } + + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } + // open + { + DBT desc; + dbt_init(&desc, "foo", sizeof("foo")); + char name[MAX_NAME*2]; + + int idx[MAX_DBS]; + for(int i=0;iset_descriptor(dbs[i], 1, &desc); CKERR(r); + dbs[i]->app_private = &idx[i]; + snprintf(name, sizeof(name), "db_%04x", i); + r = dbs[i]->open(dbs[i], NULL, name, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); + } + } + + // read and verify all rows + { + if ( verbose ) {printf("\nchecking");fflush(stdout);} + check_results(env, dbs, NUM_DBS, NUM_ROWS * 2); + if ( verbose) {printf("\ndone\n");fflush(stdout);} + } + // close + { + for(int i=0;iclose(dbs[i], 0); CKERR(r); + dbs[i] = NULL; + } + } +} + +static void run_test(void) +{ + int r; + + char *src_db_dir; + if ( SRC_VERSION == 3 ) + src_db_dir = db_v3_dir; + else if ( SRC_VERSION == 4 ) + src_db_dir = db_v4_dir; + else { + fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION); + assert(0); + } + + { + int len = 256; + char syscmd[len]; + r = snprintf(syscmd, len, "rm -rf %s", env_dir); + assert(ropen(env, env_dir, envflags, 
S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); + env->set_errfile(env, stderr); + r = env->checkpointing_set_period(env, 60); CKERR(r); + + DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS); + assert(dbs != NULL); + + // -------------------------- + upgrade_test_4(dbs); + // -------------------------- + + if (verbose >= 2) + print_engine_status(env); + r = env->close(env, 0); CKERR(r); + toku_free(dbs); + +} + +// ------------ infrastructure ---------- +static void do_args(int argc, char * const argv[]); + +int test_main(int argc, char * const *argv) { + do_args(argc, argv); + run_test(); + return 0; +} + +static void do_args(int argc, char * const argv[]) { + int resultcode; + char *cmd = argv[0]; + argc--; argv++; + + while (argc>0) { + if (strcmp(argv[0], "-v")==0) { + verbose++; + } else if (strcmp(argv[0],"-q")==0) { + verbose--; + if (verbose<0) verbose=0; + } else if (strcmp(argv[0], "-h")==0) { + resultcode=0; + do_usage: + fprintf(stderr, "Usage: -h -c -d -r %s\n", cmd); + exit(resultcode); + } else if (strcmp(argv[0], "-d")==0) { + argc--; argv++; + NUM_DBS = atoi(argv[0]); + if ( NUM_DBS > MAX_DBS ) { + fprintf(stderr, "max value for -d field is %d\n", MAX_DBS); + resultcode=1; + goto do_usage; + } + } else if (strcmp(argv[0], "-r")==0) { + argc--; argv++; + NUM_ROWS = atoi(argv[0]); + } else if (strcmp(argv[0], "-c")==0) { + CHECK_RESULTS = 1; + } else if (strcmp(argv[0], "-V")==0) { + argc--; argv++; + SRC_VERSION = atoi(argv[0]); + } else { + fprintf(stderr, "Unknown arg: %s\n", argv[0]); + resultcode=1; + goto do_usage; + } + argc--; + argv++; + } +} diff --git a/src/ydb.c b/src/ydb.c index 30d1dce0c20..86d669dc09e 100644 --- a/src/ydb.c +++ b/src/ydb.c @@ -569,9 +569,9 @@ static const char * orig_env_ver_key = "original_version"; // requires: persistent environment dictionary is already open static int -upgrade_env(DB_ENV * env, DB_TXN * txn) { +maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn) { int r; - uint64_t 
stored_env_version; + uint32_t stored_env_version; DBT key, val; toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key)); @@ -579,8 +579,18 @@ upgrade_env(DB_ENV * env, DB_TXN * txn) { r = toku_db_get(env->i->persistent_environment, txn, &key, &val, 0); assert(r == 0); stored_env_version = toku_dtoh32(*(uint32_t*)val.data); - if (stored_env_version != BRT_LAYOUT_VERSION) + if (stored_env_version > BRT_LAYOUT_VERSION) r = TOKUDB_DICTIONARY_TOO_NEW; + else if (stored_env_version < BRT_LAYOUT_MIN_SUPPORTED_VERSION) + r = TOKUDB_DICTIONARY_TOO_OLD; + else if (stored_env_version < BRT_LAYOUT_VERSION) { + const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION); + toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key)); + toku_fill_dbt(&val, &environment_version, sizeof(environment_version)); + r = toku_db_put(env->i->persistent_environment, txn, &key, &val, DB_YESOVERWRITE); + assert(r==0); + } + // TODO: add key/val for timestamp of VERSION_12_CREATION (could be upgrade) return r; } @@ -640,7 +650,7 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) { r = 0; // both rollback cachefile and persistent env are missing } else { - r = toku_ydb_do_error(env, errno, "Unable to access rollback cachefile\n"); + r = toku_ydb_do_error(env, stat_errno, "Unable to access rollback cachefile\n"); assert(r); } } @@ -663,7 +673,7 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) { r = 0; // both fileops directory and persistent env are missing } else { - r = toku_ydb_do_error(env, errno, "Unable to access fileops directory\n"); + r = toku_ydb_do_error(env, stat_errno, "Unable to access fileops directory\n"); assert(r); } } @@ -687,6 +697,18 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) { return r; } +static int +ydb_maybe_upgrade_env (DB_ENV *env) { + int r = 0; + if (env->i->open_flags & DB_INIT_TXN && env->i->open_flags & DB_INIT_LOG) { + toku_ydb_unlock(); + r 
= toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir); + toku_ydb_lock(); + } + return r; +} + + // Open the environment. @@ -767,6 +789,9 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { need_rollback_cachefile = TRUE; } + r = ydb_maybe_upgrade_env(env); + if (r!=0) return r; + r = validate_env(env, &newenv, need_rollback_cachefile); // make sure that environment is either new or complete if (r != 0) return r; @@ -848,11 +873,11 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { r = db_use_builtin_val_cmp(env->i->persistent_environment); assert(r==0); r = db_open_iname(env->i->persistent_environment, txn, environmentdictionary, DB_CREATE, mode); + assert(r==0); if (newenv) { // create new persistent_environment DBT key, val; const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION); - assert(r==0); toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key)); toku_fill_dbt(&val, &environment_version, sizeof(environment_version)); r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0); @@ -863,8 +888,8 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { assert(r==0); } else { + r = maybe_upgrade_persistent_environment_dictionary(env, txn); assert(r==0); - r = upgrade_env(env, txn); } } { @@ -1664,11 +1689,13 @@ env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat) { engstat->logsuppressfail = logsuppressfail; } { - // dummy values until upgrade logic is complete and counters are available - engstat->upgrade_env_status = 0; - engstat->upgrade_header = 0; - engstat->upgrade_nonleaf = 0; - engstat->upgrade_leaf = 0; + BRT_UPGRADE_STATUS_S brt_upgrade_stat; + toku_brt_get_upgrade_status(&brt_upgrade_stat); + + engstat->upgrade_env_status = toku_log_upgrade_get_footprint(); + engstat->upgrade_header = brt_upgrade_stat.header; + engstat->upgrade_nonleaf = brt_upgrade_stat.nonleaf; + engstat->upgrade_leaf = brt_upgrade_stat.leaf; } } return r; @@ 
-5012,13 +5039,13 @@ toku_db_set_dup_compare(DB *db, int (*dup_compare)(DB *, const DBT *, const DBT return r; } -static int toku_db_set_descriptor(DB *db, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) { +static int toku_db_set_descriptor(DB *db, u_int32_t version, const DBT* descriptor) { HANDLE_PANICKED_DB(db); int r; if (db_opened(db)) return EINVAL; else if (!descriptor) r = EINVAL; else if (descriptor->size>0 && !descriptor->data) r = EINVAL; - else r = toku_brt_set_descriptor(db->i->brt, version, descriptor, dbt_userformat_upgrade); + else r = toku_brt_set_descriptor(db->i->brt, version, descriptor); return r; } @@ -5410,9 +5437,9 @@ static int locked_db_set_dup_compare(DB * db, int (*dup_compare) (DB *, const DB toku_ydb_lock(); int r = toku_db_set_dup_compare(db, dup_compare); toku_ydb_unlock(); return r; } -static int locked_db_set_descriptor(DB *db, u_int32_t version, const DBT* descriptor, toku_dbt_upgradef dbt_userformat_upgrade) { +static int locked_db_set_descriptor(DB *db, u_int32_t version, const DBT* descriptor) { toku_ydb_lock(); - int r = toku_db_set_descriptor(db, version, descriptor, dbt_userformat_upgrade); + int r = toku_db_set_descriptor(db, version, descriptor); toku_ydb_unlock(); return r; } diff --git a/toku_include/toku_os.h b/toku_include/toku_os.h index 0e85ddc100c..0ac5f6cf842 100644 --- a/toku_include/toku_os.h +++ b/toku_include/toku_os.h @@ -81,6 +81,8 @@ void toku_fs_get_write_info(time_t *enospc_last_time, uint64_t *enospc_current, int toku_fsync_dirfd_without_accounting(DIR *dirp); +int toku_fsync_dir_by_name_without_accounting(const char *dir_name); + // Get the file system free and total space for the file system that contains a path name // *avail_size is set to the bytes of free space in the file system available for non-root // *free_size is set to the bytes of free space in the file system diff --git a/windows/dirs.c b/windows/dirs.c index b7cf8b458e1..059c3828710 100644 --- 
a/windows/dirs.c +++ b/windows/dirs.c @@ -140,7 +140,7 @@ toku_fstat(int fd, toku_struct_stat *statbuf) { int toku_fsync_dirfd_without_accounting(DIR *dirp) { - //Not supported in windows. + //Believed to not be supported in windows. //Possibly not needed return 0; } @@ -149,3 +149,11 @@ int toku_fsync_directory(const char *UU(fname)) { return 0; // toku_fsync_dirfd } + +int +toku_fsync_dir_by_name_without_accounting(const char *dir_name) { + //Believed to not be supported in windows. + //Possibly not needed + return 0; +} +