diff --git a/buildheader/db.h_4_1 b/buildheader/db.h_4_1 index 7a8f560ead1..9900a95e455 100644 --- a/buildheader/db.h_4_1 +++ b/buildheader/db.h_4_1 @@ -324,7 +324,6 @@ struct __toku_db { DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); @@ -335,7 +334,7 @@ struct __toku_db { int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[11]; + void* __toku_dummy0[12]; char __toku_dummy1[96]; void *api_internal; /* 32-bit offset=236 size=4, 64=bit offset=376 size=8 */ void* __toku_dummy2[5]; diff --git a/buildheader/db.h_4_3 b/buildheader/db.h_4_3 index 453d907c533..2f4d2987f4a 100644 --- a/buildheader/db.h_4_3 +++ b/buildheader/db.h_4_3 @@ -333,7 +333,6 @@ struct __toku_db { DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. 
Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); @@ -344,7 +343,7 @@ struct __toku_db { int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[14]; + void* __toku_dummy0[15]; char __toku_dummy1[96]; void *api_internal; /* 32-bit offset=248 size=4, 64=bit offset=400 size=8 */ void* __toku_dummy2[5]; diff --git a/buildheader/db.h_4_4 b/buildheader/db.h_4_4 index 990097812c3..97825182943 100644 --- a/buildheader/db.h_4_4 +++ b/buildheader/db.h_4_4 @@ -9,478 +9,3 @@ #if defined(__cplusplus) extern "C" { #endif -#define TOKUDB 1 -#define TOKUDB_NATIVE_H 0 -#define DB_VERSION_MAJOR 4 -#define DB_VERSION_MINOR 4 -#define DB_VERSION_PATCH 20 -#ifndef _TOKUDB_WRAP_H -#define DB_VERSION_STRING "Tokutek: TokuDB 4.4.20" -#else -#define DB_VERSION_STRING_ydb "Tokutek: TokuDB (wrapped bdb)" -#endif -#ifndef TOKU_OFF_T_DEFINED -#define TOKU_OFF_T_DEFINED -typedef int64_t toku_off_t; -#endif -#define DB_GID_SIZE 128 -typedef struct __toku_db_env DB_ENV; -typedef struct __toku_db_key_range DB_KEY_RANGE; -typedef struct __toku_db_lsn DB_LSN; -typedef struct __toku_db DB; -typedef struct __toku_db_txn DB_TXN; -typedef struct __toku_db_txn_active DB_TXN_ACTIVE; -typedef struct __toku_db_txn_stat 
DB_TXN_STAT; -typedef struct __toku_dbc DBC; -typedef struct __toku_dbt DBT; -typedef struct __toku_db_preplist { DB_TXN *txn; uint8_t gid[DB_GID_SIZE]; } DB_PREPLIST; -typedef u_int32_t db_recno_t; -typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); -#include -#ifndef __BIGGEST_ALIGNMENT__ - #define __BIGGEST_ALIGNMENT__ 16 -#endif -typedef struct __toku_db_btree_stat64 { - u_int64_t bt_nkeys; /* how many unique keys (guaranteed only to be an estimate, even when flattened) */ - u_int64_t bt_ndata; /* how many key-value pairs (an estimate, but exact when flattened) */ - u_int64_t bt_dsize; /* how big are the keys+values (not counting the lengths) (an estimate, unless flattened) */ - u_int64_t bt_fsize; /* how big is the underlying file */ - u_int64_t bt_create_time_sec; /* Creation time, in seconds */ - u_int64_t bt_modify_time_sec; /* Time of last serialization, in seconds */ - u_int64_t bt_verify_time_sec; /* Time of last verification, in seconds */ -} DB_BTREE_STAT64; -typedef struct __toku_loader DB_LOADER; -struct __toku_loader_internal; -struct __toku_loader { - struct __toku_loader_internal *i; - int (*set_error_callback)(DB_LOADER *loader, void (*error_cb)(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra), void *error_extra); /* set the error callback */ - int (*set_poll_function)(DB_LOADER *loader, int (*poll_func)(void *extra, float progress), void *poll_extra); /* set the polling function */ - int (*put)(DB_LOADER *loader, DBT *key, DBT* val); /* give a row to the loader */ - int (*close)(DB_LOADER *loader); /* finish loading, free memory */ - int (*abort)(DB_LOADER *loader); /* abort loading, free memory */ -}; -typedef struct __toku_indexer DB_INDEXER; -struct __toku_indexer_internal; -struct __toku_indexer { - struct __toku_indexer_internal *i; - int (*set_error_callback)(DB_INDEXER *indexer, void (*error_cb)(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra), void *error_extra); /* set the error callback */ - 
int (*set_poll_function)(DB_INDEXER *indexer, int (*poll_func)(void *extra, float progress), void *poll_extra); /* set the polling function */ - int (*build)(DB_INDEXER *indexer); /* build the indexes */ - int (*close)(DB_INDEXER *indexer); /* finish indexing, free memory */ - int (*abort)(DB_INDEXER *indexer); /* abort indexing, free memory */ -}; -typedef enum { - FS_GREEN = 0, // green zone (we have lots of space) - FS_YELLOW = 1, // yellow zone (issue warning but allow operations) - FS_RED = 2, // red zone (prevent insert operations) - FS_BLOCKED = 3 // For reporting engine status, completely blocked -} fs_redzone_state; -typedef enum { - FS_STATE = 0, // interpret as file system state (redzone) enum - UINT64, // interpret as uint64_t - CHARSTR, // interpret as char * - UNIXTIME, // interpret as time_t - TOKUTIME // interpret as tokutime_t -} toku_engine_status_display_type; -typedef struct __toku_engine_status_row { - char * keyname; // info schema key, should not change across revisions without good reason - char * legend; // the text that will appear at user interface - toku_engine_status_display_type type; // how to interpret the value - union { - uint64_t num; - char * str; - } value; -} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S; -typedef enum { - DB_BTREE=1, - DB_UNKNOWN=5 -} DBTYPE; -#ifndef _TOKUDB_WRAP_H -#define DB_VERB_DEADLOCK 1 -#define DB_VERB_RECOVERY 2 -#define DB_VERB_REPLICATION 8 -#define DB_VERB_WAITSFOR 16 -#define DB_ARCH_ABS 1 -#define DB_ARCH_LOG 4 -#define DB_CREATE 1 -#define DB_CXX_NO_EXCEPTIONS 1 -#define DB_EXCL 8192 -#define DB_PRIVATE 1048576 -#define DB_RDONLY 16 -#define DB_RECOVER 32 -#define DB_RUNRECOVERY -30974 -#define DB_THREAD 64 -#define DB_TXN_NOSYNC 256 -#define DB_LOCK_DEFAULT 1 -#define DB_LOCK_OLDEST 7 -#define DB_LOCK_RANDOM 8 -#define DB_KEYFIRST 15 -#define DB_KEYLAST 16 -#define DB_NOOVERWRITE 22 -#define DB_NODUPDATA 21 -#define DB_NOOVERWRITE_NO_ERROR 1 -#define DB_OPFLAGS_MASK 255 -#define 
DB_AUTO_COMMIT 16777216 -#define DB_INIT_LOCK 16384 -#define DB_INIT_LOG 32768 -#define DB_INIT_MPOOL 65536 -#define DB_INIT_TXN 262144 -#define DB_KEYEXIST -30996 -#define DB_LOCK_DEADLOCK -30995 -#define DB_LOCK_NOTGRANTED -30994 -#define DB_NOTFOUND -30989 -#define DB_SECONDARY_BAD -30973 -#define DB_DONOTINDEX -30998 -#define DB_BUFFER_SMALL -30999 -#define DB_BADFORMAT -30500 -#define DB_DELETE_ANY 65536 -#define DB_TRUNCATE_WITHCURSORS 131072 -#define DB_FIRST 9 -#define DB_LAST 17 -#define DB_CURRENT 7 -#define DB_NEXT 18 -#define DB_NEXT_NODUP 20 -#define DB_PREV 25 -#define DB_PREV_NODUP 26 -#define DB_SET 28 -#define DB_SET_RANGE 30 -#define DB_CURRENT_BINDING 253 -#define DB_SET_RANGE_REVERSE 252 -#define DB_RMW 536870912 -#define DB_IS_RESETTING_OP 0x01000000 -#define DB_PRELOCKED 0x00800000 -#define DB_PRELOCKED_WRITE 0x00400000 -#define DB_IS_HOT_INDEX 0x00100000 -#define DBC_DISABLE_PREFETCHING 0x20000000 -#define DB_DBT_APPMALLOC 1 -#define DB_DBT_DUPOK 64 -#define DB_DBT_MALLOC 4 -#define DB_DBT_REALLOC 16 -#define DB_DBT_USERMEM 32 -#define DB_LOG_AUTOREMOVE 262144 -#define DB_TXN_WRITE_NOSYNC 1024 -#define DB_TXN_NOWAIT 8192 -#define DB_TXN_SYNC 16384 -#define DB_READ_UNCOMMITTED 67108864 -#define DB_READ_COMMITTED 33554432 -#define DB_TXN_SNAPSHOT 1 -#define DB_INHERIT_ISOLATION 2 -#define DB_SERIALIZABLE 4 -#endif -/* TOKUDB specific error codes */ -#define TOKUDB_OUT_OF_LOCKS -100000 -#define TOKUDB_SUCCEEDED_EARLY -100001 -#define TOKUDB_FOUND_BUT_REJECTED -100002 -#define TOKUDB_USER_CALLBACK_ERROR -100003 -#define TOKUDB_DICTIONARY_TOO_OLD -100004 -#define TOKUDB_DICTIONARY_TOO_NEW -100005 -#define TOKUDB_DICTIONARY_NO_HEADER -100006 -#define TOKUDB_CANCELED -100007 -#define TOKUDB_NO_DATA -100008 -#define TOKUDB_ACCEPT -100009 -#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 -#define TOKUDB_UPGRADE_FAILURE -100011 -#define TOKUDB_TRY_AGAIN -100012 -#define TOKUDB_NEEDS_REPAIR -100013 -#define TOKUDB_CURSOR_CONTINUE -100014 -/* LOADER flags 
*/ -#define LOADER_USE_PUTS 1 -typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val); -typedef int (*generate_row_for_del_func)(DB *dest_db, DB *src_db, DBT *dest_key, const DBT *src_key, const DBT *src_val); -/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ -#ifdef _TOKUDB_WRAP_H -#undef txn_begin -#endif -struct __toku_db_env { - struct __toku_db_env_internal *i; -#define db_env_struct_i(x) ((x)->i) - int (*checkpointing_set_period) (DB_ENV*, u_int32_t) /* Change the delay between automatic checkpoints. 0 means disabled. */; - int (*checkpointing_get_period) (DB_ENV*, u_int32_t*) /* Retrieve the delay between automatic checkpoints. 0 means disabled. */; - int (*cleaner_set_period) (DB_ENV*, u_int32_t) /* Change the delay between automatic cleaner attempts. 0 means disabled. */; - int (*cleaner_get_period) (DB_ENV*, u_int32_t*) /* Retrieve the delay between automatic cleaner attempts. 0 means disabled. */; - int (*cleaner_set_iterations) (DB_ENV*, u_int32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. */; - int (*cleaner_get_iterations) (DB_ENV*, u_int32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */; - int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */; - int (*checkpointing_resume) (DB_ENV*) /* Alert tokudb 'postpone' is no longer necessary */; - int (*checkpointing_begin_atomic_operation) (DB_ENV*) /* Begin a set of operations (that must be atomic as far as checkpoints are concerned). i.e. inserting into every index in one table */; - int (*checkpointing_end_atomic_operation) (DB_ENV*) /* End a set of operations (that must be atomic as far as checkpoints are concerned). 
*/; - void *app_private; /* 32-bit offset=44 size=4, 64=bit offset=88 size=8 */ - int (*set_default_bt_compare) (DB_ENV*,int (*bt_compare) (DB *, const DBT *, const DBT *)) /* Set default (key) comparison function for all DBs in this environment. Required for RECOVERY since you cannot open the DBs manually. */; - int (*get_engine_status_num_rows) (DB_ENV*, uint64_t*) /* return number of rows in engine status */; - int (*get_engine_status) (DB_ENV*, TOKU_ENGINE_STATUS_ROW, uint64_t, fs_redzone_state*, uint64_t*, char*, int) /* Fill in status struct and redzone state, possibly env panic string */; - int (*get_engine_status_text) (DB_ENV*, char*, int) /* Fill in status text */; - int (*crash) (DB_ENV*, const char*/*expr_as_string*/,const char */*fun*/,const char*/*file*/,int/*line*/, int/*errno*/);; - int (*get_iname) (DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) /* FOR TEST ONLY: lookup existing iname */; - int (*create_loader) (DB_ENV *env, DB_TXN *txn, DB_LOADER **blp, DB *src_db, int N, DB *dbs[/*N*/], uint32_t db_flags[/*N*/], uint32_t dbt_flags[/*N*/], uint32_t loader_flags); - int (*create_indexer) (DB_ENV *env, DB_TXN *txn, DB_INDEXER **idxrp, DB *src_db, int N, DB *dbs[/*N*/], uint32_t db_flags[/*N*/], uint32_t indexer_flags); - int (*put_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array) /* insert into multiple DBs */; - int (*set_generate_row_callback_for_put) (DB_ENV *env, generate_row_for_put_func generate_row_for_put); - int (*del_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array) /* delete from multiple DBs */; - int (*set_generate_row_callback_for_del) (DB_ENV *env, generate_row_for_del_func generate_row_for_del); - int (*update_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - DBT *old_src_key, DBT *old_src_data, - DBT 
*new_src_key, DBT *new_src_data, - uint32_t num_dbs, DB **db_array, uint32_t *flags_array, - uint32_t num_keys, DBT *keys, - uint32_t num_vals, DBT *vals) /* update multiple DBs */; - int (*get_redzone) (DB_ENV *env, int *redzone) /* get the redzone limit */; - int (*set_redzone) (DB_ENV *env, int redzone) /* set the redzone limit in percent of total space */; - int (*set_lk_max_memory) (DB_ENV *env, uint64_t max); - int (*get_lk_max_memory) (DB_ENV *env, uint64_t *max); - void (*set_update) (DB_ENV *env, int (*update_function)(DB *, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra)); - int (*set_lock_timeout) (DB_ENV *env, uint64_t lock_wait_time_msec); - int (*get_lock_timeout) (DB_ENV *env, uint64_t *lock_wait_time_msec); - void* __toku_dummy0[20]; - char __toku_dummy1[128]; - void *api1_internal; /* 32-bit offset=336 size=4, 64=bit offset=544 size=8 */ - void* __toku_dummy2[7]; - int (*close) (DB_ENV *, u_int32_t); /* 32-bit offset=368 size=4, 64=bit offset=608 size=8 */ - int (*dbremove) (DB_ENV *, DB_TXN *, const char *, const char *, u_int32_t); /* 32-bit offset=372 size=4, 64=bit offset=616 size=8 */ - int (*dbrename) (DB_ENV *, DB_TXN *, const char *, const char *, const char *, u_int32_t); /* 32-bit offset=376 size=4, 64=bit offset=624 size=8 */ - void (*err) (const DB_ENV *, int, const char *, ...); /* 32-bit offset=380 size=4, 64=bit offset=632 size=8 */ - void* __toku_dummy3[3]; - int (*get_cachesize) (DB_ENV *, u_int32_t *, u_int32_t *, int *); /* 32-bit offset=396 size=4, 64=bit offset=664 size=8 */ - void* __toku_dummy4[4]; - int (*get_flags) (DB_ENV *, u_int32_t *); /* 32-bit offset=416 size=4, 64=bit offset=704 size=8 */ - void* __toku_dummy5[4]; - int (*get_lg_max) (DB_ENV *, u_int32_t*); /* 32-bit offset=436 size=4, 64=bit offset=744 size=8 */ - void* __toku_dummy6[4]; - int (*get_lk_max_locks) (DB_ENV *, u_int32_t *); /* 32-bit offset=456 size=4, 64=bit offset=784 size=8 
*/ - void* __toku_dummy7[22]; - int (*log_archive) (DB_ENV *, char **[], u_int32_t); /* 32-bit offset=548 size=4, 64=bit offset=968 size=8 */ - void* __toku_dummy8[2]; - int (*log_flush) (DB_ENV *, const DB_LSN *); /* 32-bit offset=560 size=4, 64=bit offset=992 size=8 */ - void* __toku_dummy9[25]; - int (*open) (DB_ENV *, const char *, u_int32_t, int); /* 32-bit offset=664 size=4, 64=bit offset=1200 size=8 */ - void* __toku_dummy10[12]; - int (*set_cachesize) (DB_ENV *, u_int32_t, u_int32_t, int); /* 32-bit offset=716 size=4, 64=bit offset=1304 size=8 */ - int (*set_data_dir) (DB_ENV *, const char *); /* 32-bit offset=720 size=4, 64=bit offset=1312 size=8 */ - void* __toku_dummy11[1]; - void (*set_errcall) (DB_ENV *, void (*)(const DB_ENV *, const char *, const char *)); /* 32-bit offset=728 size=4, 64=bit offset=1328 size=8 */ - void (*set_errfile) (DB_ENV *, FILE*); /* 32-bit offset=732 size=4, 64=bit offset=1336 size=8 */ - void (*set_errpfx) (DB_ENV *, const char *); /* 32-bit offset=736 size=4, 64=bit offset=1344 size=8 */ - void* __toku_dummy12[1]; - int (*set_flags) (DB_ENV *, u_int32_t, int); /* 32-bit offset=744 size=4, 64=bit offset=1360 size=8 */ - void* __toku_dummy13[2]; - int (*set_lg_bsize) (DB_ENV *, u_int32_t); /* 32-bit offset=756 size=4, 64=bit offset=1384 size=8 */ - int (*set_lg_dir) (DB_ENV *, const char *); /* 32-bit offset=760 size=4, 64=bit offset=1392 size=8 */ - void* __toku_dummy14[1]; - int (*set_lg_max) (DB_ENV *, u_int32_t); /* 32-bit offset=768 size=4, 64=bit offset=1408 size=8 */ - void* __toku_dummy15[2]; - int (*set_lk_detect) (DB_ENV *, u_int32_t); /* 32-bit offset=780 size=4, 64=bit offset=1432 size=8 */ - int (*set_lk_max) (DB_ENV *, u_int32_t); /* 32-bit offset=784 size=4, 64=bit offset=1440 size=8 */ - void* __toku_dummy16[1]; - int (*set_lk_max_locks) (DB_ENV *, u_int32_t); /* 32-bit offset=792 size=4, 64=bit offset=1456 size=8 */ - void* __toku_dummy17[16]; - int (*set_tmp_dir) (DB_ENV *, const char *); /* 32-bit offset=860 
size=4, 64=bit offset=1592 size=8 */ - void* __toku_dummy18[2]; - int (*set_verbose) (DB_ENV *, u_int32_t, int); /* 32-bit offset=872 size=4, 64=bit offset=1616 size=8 */ - void* __toku_dummy19[1]; - int (*txn_begin) (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t); /* 32-bit offset=880 size=4, 64=bit offset=1632 size=8 */ - int (*txn_checkpoint) (DB_ENV *, u_int32_t, u_int32_t, u_int32_t); /* 32-bit offset=884 size=4, 64=bit offset=1640 size=8 */ - int (*txn_recover) (DB_ENV *, DB_PREPLIST preplist[/*count*/], long count, /*out*/ long *retp, u_int32_t flags); /* 32-bit offset=888 size=4, 64=bit offset=1648 size=8 */ - int (*txn_stat) (DB_ENV *, DB_TXN_STAT **, u_int32_t); /* 32-bit offset=892 size=4, 64=bit offset=1656 size=8 */ - void* __toku_dummy20[2]; /* Padding at the end */ - char __toku_dummy21[16]; /* Padding at the end */ -}; -struct __toku_db_key_range { - double less; /* 32-bit offset=0 size=8, 64=bit offset=0 size=8 */ - double equal; /* 32-bit offset=8 size=8, 64=bit offset=8 size=8 */ - double greater; /* 32-bit offset=16 size=8, 64=bit offset=16 size=8 */ - void* __toku_dummy0[194]; /* Padding at the end */ - char __toku_dummy1[120]; /* Padding at the end */ -}; -struct __toku_db_lsn { - char __toku_dummy0[8]; /* Padding at the end */ -}; -struct __toku_dbt { - void*data; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - u_int32_t size; /* 32-bit offset=4 size=4, 64=bit offset=8 size=4 */ - u_int32_t ulen; /* 32-bit offset=8 size=4, 64=bit offset=12 size=4 */ - char __toku_dummy0[8]; - u_int32_t flags; /* 32-bit offset=20 size=4, 64=bit offset=24 size=4 */ - /* 4 more bytes of alignment in the 64-bit case. 
*/ -}; -typedef struct __toku_descriptor { - DBT dbt; -} *DESCRIPTOR, DESCRIPTOR_S; -//One header is included in 'data' -//One header is included in 'additional for checkpoint' -typedef struct __toku_db_fragmentation { - uint64_t file_size_bytes; //Total file size in bytes - uint64_t data_bytes; //Compressed User Data in bytes - uint64_t data_blocks; //Number of blocks of compressed User Data - uint64_t checkpoint_bytes_additional; //Additional bytes used for checkpoint system - uint64_t checkpoint_blocks_additional; //Additional blocks used for checkpoint system - uint64_t unused_bytes; //Unused space in file - uint64_t unused_blocks; //Number of contiguous regions of unused space - uint64_t largest_unused_block; //Size of largest contiguous unused space -} *TOKU_DB_FRAGMENTATION, TOKU_DB_FRAGMENTATION_S; -struct __toku_db { - struct __toku_db_internal *i; -#define db_struct_i(x) ((x)->i) - int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); - int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *); - int (*pre_acquire_table_lock)(DB*, DB_TXN*); - void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */ - DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */ - int (*pre_acquire_fileops_lock)(DB*, DB_TXN*); - const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; - const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; - void (*get_max_row_size) (DB*, u_int32_t *max_key_size, u_int32_t *max_row_size); - DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. 
Available only while db is open */; - int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; - int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; - int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); - int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); - int (*get_readpagesize)(DB*,u_int32_t*); - int (*set_readpagesize)(DB*,u_int32_t); - int (*set_indexer)(DB*, DB_INDEXER*); - void (*get_indexer)(DB*, DB_INDEXER**); - int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); - int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); - int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[16]; - char __toku_dummy1[96]; - void *api_internal; /* 32-bit offset=256 size=4, 64=bit offset=416 size=8 */ - void* __toku_dummy2[5]; - int (*close) (DB*, u_int32_t); /* 32-bit offset=280 size=4, 64=bit offset=464 size=8 */ - void* __toku_dummy3[1]; - int (*cursor) (DB *, DB_TXN *, DBC **, u_int32_t); /* 32-bit offset=288 size=4, 64=bit offset=480 size=8 */ - int (*del) (DB *, DB_TXN *, DBT *, u_int32_t); /* 32-bit offset=292 size=4, 64=bit offset=488 size=8 */ - void* __toku_dummy4[2]; - int (*fd) (DB *, int *); /* 32-bit offset=304 size=4, 64=bit offset=512 size=8 */ - int (*get) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); /* 32-bit offset=308 size=4, 64=bit offset=520 size=8 */ - void* __toku_dummy5[8]; - int (*get_flags) (DB *, u_int32_t *); /* 32-bit offset=344 size=4, 64=bit offset=592 size=8 */ - void* __toku_dummy6[6]; - int (*get_pagesize) (DB *, u_int32_t *); /* 32-bit offset=372 size=4, 
64=bit offset=648 size=8 */ - void* __toku_dummy7[8]; - int (*key_range) (DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t); /* 32-bit offset=408 size=4, 64=bit offset=720 size=8 */ - int (*open) (DB *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int); /* 32-bit offset=412 size=4, 64=bit offset=728 size=8 */ - void* __toku_dummy8[1]; - int (*put) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); /* 32-bit offset=420 size=4, 64=bit offset=744 size=8 */ - int (*remove) (DB *, const char *, const char *, u_int32_t); /* 32-bit offset=424 size=4, 64=bit offset=752 size=8 */ - int (*rename) (DB *, const char *, const char *, const char *, u_int32_t); /* 32-bit offset=428 size=4, 64=bit offset=760 size=8 */ - void* __toku_dummy9[9]; - void (*set_errfile) (DB *, FILE*); /* 32-bit offset=468 size=4, 64=bit offset=840 size=8 */ - void* __toku_dummy10[2]; - int (*set_flags) (DB *, u_int32_t); /* 32-bit offset=480 size=4, 64=bit offset=864 size=8 */ - void* __toku_dummy11[6]; - int (*set_pagesize) (DB *, u_int32_t); /* 32-bit offset=508 size=4, 64=bit offset=920 size=8 */ - void* __toku_dummy12[6]; - int (*stat) (DB *, void *, u_int32_t); /* 32-bit offset=536 size=4, 64=bit offset=976 size=8 */ - void* __toku_dummy13[2]; - int (*truncate) (DB *, DB_TXN *, u_int32_t *, u_int32_t); /* 32-bit offset=548 size=4, 64=bit offset=1000 size=8 */ - void* __toku_dummy14[1]; - int (*verify) (DB *, const char *, const char *, FILE *, u_int32_t); /* 32-bit offset=556 size=4, 64=bit offset=1016 size=8 */ - void* __toku_dummy15[5]; /* Padding at the end */ - char __toku_dummy16[16]; /* Padding at the end */ -}; -struct __toku_db_txn_active { - u_int32_t txnid; /* 32-bit offset=0 size=4, 64=bit offset=0 size=4 */ - void* __toku_dummy0[2]; - char __toku_dummy1[4]; - DB_LSN lsn; /* 32-bit offset=16 size=8, 64=bit offset=24 size=8 */ - char __toku_dummy2[184]; /* Padding at the end */ -}; -typedef struct __toku_txn_progress { - uint64_t entries_total; - uint64_t entries_processed; - uint8_t 
is_commit; - uint8_t stalled_on_checkpoint; -} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; -typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); -struct txn_stat { - u_int64_t rollback_raw_count; -}; -struct __toku_db_txn { - DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - DB_TXN *parent; /* 32-bit offset=4 size=4, 64=bit offset=8 size=8 */ - int (*txn_stat)(DB_TXN *, struct txn_stat **); - struct toku_list open_txns; - int (*commit_with_progress)(DB_TXN*, uint32_t, TXN_PROGRESS_POLL_FUNCTION, void*); - int (*abort_with_progress)(DB_TXN*, TXN_PROGRESS_POLL_FUNCTION, void*); - void* __toku_dummy0[13]; - char __toku_dummy1[8]; - void *api_internal; /* 32-bit offset=84 size=4, 64=bit offset=160 size=8 */ - void* __toku_dummy2[2]; - int (*abort) (DB_TXN *); /* 32-bit offset=96 size=4, 64=bit offset=184 size=8 */ - int (*commit) (DB_TXN*, u_int32_t); /* 32-bit offset=100 size=4, 64=bit offset=192 size=8 */ - void* __toku_dummy3[2]; - u_int32_t (*id) (DB_TXN *); /* 32-bit offset=112 size=4, 64=bit offset=216 size=8 */ - int (*prepare) (DB_TXN*, u_int8_t gid[DB_GID_SIZE]); /* 32-bit offset=116 size=4, 64=bit offset=224 size=8 */ - void* __toku_dummy4[4]; /* Padding at the end */ -}; -struct __toku_db_txn_stat { - void* __toku_dummy0[1]; - char __toku_dummy1[28]; - u_int32_t st_nactive; /* 32-bit offset=32 size=4, 64=bit offset=36 size=4 */ - char __toku_dummy2[8]; - DB_TXN_ACTIVE *st_txnarray; /* 32-bit offset=44 size=4, 64=bit offset=48 size=8 */ - void* __toku_dummy3[1]; /* Padding at the end */ - char __toku_dummy4[8]; /* Padding at the end */ -}; -struct __toku_dbc { - DB *dbp; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - int (*c_getf_first)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_last)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_next)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_prev)(DBC *, u_int32_t, 
YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_current)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_current_binding)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set_range)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set_range_reverse)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_pre_acquire_range_lock)(DBC*, const DBT*, const DBT*); - void* __toku_dummy0[10]; - char __toku_dummy1[104]; - int (*c_close) (DBC *); /* 32-bit offset=188 size=4, 64=bit offset=272 size=8 */ - int (*c_count) (DBC *, db_recno_t *, u_int32_t); /* 32-bit offset=192 size=4, 64=bit offset=280 size=8 */ - int (*c_del) (DBC *, u_int32_t); /* 32-bit offset=196 size=4, 64=bit offset=288 size=8 */ - void* __toku_dummy2[1]; - int (*c_get) (DBC *, DBT *, DBT *, u_int32_t); /* 32-bit offset=204 size=4, 64=bit offset=304 size=8 */ - void* __toku_dummy3[10]; /* Padding at the end */ -}; -#ifdef _TOKUDB_WRAP_H -#define txn_begin txn_begin_tokudb -#endif -int db_env_create(DB_ENV **, u_int32_t) __attribute__((__visibility__("default"))); -int db_create(DB **, DB_ENV *, u_int32_t) __attribute__((__visibility__("default"))); -char *db_strerror(int) __attribute__((__visibility__("default"))); -const char *db_version(int*,int *,int *) __attribute__((__visibility__("default"))); -int log_compare (const DB_LSN*, const DB_LSN *) __attribute__((__visibility__("default"))); -int db_env_set_func_fsync (int (*)(int)) __attribute__((__visibility__("default"))); -int toku_set_trace_file (char *fname) __attribute__((__visibility__("default"))); -int toku_close_trace_file (void) __attribute__((__visibility__("default"))); -int db_env_set_func_free (void (*)(void*)) __attribute__((__visibility__("default"))); -int db_env_set_func_malloc (void *(*)(size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_realloc (void *(*)(void*, size_t)) 
__attribute__((__visibility__("default"))); -int db_env_set_func_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_full_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_write (ssize_t (*)(int, const void *, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_full_write (ssize_t (*)(int, const void *, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visibility__("default"))); -int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default"))); -int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default"))); -int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default"))); -int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default"))); -void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default"))); -void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_recover_callback (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_recover_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_loader_size_factor (uint32_t) __attribute__((__visibility__("default"))); -void db_env_set_mvcc_garbage_collection_verification(u_int32_t) __attribute__((__visibility__("default"))); -void db_env_enable_engine_status(u_int32_t) __attribute__((__visibility__("default"))); -void db_env_set_flusher_thread_callback (void (*)(int, void*), void*) 
__attribute__((__visibility__("default"))); -#if defined(__cplusplus) -} -#endif -#endif diff --git a/buildheader/db.h_4_5 b/buildheader/db.h_4_5 index 614169441ee..97825182943 100644 --- a/buildheader/db.h_4_5 +++ b/buildheader/db.h_4_5 @@ -9,478 +9,3 @@ #if defined(__cplusplus) extern "C" { #endif -#define TOKUDB 1 -#define TOKUDB_NATIVE_H 0 -#define DB_VERSION_MAJOR 4 -#define DB_VERSION_MINOR 5 -#define DB_VERSION_PATCH 20 -#ifndef _TOKUDB_WRAP_H -#define DB_VERSION_STRING "Tokutek: TokuDB 4.5.20" -#else -#define DB_VERSION_STRING_ydb "Tokutek: TokuDB (wrapped bdb)" -#endif -#ifndef TOKU_OFF_T_DEFINED -#define TOKU_OFF_T_DEFINED -typedef int64_t toku_off_t; -#endif -#define DB_GID_SIZE 128 -typedef struct __toku_db_env DB_ENV; -typedef struct __toku_db_key_range DB_KEY_RANGE; -typedef struct __toku_db_lsn DB_LSN; -typedef struct __toku_db DB; -typedef struct __toku_db_txn DB_TXN; -typedef struct __toku_db_txn_active DB_TXN_ACTIVE; -typedef struct __toku_db_txn_stat DB_TXN_STAT; -typedef struct __toku_dbc DBC; -typedef struct __toku_dbt DBT; -typedef struct __toku_db_preplist { DB_TXN *txn; uint8_t gid[DB_GID_SIZE]; } DB_PREPLIST; -typedef u_int32_t db_recno_t; -typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); -#include -#ifndef __BIGGEST_ALIGNMENT__ - #define __BIGGEST_ALIGNMENT__ 16 -#endif -typedef struct __toku_db_btree_stat64 { - u_int64_t bt_nkeys; /* how many unique keys (guaranteed only to be an estimate, even when flattened) */ - u_int64_t bt_ndata; /* how many key-value pairs (an estimate, but exact when flattened) */ - u_int64_t bt_dsize; /* how big are the keys+values (not counting the lengths) (an estimate, unless flattened) */ - u_int64_t bt_fsize; /* how big is the underlying file */ - u_int64_t bt_create_time_sec; /* Creation time, in seconds */ - u_int64_t bt_modify_time_sec; /* Time of last serialization, in seconds */ - u_int64_t bt_verify_time_sec; /* Time of last verification, in seconds */ -} DB_BTREE_STAT64; -typedef 
struct __toku_loader DB_LOADER; -struct __toku_loader_internal; -struct __toku_loader { - struct __toku_loader_internal *i; - int (*set_error_callback)(DB_LOADER *loader, void (*error_cb)(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra), void *error_extra); /* set the error callback */ - int (*set_poll_function)(DB_LOADER *loader, int (*poll_func)(void *extra, float progress), void *poll_extra); /* set the polling function */ - int (*put)(DB_LOADER *loader, DBT *key, DBT* val); /* give a row to the loader */ - int (*close)(DB_LOADER *loader); /* finish loading, free memory */ - int (*abort)(DB_LOADER *loader); /* abort loading, free memory */ -}; -typedef struct __toku_indexer DB_INDEXER; -struct __toku_indexer_internal; -struct __toku_indexer { - struct __toku_indexer_internal *i; - int (*set_error_callback)(DB_INDEXER *indexer, void (*error_cb)(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra), void *error_extra); /* set the error callback */ - int (*set_poll_function)(DB_INDEXER *indexer, int (*poll_func)(void *extra, float progress), void *poll_extra); /* set the polling function */ - int (*build)(DB_INDEXER *indexer); /* build the indexes */ - int (*close)(DB_INDEXER *indexer); /* finish indexing, free memory */ - int (*abort)(DB_INDEXER *indexer); /* abort indexing, free memory */ -}; -typedef enum { - FS_GREEN = 0, // green zone (we have lots of space) - FS_YELLOW = 1, // yellow zone (issue warning but allow operations) - FS_RED = 2, // red zone (prevent insert operations) - FS_BLOCKED = 3 // For reporting engine status, completely blocked -} fs_redzone_state; -typedef enum { - FS_STATE = 0, // interpret as file system state (redzone) enum - UINT64, // interpret as uint64_t - CHARSTR, // interpret as char * - UNIXTIME, // interpret as time_t - TOKUTIME // interpret as tokutime_t -} toku_engine_status_display_type; -typedef struct __toku_engine_status_row { - char * keyname; // info schema key, should not change across revisions 
without good reason - char * legend; // the text that will appear at user interface - toku_engine_status_display_type type; // how to interpret the value - union { - uint64_t num; - char * str; - } value; -} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S; -typedef enum { - DB_BTREE=1, - DB_UNKNOWN=5 -} DBTYPE; -#ifndef _TOKUDB_WRAP_H -#define DB_VERB_DEADLOCK 1 -#define DB_VERB_RECOVERY 2 -#define DB_VERB_REPLICATION 8 -#define DB_VERB_WAITSFOR 16 -#define DB_ARCH_ABS 1 -#define DB_ARCH_LOG 4 -#define DB_CREATE 1 -#define DB_CXX_NO_EXCEPTIONS 1 -#define DB_EXCL 16384 -#define DB_PRIVATE 2097152 -#define DB_RDONLY 32 -#define DB_RECOVER 64 -#define DB_RUNRECOVERY -30975 -#define DB_THREAD 128 -#define DB_TXN_NOSYNC 512 -#define DB_LOCK_DEFAULT 1 -#define DB_LOCK_OLDEST 7 -#define DB_LOCK_RANDOM 8 -#define DB_KEYFIRST 13 -#define DB_KEYLAST 14 -#define DB_NOOVERWRITE 20 -#define DB_NODUPDATA 19 -#define DB_NOOVERWRITE_NO_ERROR 1 -#define DB_OPFLAGS_MASK 255 -#define DB_AUTO_COMMIT 33554432 -#define DB_INIT_LOCK 32768 -#define DB_INIT_LOG 65536 -#define DB_INIT_MPOOL 131072 -#define DB_INIT_TXN 524288 -#define DB_KEYEXIST -30996 -#define DB_LOCK_DEADLOCK -30995 -#define DB_LOCK_NOTGRANTED -30994 -#define DB_NOTFOUND -30989 -#define DB_SECONDARY_BAD -30974 -#define DB_DONOTINDEX -30998 -#define DB_BUFFER_SMALL -30999 -#define DB_BADFORMAT -30500 -#define DB_DELETE_ANY 65536 -#define DB_TRUNCATE_WITHCURSORS 131072 -#define DB_FIRST 7 -#define DB_LAST 15 -#define DB_CURRENT 6 -#define DB_NEXT 16 -#define DB_NEXT_NODUP 18 -#define DB_PREV 23 -#define DB_PREV_NODUP 24 -#define DB_SET 25 -#define DB_SET_RANGE 27 -#define DB_CURRENT_BINDING 253 -#define DB_SET_RANGE_REVERSE 252 -#define DB_RMW 1073741824 -#define DB_IS_RESETTING_OP 0x01000000 -#define DB_PRELOCKED 0x00800000 -#define DB_PRELOCKED_WRITE 0x00400000 -#define DB_IS_HOT_INDEX 0x00100000 -#define DBC_DISABLE_PREFETCHING 0x20000000 -#define DB_DBT_APPMALLOC 1 -#define DB_DBT_DUPOK 128 -#define DB_DBT_MALLOC 4 
-#define DB_DBT_REALLOC 16 -#define DB_DBT_USERMEM 64 -#define DB_LOG_AUTOREMOVE 524288 -#define DB_TXN_WRITE_NOSYNC 2048 -#define DB_TXN_NOWAIT 16384 -#define DB_TXN_SYNC 32768 -#define DB_TXN_SNAPSHOT 268435456 -#define DB_READ_UNCOMMITTED 134217728 -#define DB_READ_COMMITTED 67108864 -#define DB_INHERIT_ISOLATION 1 -#define DB_SERIALIZABLE 2 -#endif -/* TOKUDB specific error codes */ -#define TOKUDB_OUT_OF_LOCKS -100000 -#define TOKUDB_SUCCEEDED_EARLY -100001 -#define TOKUDB_FOUND_BUT_REJECTED -100002 -#define TOKUDB_USER_CALLBACK_ERROR -100003 -#define TOKUDB_DICTIONARY_TOO_OLD -100004 -#define TOKUDB_DICTIONARY_TOO_NEW -100005 -#define TOKUDB_DICTIONARY_NO_HEADER -100006 -#define TOKUDB_CANCELED -100007 -#define TOKUDB_NO_DATA -100008 -#define TOKUDB_ACCEPT -100009 -#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 -#define TOKUDB_UPGRADE_FAILURE -100011 -#define TOKUDB_TRY_AGAIN -100012 -#define TOKUDB_NEEDS_REPAIR -100013 -#define TOKUDB_CURSOR_CONTINUE -100014 -/* LOADER flags */ -#define LOADER_USE_PUTS 1 -typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val); -typedef int (*generate_row_for_del_func)(DB *dest_db, DB *src_db, DBT *dest_key, const DBT *src_key, const DBT *src_val); -/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ -#ifdef _TOKUDB_WRAP_H -#undef txn_begin -#endif -struct __toku_db_env { - struct __toku_db_env_internal *i; -#define db_env_struct_i(x) ((x)->i) - int (*checkpointing_set_period) (DB_ENV*, u_int32_t) /* Change the delay between automatic checkpoints. 0 means disabled. */; - int (*checkpointing_get_period) (DB_ENV*, u_int32_t*) /* Retrieve the delay between automatic checkpoints. 0 means disabled. */; - int (*cleaner_set_period) (DB_ENV*, u_int32_t) /* Change the delay between automatic cleaner attempts. 0 means disabled. 
*/; - int (*cleaner_get_period) (DB_ENV*, u_int32_t*) /* Retrieve the delay between automatic cleaner attempts. 0 means disabled. */; - int (*cleaner_set_iterations) (DB_ENV*, u_int32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. */; - int (*cleaner_get_iterations) (DB_ENV*, u_int32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */; - int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */; - int (*checkpointing_resume) (DB_ENV*) /* Alert tokudb 'postpone' is no longer necessary */; - int (*checkpointing_begin_atomic_operation) (DB_ENV*) /* Begin a set of operations (that must be atomic as far as checkpoints are concerned). i.e. inserting into every index in one table */; - int (*checkpointing_end_atomic_operation) (DB_ENV*) /* End a set of operations (that must be atomic as far as checkpoints are concerned). */; - int (*set_default_bt_compare) (DB_ENV*,int (*bt_compare) (DB *, const DBT *, const DBT *)) /* Set default (key) comparison function for all DBs in this environment. Required for RECOVERY since you cannot open the DBs manually. 
*/; - int (*get_engine_status_num_rows) (DB_ENV*, uint64_t*) /* return number of rows in engine status */; - void *app_private; /* 32-bit offset=52 size=4, 64=bit offset=104 size=8 */ - int (*get_engine_status) (DB_ENV*, TOKU_ENGINE_STATUS_ROW, uint64_t, fs_redzone_state*, uint64_t*, char*, int) /* Fill in status struct and redzone state, possibly env panic string */; - int (*get_engine_status_text) (DB_ENV*, char*, int) /* Fill in status text */; - int (*crash) (DB_ENV*, const char*/*expr_as_string*/,const char */*fun*/,const char*/*file*/,int/*line*/, int/*errno*/);; - int (*get_iname) (DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) /* FOR TEST ONLY: lookup existing iname */; - int (*create_loader) (DB_ENV *env, DB_TXN *txn, DB_LOADER **blp, DB *src_db, int N, DB *dbs[/*N*/], uint32_t db_flags[/*N*/], uint32_t dbt_flags[/*N*/], uint32_t loader_flags); - int (*create_indexer) (DB_ENV *env, DB_TXN *txn, DB_INDEXER **idxrp, DB *src_db, int N, DB *dbs[/*N*/], uint32_t db_flags[/*N*/], uint32_t indexer_flags); - int (*put_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array) /* insert into multiple DBs */; - int (*set_generate_row_callback_for_put) (DB_ENV *env, generate_row_for_put_func generate_row_for_put); - int (*del_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array) /* delete from multiple DBs */; - int (*set_generate_row_callback_for_del) (DB_ENV *env, generate_row_for_del_func generate_row_for_del); - int (*update_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - DBT *old_src_key, DBT *old_src_data, - DBT *new_src_key, DBT *new_src_data, - uint32_t num_dbs, DB **db_array, uint32_t *flags_array, - uint32_t num_keys, DBT *keys, - uint32_t num_vals, DBT *vals) /* update multiple DBs */; - int (*get_redzone) (DB_ENV *env, int *redzone) /* get 
the redzone limit */; - int (*set_redzone) (DB_ENV *env, int redzone) /* set the redzone limit in percent of total space */; - int (*set_lk_max_memory) (DB_ENV *env, uint64_t max); - int (*get_lk_max_memory) (DB_ENV *env, uint64_t *max); - void (*set_update) (DB_ENV *env, int (*update_function)(DB *, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra)); - int (*set_lock_timeout) (DB_ENV *env, uint64_t lock_wait_time_msec); - int (*get_lock_timeout) (DB_ENV *env, uint64_t *lock_wait_time_msec); - void* __toku_dummy0[20]; - char __toku_dummy1[128]; - void *api1_internal; /* 32-bit offset=336 size=4, 64=bit offset=544 size=8 */ - void* __toku_dummy2[8]; - int (*close) (DB_ENV *, u_int32_t); /* 32-bit offset=372 size=4, 64=bit offset=616 size=8 */ - int (*dbremove) (DB_ENV *, DB_TXN *, const char *, const char *, u_int32_t); /* 32-bit offset=376 size=4, 64=bit offset=624 size=8 */ - int (*dbrename) (DB_ENV *, DB_TXN *, const char *, const char *, const char *, u_int32_t); /* 32-bit offset=380 size=4, 64=bit offset=632 size=8 */ - void (*err) (const DB_ENV *, int, const char *, ...); /* 32-bit offset=384 size=4, 64=bit offset=640 size=8 */ - void* __toku_dummy3[3]; - int (*get_cachesize) (DB_ENV *, u_int32_t *, u_int32_t *, int *); /* 32-bit offset=400 size=4, 64=bit offset=672 size=8 */ - void* __toku_dummy4[4]; - int (*get_flags) (DB_ENV *, u_int32_t *); /* 32-bit offset=420 size=4, 64=bit offset=712 size=8 */ - void* __toku_dummy5[4]; - int (*get_lg_max) (DB_ENV *, u_int32_t*); /* 32-bit offset=440 size=4, 64=bit offset=752 size=8 */ - void* __toku_dummy6[4]; - int (*get_lk_max_locks) (DB_ENV *, u_int32_t *); /* 32-bit offset=460 size=4, 64=bit offset=792 size=8 */ - void* __toku_dummy7[21]; - int (*log_archive) (DB_ENV *, char **[], u_int32_t); /* 32-bit offset=548 size=4, 64=bit offset=968 size=8 */ - void* __toku_dummy8[2]; - int (*log_flush) (DB_ENV *, const DB_LSN *); /* 32-bit offset=560 
size=4, 64=bit offset=992 size=8 */ - void* __toku_dummy9[25]; - int (*open) (DB_ENV *, const char *, u_int32_t, int); /* 32-bit offset=664 size=4, 64=bit offset=1200 size=8 */ - void* __toku_dummy10[27]; - int (*set_cachesize) (DB_ENV *, u_int32_t, u_int32_t, int); /* 32-bit offset=776 size=4, 64=bit offset=1424 size=8 */ - int (*set_data_dir) (DB_ENV *, const char *); /* 32-bit offset=780 size=4, 64=bit offset=1432 size=8 */ - void* __toku_dummy11[1]; - void (*set_errcall) (DB_ENV *, void (*)(const DB_ENV *, const char *, const char *)); /* 32-bit offset=788 size=4, 64=bit offset=1448 size=8 */ - void (*set_errfile) (DB_ENV *, FILE*); /* 32-bit offset=792 size=4, 64=bit offset=1456 size=8 */ - void (*set_errpfx) (DB_ENV *, const char *); /* 32-bit offset=796 size=4, 64=bit offset=1464 size=8 */ - void* __toku_dummy12[2]; - int (*set_flags) (DB_ENV *, u_int32_t, int); /* 32-bit offset=808 size=4, 64=bit offset=1488 size=8 */ - void* __toku_dummy13[2]; - int (*set_lg_bsize) (DB_ENV *, u_int32_t); /* 32-bit offset=820 size=4, 64=bit offset=1512 size=8 */ - int (*set_lg_dir) (DB_ENV *, const char *); /* 32-bit offset=824 size=4, 64=bit offset=1520 size=8 */ - void* __toku_dummy14[1]; - int (*set_lg_max) (DB_ENV *, u_int32_t); /* 32-bit offset=832 size=4, 64=bit offset=1536 size=8 */ - void* __toku_dummy15[2]; - int (*set_lk_detect) (DB_ENV *, u_int32_t); /* 32-bit offset=844 size=4, 64=bit offset=1560 size=8 */ - void* __toku_dummy16[1]; - int (*set_lk_max_locks) (DB_ENV *, u_int32_t); /* 32-bit offset=852 size=4, 64=bit offset=1576 size=8 */ - void* __toku_dummy17[14]; - int (*set_tmp_dir) (DB_ENV *, const char *); /* 32-bit offset=912 size=4, 64=bit offset=1696 size=8 */ - void* __toku_dummy18[2]; - int (*set_verbose) (DB_ENV *, u_int32_t, int); /* 32-bit offset=924 size=4, 64=bit offset=1720 size=8 */ - void* __toku_dummy19[1]; - int (*txn_begin) (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t); /* 32-bit offset=932 size=4, 64=bit offset=1736 size=8 */ - int 
(*txn_checkpoint) (DB_ENV *, u_int32_t, u_int32_t, u_int32_t); /* 32-bit offset=936 size=4, 64=bit offset=1744 size=8 */ - int (*txn_recover) (DB_ENV *, DB_PREPLIST preplist[/*count*/], long count, /*out*/ long *retp, u_int32_t flags); /* 32-bit offset=940 size=4, 64=bit offset=1752 size=8 */ - int (*txn_stat) (DB_ENV *, DB_TXN_STAT **, u_int32_t); /* 32-bit offset=944 size=4, 64=bit offset=1760 size=8 */ - void* __toku_dummy20[2]; /* Padding at the end */ - char __toku_dummy21[16]; /* Padding at the end */ -}; -struct __toku_db_key_range { - double less; /* 32-bit offset=0 size=8, 64=bit offset=0 size=8 */ - double equal; /* 32-bit offset=8 size=8, 64=bit offset=8 size=8 */ - double greater; /* 32-bit offset=16 size=8, 64=bit offset=16 size=8 */ - void* __toku_dummy0[207]; /* Padding at the end */ - char __toku_dummy1[120]; /* Padding at the end */ -}; -struct __toku_db_lsn { - char __toku_dummy0[8]; /* Padding at the end */ -}; -struct __toku_dbt { - void*data; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - u_int32_t size; /* 32-bit offset=4 size=4, 64=bit offset=8 size=4 */ - u_int32_t ulen; /* 32-bit offset=8 size=4, 64=bit offset=12 size=4 */ - void* __toku_dummy0[1]; - char __toku_dummy1[8]; - u_int32_t flags; /* 32-bit offset=24 size=4, 64=bit offset=32 size=4 */ - /* 4 more bytes of alignment in the 64-bit case. 
*/ -}; -typedef struct __toku_descriptor { - DBT dbt; -} *DESCRIPTOR, DESCRIPTOR_S; -//One header is included in 'data' -//One header is included in 'additional for checkpoint' -typedef struct __toku_db_fragmentation { - uint64_t file_size_bytes; //Total file size in bytes - uint64_t data_bytes; //Compressed User Data in bytes - uint64_t data_blocks; //Number of blocks of compressed User Data - uint64_t checkpoint_bytes_additional; //Additional bytes used for checkpoint system - uint64_t checkpoint_blocks_additional; //Additional blocks used for checkpoint system - uint64_t unused_bytes; //Unused space in file - uint64_t unused_blocks; //Number of contiguous regions of unused space - uint64_t largest_unused_block; //Size of largest contiguous unused space -} *TOKU_DB_FRAGMENTATION, TOKU_DB_FRAGMENTATION_S; -struct __toku_db { - struct __toku_db_internal *i; -#define db_struct_i(x) ((x)->i) - int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); - int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *); - int (*pre_acquire_table_lock)(DB*, DB_TXN*); - void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */ - DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */ - int (*pre_acquire_fileops_lock)(DB*, DB_TXN*); - const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; - const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; - void (*get_max_row_size) (DB*, u_int32_t *max_key_size, u_int32_t *max_row_size); - DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. 
Available only while db is open */; - int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; - int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; - int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); - int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); - int (*get_readpagesize)(DB*,u_int32_t*); - int (*set_readpagesize)(DB*,u_int32_t); - int (*set_indexer)(DB*, DB_INDEXER*); - void (*get_indexer)(DB*, DB_INDEXER**); - int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); - int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); - int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy0[19]; - char __toku_dummy1[96]; - void *api_internal; /* 32-bit offset=268 size=4, 64=bit offset=440 size=8 */ - void* __toku_dummy2[5]; - int (*close) (DB*, u_int32_t); /* 32-bit offset=292 size=4, 64=bit offset=488 size=8 */ - void* __toku_dummy3[1]; - int (*cursor) (DB *, DB_TXN *, DBC **, u_int32_t); /* 32-bit offset=300 size=4, 64=bit offset=504 size=8 */ - int (*del) (DB *, DB_TXN *, DBT *, u_int32_t); /* 32-bit offset=304 size=4, 64=bit offset=512 size=8 */ - void* __toku_dummy4[2]; - int (*fd) (DB *, int *); /* 32-bit offset=316 size=4, 64=bit offset=536 size=8 */ - int (*get) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); /* 32-bit offset=320 size=4, 64=bit offset=544 size=8 */ - void* __toku_dummy5[8]; - int (*get_flags) (DB *, u_int32_t *); /* 32-bit offset=356 size=4, 64=bit offset=616 size=8 */ - void* __toku_dummy6[6]; - int (*get_pagesize) (DB *, u_int32_t *); /* 32-bit offset=384 size=4, 
64=bit offset=672 size=8 */ - void* __toku_dummy7[8]; - int (*key_range) (DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t); /* 32-bit offset=420 size=4, 64=bit offset=744 size=8 */ - int (*open) (DB *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int); /* 32-bit offset=424 size=4, 64=bit offset=752 size=8 */ - void* __toku_dummy8[1]; - int (*put) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); /* 32-bit offset=432 size=4, 64=bit offset=768 size=8 */ - int (*remove) (DB *, const char *, const char *, u_int32_t); /* 32-bit offset=436 size=4, 64=bit offset=776 size=8 */ - int (*rename) (DB *, const char *, const char *, const char *, u_int32_t); /* 32-bit offset=440 size=4, 64=bit offset=784 size=8 */ - void* __toku_dummy9[9]; - void (*set_errfile) (DB *, FILE*); /* 32-bit offset=480 size=4, 64=bit offset=864 size=8 */ - void* __toku_dummy10[2]; - int (*set_flags) (DB *, u_int32_t); /* 32-bit offset=492 size=4, 64=bit offset=888 size=8 */ - void* __toku_dummy11[6]; - int (*set_pagesize) (DB *, u_int32_t); /* 32-bit offset=520 size=4, 64=bit offset=944 size=8 */ - void* __toku_dummy12[6]; - int (*stat) (DB *, void *, u_int32_t); /* 32-bit offset=548 size=4, 64=bit offset=1000 size=8 */ - void* __toku_dummy13[2]; - int (*truncate) (DB *, DB_TXN *, u_int32_t *, u_int32_t); /* 32-bit offset=560 size=4, 64=bit offset=1024 size=8 */ - void* __toku_dummy14[1]; - int (*verify) (DB *, const char *, const char *, FILE *, u_int32_t); /* 32-bit offset=568 size=4, 64=bit offset=1040 size=8 */ - void* __toku_dummy15[5]; /* Padding at the end */ - char __toku_dummy16[16]; /* Padding at the end */ -}; -struct __toku_db_txn_active { - u_int32_t txnid; /* 32-bit offset=0 size=4, 64=bit offset=0 size=4 */ - void* __toku_dummy0[2]; - char __toku_dummy1[4]; - DB_LSN lsn; /* 32-bit offset=16 size=8, 64=bit offset=24 size=8 */ - char __toku_dummy2[200]; /* Padding at the end */ -}; -typedef struct __toku_txn_progress { - uint64_t entries_total; - uint64_t entries_processed; - 
uint8_t is_commit; - uint8_t stalled_on_checkpoint; -} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; -typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); -struct txn_stat { - u_int64_t rollback_raw_count; -}; -struct __toku_db_txn { - DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - DB_TXN *parent; /* 32-bit offset=4 size=4, 64=bit offset=8 size=8 */ - int (*txn_stat)(DB_TXN *, struct txn_stat **); - struct toku_list open_txns; - int (*commit_with_progress)(DB_TXN*, uint32_t, TXN_PROGRESS_POLL_FUNCTION, void*); - int (*abort_with_progress)(DB_TXN*, TXN_PROGRESS_POLL_FUNCTION, void*); - void* __toku_dummy0[13]; - char __toku_dummy1[8]; - void *api_internal; /* 32-bit offset=84 size=4, 64=bit offset=160 size=8 */ - void* __toku_dummy2[2]; - int (*abort) (DB_TXN *); /* 32-bit offset=96 size=4, 64=bit offset=184 size=8 */ - int (*commit) (DB_TXN*, u_int32_t); /* 32-bit offset=100 size=4, 64=bit offset=192 size=8 */ - void* __toku_dummy3[2]; - u_int32_t (*id) (DB_TXN *); /* 32-bit offset=112 size=4, 64=bit offset=216 size=8 */ - int (*prepare) (DB_TXN*, u_int8_t gid[DB_GID_SIZE]); /* 32-bit offset=116 size=4, 64=bit offset=224 size=8 */ - void* __toku_dummy4[4]; /* Padding at the end */ -}; -struct __toku_db_txn_stat { - void* __toku_dummy0[1]; - char __toku_dummy1[28]; - u_int32_t st_nactive; /* 32-bit offset=32 size=4, 64=bit offset=36 size=4 */ - char __toku_dummy2[16]; - DB_TXN_ACTIVE *st_txnarray; /* 32-bit offset=52 size=4, 64=bit offset=56 size=8 */ - void* __toku_dummy3[1]; /* Padding at the end */ - char __toku_dummy4[8]; /* Padding at the end */ -}; -struct __toku_dbc { - DB *dbp; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - int (*c_getf_first)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_last)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_next)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_prev)(DBC *, u_int32_t, 
YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_current)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_current_binding)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set_range)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set_range_reverse)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_pre_acquire_range_lock)(DBC*, const DBT*, const DBT*); - void* __toku_dummy0[14]; - char __toku_dummy1[104]; - int (*c_close) (DBC *); /* 32-bit offset=204 size=4, 64=bit offset=304 size=8 */ - int (*c_count) (DBC *, db_recno_t *, u_int32_t); /* 32-bit offset=208 size=4, 64=bit offset=312 size=8 */ - int (*c_del) (DBC *, u_int32_t); /* 32-bit offset=212 size=4, 64=bit offset=320 size=8 */ - void* __toku_dummy2[1]; - int (*c_get) (DBC *, DBT *, DBT *, u_int32_t); /* 32-bit offset=220 size=4, 64=bit offset=336 size=8 */ - void* __toku_dummy3[10]; /* Padding at the end */ -}; -#ifdef _TOKUDB_WRAP_H -#define txn_begin txn_begin_tokudb -#endif -int db_env_create(DB_ENV **, u_int32_t) __attribute__((__visibility__("default"))); -int db_create(DB **, DB_ENV *, u_int32_t) __attribute__((__visibility__("default"))); -char *db_strerror(int) __attribute__((__visibility__("default"))); -const char *db_version(int*,int *,int *) __attribute__((__visibility__("default"))); -int log_compare (const DB_LSN*, const DB_LSN *) __attribute__((__visibility__("default"))); -int db_env_set_func_fsync (int (*)(int)) __attribute__((__visibility__("default"))); -int toku_set_trace_file (char *fname) __attribute__((__visibility__("default"))); -int toku_close_trace_file (void) __attribute__((__visibility__("default"))); -int db_env_set_func_free (void (*)(void*)) __attribute__((__visibility__("default"))); -int db_env_set_func_malloc (void *(*)(size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_realloc (void *(*)(void*, size_t)) 
__attribute__((__visibility__("default"))); -int db_env_set_func_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_full_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_write (ssize_t (*)(int, const void *, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_full_write (ssize_t (*)(int, const void *, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visibility__("default"))); -int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default"))); -int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default"))); -int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default"))); -int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default"))); -void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default"))); -void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_recover_callback (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_recover_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_loader_size_factor (uint32_t) __attribute__((__visibility__("default"))); -void db_env_set_mvcc_garbage_collection_verification(u_int32_t) __attribute__((__visibility__("default"))); -void db_env_enable_engine_status(u_int32_t) __attribute__((__visibility__("default"))); -void db_env_set_flusher_thread_callback (void (*)(int, void*), void*) 
__attribute__((__visibility__("default"))); -#if defined(__cplusplus) -} -#endif -#endif diff --git a/buildheader/db.h_4_6 b/buildheader/db.h_4_6 index 70be34efbb9..97825182943 100644 --- a/buildheader/db.h_4_6 +++ b/buildheader/db.h_4_6 @@ -9,482 +9,3 @@ #if defined(__cplusplus) extern "C" { #endif -#define TOKUDB 1 -#define TOKUDB_NATIVE_H 0 -#define DB_VERSION_MAJOR 4 -#define DB_VERSION_MINOR 6 -#define DB_VERSION_PATCH 19 -#ifndef _TOKUDB_WRAP_H -#define DB_VERSION_STRING "Tokutek: TokuDB 4.6.19" -#else -#define DB_VERSION_STRING_ydb "Tokutek: TokuDB (wrapped bdb)" -#endif -#ifndef TOKU_OFF_T_DEFINED -#define TOKU_OFF_T_DEFINED -typedef int64_t toku_off_t; -#endif -#define DB_GID_SIZE 128 -typedef struct __toku_db_env DB_ENV; -typedef struct __toku_db_key_range DB_KEY_RANGE; -typedef struct __toku_db_lsn DB_LSN; -typedef struct __toku_db DB; -typedef struct __toku_db_txn DB_TXN; -typedef struct __toku_db_txn_active DB_TXN_ACTIVE; -typedef struct __toku_db_txn_stat DB_TXN_STAT; -typedef struct __toku_dbc DBC; -typedef struct __toku_dbt DBT; -typedef struct __toku_db_preplist { DB_TXN *txn; uint8_t gid[DB_GID_SIZE]; } DB_PREPLIST; -typedef u_int32_t db_recno_t; -typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); -#include -#ifndef __BIGGEST_ALIGNMENT__ - #define __BIGGEST_ALIGNMENT__ 16 -#endif -typedef struct __toku_db_btree_stat64 { - u_int64_t bt_nkeys; /* how many unique keys (guaranteed only to be an estimate, even when flattened) */ - u_int64_t bt_ndata; /* how many key-value pairs (an estimate, but exact when flattened) */ - u_int64_t bt_dsize; /* how big are the keys+values (not counting the lengths) (an estimate, unless flattened) */ - u_int64_t bt_fsize; /* how big is the underlying file */ - u_int64_t bt_create_time_sec; /* Creation time, in seconds */ - u_int64_t bt_modify_time_sec; /* Time of last serialization, in seconds */ - u_int64_t bt_verify_time_sec; /* Time of last verification, in seconds */ -} DB_BTREE_STAT64; -typedef 
struct __toku_loader DB_LOADER; -struct __toku_loader_internal; -struct __toku_loader { - struct __toku_loader_internal *i; - int (*set_error_callback)(DB_LOADER *loader, void (*error_cb)(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra), void *error_extra); /* set the error callback */ - int (*set_poll_function)(DB_LOADER *loader, int (*poll_func)(void *extra, float progress), void *poll_extra); /* set the polling function */ - int (*put)(DB_LOADER *loader, DBT *key, DBT* val); /* give a row to the loader */ - int (*close)(DB_LOADER *loader); /* finish loading, free memory */ - int (*abort)(DB_LOADER *loader); /* abort loading, free memory */ -}; -typedef struct __toku_indexer DB_INDEXER; -struct __toku_indexer_internal; -struct __toku_indexer { - struct __toku_indexer_internal *i; - int (*set_error_callback)(DB_INDEXER *indexer, void (*error_cb)(DB *db, int i, int err, DBT *key, DBT *val, void *error_extra), void *error_extra); /* set the error callback */ - int (*set_poll_function)(DB_INDEXER *indexer, int (*poll_func)(void *extra, float progress), void *poll_extra); /* set the polling function */ - int (*build)(DB_INDEXER *indexer); /* build the indexes */ - int (*close)(DB_INDEXER *indexer); /* finish indexing, free memory */ - int (*abort)(DB_INDEXER *indexer); /* abort indexing, free memory */ -}; -typedef enum { - FS_GREEN = 0, // green zone (we have lots of space) - FS_YELLOW = 1, // yellow zone (issue warning but allow operations) - FS_RED = 2, // red zone (prevent insert operations) - FS_BLOCKED = 3 // For reporting engine status, completely blocked -} fs_redzone_state; -typedef enum { - FS_STATE = 0, // interpret as file system state (redzone) enum - UINT64, // interpret as uint64_t - CHARSTR, // interpret as char * - UNIXTIME, // interpret as time_t - TOKUTIME // interpret as tokutime_t -} toku_engine_status_display_type; -typedef struct __toku_engine_status_row { - char * keyname; // info schema key, should not change across revisions 
without good reason - char * legend; // the text that will appear at user interface - toku_engine_status_display_type type; // how to interpret the value - union { - uint64_t num; - char * str; - } value; -} * TOKU_ENGINE_STATUS_ROW, TOKU_ENGINE_STATUS_ROW_S; -typedef enum { - DB_BTREE=1, - DB_UNKNOWN=5 -} DBTYPE; -#ifndef _TOKUDB_WRAP_H -#define DB_VERB_DEADLOCK 1 -#define DB_VERB_RECOVERY 8 -#define DB_VERB_REPLICATION 32 -#define DB_VERB_WAITSFOR 64 -#define DB_ARCH_ABS 1 -#define DB_ARCH_LOG 4 -#define DB_CREATE 1 -#define DB_CXX_NO_EXCEPTIONS 1 -#define DB_EXCL 16384 -#define DB_PRIVATE 8388608 -#define DB_RDONLY 32 -#define DB_RECOVER 64 -#define DB_RUNRECOVERY -30975 -#define DB_THREAD 128 -#define DB_TXN_NOSYNC 512 -#define DB_LOCK_DEFAULT 1 -#define DB_LOCK_OLDEST 7 -#define DB_LOCK_RANDOM 8 -#define DB_KEYFIRST 13 -#define DB_KEYLAST 14 -#define DB_NOOVERWRITE 20 -#define DB_NODUPDATA 19 -#define DB_NOOVERWRITE_NO_ERROR 1 -#define DB_OPFLAGS_MASK 255 -#define DB_AUTO_COMMIT 33554432 -#define DB_INIT_LOCK 131072 -#define DB_INIT_LOG 262144 -#define DB_INIT_MPOOL 524288 -#define DB_INIT_TXN 2097152 -#define DB_KEYEXIST -30996 -#define DB_LOCK_DEADLOCK -30995 -#define DB_LOCK_NOTGRANTED -30994 -#define DB_NOTFOUND -30989 -#define DB_SECONDARY_BAD -30974 -#define DB_DONOTINDEX -30998 -#define DB_BUFFER_SMALL -30999 -#define DB_BADFORMAT -30500 -#define DB_DELETE_ANY 65536 -#define DB_TRUNCATE_WITHCURSORS 131072 -#define DB_FIRST 7 -#define DB_LAST 15 -#define DB_CURRENT 6 -#define DB_NEXT 16 -#define DB_NEXT_NODUP 18 -#define DB_PREV 23 -#define DB_PREV_NODUP 25 -#define DB_SET 26 -#define DB_SET_RANGE 27 -#define DB_CURRENT_BINDING 253 -#define DB_SET_RANGE_REVERSE 252 -#define DB_RMW 1073741824 -#define DB_IS_RESETTING_OP 0x01000000 -#define DB_PRELOCKED 0x00800000 -#define DB_PRELOCKED_WRITE 0x00400000 -#define DB_IS_HOT_INDEX 0x00100000 -#define DBC_DISABLE_PREFETCHING 0x20000000 -#define DB_DBT_APPMALLOC 1 -#define DB_DBT_DUPOK 2 -#define DB_DBT_MALLOC 8 
-#define DB_DBT_MULTIPLE 16 -#define DB_DBT_REALLOC 64 -#define DB_DBT_USERMEM 256 -#define DB_LOG_AUTOREMOVE 524288 -#define DB_TXN_WRITE_NOSYNC 4096 -#define DB_TXN_NOWAIT 1024 -#define DB_TXN_SYNC 16384 -#define DB_TXN_SNAPSHOT 268435456 -#define DB_READ_UNCOMMITTED 134217728 -#define DB_READ_COMMITTED 67108864 -#define DB_INHERIT_ISOLATION 1 -#define DB_SERIALIZABLE 2 -#endif -/* TOKUDB specific error codes */ -#define TOKUDB_OUT_OF_LOCKS -100000 -#define TOKUDB_SUCCEEDED_EARLY -100001 -#define TOKUDB_FOUND_BUT_REJECTED -100002 -#define TOKUDB_USER_CALLBACK_ERROR -100003 -#define TOKUDB_DICTIONARY_TOO_OLD -100004 -#define TOKUDB_DICTIONARY_TOO_NEW -100005 -#define TOKUDB_DICTIONARY_NO_HEADER -100006 -#define TOKUDB_CANCELED -100007 -#define TOKUDB_NO_DATA -100008 -#define TOKUDB_ACCEPT -100009 -#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010 -#define TOKUDB_UPGRADE_FAILURE -100011 -#define TOKUDB_TRY_AGAIN -100012 -#define TOKUDB_NEEDS_REPAIR -100013 -#define TOKUDB_CURSOR_CONTINUE -100014 -/* LOADER flags */ -#define LOADER_USE_PUTS 1 -typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val); -typedef int (*generate_row_for_del_func)(DB *dest_db, DB *src_db, DBT *dest_key, const DBT *src_key, const DBT *src_val); -/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/ -#ifdef _TOKUDB_WRAP_H -#undef txn_begin -#endif -struct __toku_db_env { - struct __toku_db_env_internal *i; -#define db_env_struct_i(x) ((x)->i) - int (*checkpointing_set_period) (DB_ENV*, u_int32_t) /* Change the delay between automatic checkpoints. 0 means disabled. */; - int (*checkpointing_get_period) (DB_ENV*, u_int32_t*) /* Retrieve the delay between automatic checkpoints. 0 means disabled. */; - int (*cleaner_set_period) (DB_ENV*, u_int32_t) /* Change the delay between automatic cleaner attempts. 0 means disabled. 
*/; - int (*cleaner_get_period) (DB_ENV*, u_int32_t*) /* Retrieve the delay between automatic cleaner attempts. 0 means disabled. */; - int (*cleaner_set_iterations) (DB_ENV*, u_int32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. */; - int (*cleaner_get_iterations) (DB_ENV*, u_int32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */; - int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */; - int (*checkpointing_resume) (DB_ENV*) /* Alert tokudb 'postpone' is no longer necessary */; - int (*checkpointing_begin_atomic_operation) (DB_ENV*) /* Begin a set of operations (that must be atomic as far as checkpoints are concerned). i.e. inserting into every index in one table */; - int (*checkpointing_end_atomic_operation) (DB_ENV*) /* End a set of operations (that must be atomic as far as checkpoints are concerned). */; - int (*set_default_bt_compare) (DB_ENV*,int (*bt_compare) (DB *, const DBT *, const DBT *)) /* Set default (key) comparison function for all DBs in this environment. Required for RECOVERY since you cannot open the DBs manually. 
*/; - int (*get_engine_status_num_rows) (DB_ENV*, uint64_t*) /* return number of rows in engine status */; - void *app_private; /* 32-bit offset=52 size=4, 64=bit offset=104 size=8 */ - int (*get_engine_status) (DB_ENV*, TOKU_ENGINE_STATUS_ROW, uint64_t, fs_redzone_state*, uint64_t*, char*, int) /* Fill in status struct and redzone state, possibly env panic string */; - int (*get_engine_status_text) (DB_ENV*, char*, int) /* Fill in status text */; - int (*crash) (DB_ENV*, const char*/*expr_as_string*/,const char */*fun*/,const char*/*file*/,int/*line*/, int/*errno*/);; - int (*get_iname) (DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) /* FOR TEST ONLY: lookup existing iname */; - int (*create_loader) (DB_ENV *env, DB_TXN *txn, DB_LOADER **blp, DB *src_db, int N, DB *dbs[/*N*/], uint32_t db_flags[/*N*/], uint32_t dbt_flags[/*N*/], uint32_t loader_flags); - int (*create_indexer) (DB_ENV *env, DB_TXN *txn, DB_INDEXER **idxrp, DB *src_db, int N, DB *dbs[/*N*/], uint32_t db_flags[/*N*/], uint32_t indexer_flags); - int (*put_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array) /* insert into multiple DBs */; - int (*set_generate_row_callback_for_put) (DB_ENV *env, generate_row_for_put_func generate_row_for_put); - int (*del_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array) /* delete from multiple DBs */; - int (*set_generate_row_callback_for_del) (DB_ENV *env, generate_row_for_del_func generate_row_for_del); - int (*update_multiple) (DB_ENV *env, DB *src_db, DB_TXN *txn, - DBT *old_src_key, DBT *old_src_data, - DBT *new_src_key, DBT *new_src_data, - uint32_t num_dbs, DB **db_array, uint32_t *flags_array, - uint32_t num_keys, DBT *keys, - uint32_t num_vals, DBT *vals) /* update multiple DBs */; - int (*get_redzone) (DB_ENV *env, int *redzone) /* get 
the redzone limit */; - int (*set_redzone) (DB_ENV *env, int redzone) /* set the redzone limit in percent of total space */; - int (*set_lk_max_memory) (DB_ENV *env, uint64_t max); - int (*get_lk_max_memory) (DB_ENV *env, uint64_t *max); - void (*set_update) (DB_ENV *env, int (*update_function)(DB *, const DBT *key, const DBT *old_val, const DBT *extra, void (*set_val)(const DBT *new_val, void *set_extra), void *set_extra)); - int (*set_lock_timeout) (DB_ENV *env, uint64_t lock_wait_time_msec); - int (*get_lock_timeout) (DB_ENV *env, uint64_t *lock_wait_time_msec); - void* __toku_dummy0[21]; - char __toku_dummy1[144]; - void *api1_internal; /* 32-bit offset=356 size=4, 64=bit offset=568 size=8 */ - void* __toku_dummy2[8]; - int (*close) (DB_ENV *, u_int32_t); /* 32-bit offset=392 size=4, 64=bit offset=640 size=8 */ - int (*dbremove) (DB_ENV *, DB_TXN *, const char *, const char *, u_int32_t); /* 32-bit offset=396 size=4, 64=bit offset=648 size=8 */ - int (*dbrename) (DB_ENV *, DB_TXN *, const char *, const char *, const char *, u_int32_t); /* 32-bit offset=400 size=4, 64=bit offset=656 size=8 */ - void (*err) (const DB_ENV *, int, const char *, ...); /* 32-bit offset=404 size=4, 64=bit offset=664 size=8 */ - void* __toku_dummy3[3]; - int (*get_cachesize) (DB_ENV *, u_int32_t *, u_int32_t *, int *); /* 32-bit offset=420 size=4, 64=bit offset=696 size=8 */ - void* __toku_dummy4[5]; - int (*get_flags) (DB_ENV *, u_int32_t *); /* 32-bit offset=444 size=4, 64=bit offset=744 size=8 */ - void* __toku_dummy5[4]; - int (*get_lg_max) (DB_ENV *, u_int32_t*); /* 32-bit offset=464 size=4, 64=bit offset=784 size=8 */ - void* __toku_dummy6[4]; - int (*get_lk_max_locks) (DB_ENV *, u_int32_t *); /* 32-bit offset=484 size=4, 64=bit offset=824 size=8 */ - void* __toku_dummy7[22]; - int (*log_archive) (DB_ENV *, char **[], u_int32_t); /* 32-bit offset=576 size=4, 64=bit offset=1008 size=8 */ - void* __toku_dummy8[2]; - int (*log_flush) (DB_ENV *, const DB_LSN *); /* 32-bit offset=588 
size=4, 64=bit offset=1032 size=8 */ - void* __toku_dummy9[25]; - int (*open) (DB_ENV *, const char *, u_int32_t, int); /* 32-bit offset=692 size=4, 64=bit offset=1240 size=8 */ - void* __toku_dummy10[30]; - int (*set_cachesize) (DB_ENV *, u_int32_t, u_int32_t, int); /* 32-bit offset=816 size=4, 64=bit offset=1488 size=8 */ - void* __toku_dummy11[1]; - int (*set_data_dir) (DB_ENV *, const char *); /* 32-bit offset=824 size=4, 64=bit offset=1504 size=8 */ - void* __toku_dummy12[1]; - void (*set_errcall) (DB_ENV *, void (*)(const DB_ENV *, const char *, const char *)); /* 32-bit offset=832 size=4, 64=bit offset=1520 size=8 */ - void (*set_errfile) (DB_ENV *, FILE*); /* 32-bit offset=836 size=4, 64=bit offset=1528 size=8 */ - void (*set_errpfx) (DB_ENV *, const char *); /* 32-bit offset=840 size=4, 64=bit offset=1536 size=8 */ - void* __toku_dummy13[2]; - int (*set_flags) (DB_ENV *, u_int32_t, int); /* 32-bit offset=852 size=4, 64=bit offset=1560 size=8 */ - void* __toku_dummy14[2]; - int (*set_lg_bsize) (DB_ENV *, u_int32_t); /* 32-bit offset=864 size=4, 64=bit offset=1584 size=8 */ - int (*set_lg_dir) (DB_ENV *, const char *); /* 32-bit offset=868 size=4, 64=bit offset=1592 size=8 */ - void* __toku_dummy15[1]; - int (*set_lg_max) (DB_ENV *, u_int32_t); /* 32-bit offset=876 size=4, 64=bit offset=1608 size=8 */ - void* __toku_dummy16[2]; - int (*set_lk_detect) (DB_ENV *, u_int32_t); /* 32-bit offset=888 size=4, 64=bit offset=1632 size=8 */ - void* __toku_dummy17[1]; - int (*set_lk_max_locks) (DB_ENV *, u_int32_t); /* 32-bit offset=896 size=4, 64=bit offset=1648 size=8 */ - void* __toku_dummy18[14]; - int (*set_tmp_dir) (DB_ENV *, const char *); /* 32-bit offset=956 size=4, 64=bit offset=1768 size=8 */ - void* __toku_dummy19[2]; - int (*set_verbose) (DB_ENV *, u_int32_t, int); /* 32-bit offset=968 size=4, 64=bit offset=1792 size=8 */ - void* __toku_dummy20[1]; - int (*txn_begin) (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t); /* 32-bit offset=976 size=4, 64=bit offset=1808 
size=8 */ - int (*txn_checkpoint) (DB_ENV *, u_int32_t, u_int32_t, u_int32_t); /* 32-bit offset=980 size=4, 64=bit offset=1816 size=8 */ - int (*txn_recover) (DB_ENV *, DB_PREPLIST preplist[/*count*/], long count, /*out*/ long *retp, u_int32_t flags); /* 32-bit offset=984 size=4, 64=bit offset=1824 size=8 */ - int (*txn_stat) (DB_ENV *, DB_TXN_STAT **, u_int32_t); /* 32-bit offset=988 size=4, 64=bit offset=1832 size=8 */ - void* __toku_dummy21[2]; /* Padding at the end */ - char __toku_dummy22[16]; /* Padding at the end */ -}; -struct __toku_db_key_range { - double less; /* 32-bit offset=0 size=8, 64=bit offset=0 size=8 */ - double equal; /* 32-bit offset=8 size=8, 64=bit offset=8 size=8 */ - double greater; /* 32-bit offset=16 size=8, 64=bit offset=16 size=8 */ - void* __toku_dummy0[214]; /* Padding at the end */ - char __toku_dummy1[136]; /* Padding at the end */ -}; -struct __toku_db_lsn { - char __toku_dummy0[8]; /* Padding at the end */ -}; -struct __toku_dbt { - void*data; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - u_int32_t size; /* 32-bit offset=4 size=4, 64=bit offset=8 size=4 */ - u_int32_t ulen; /* 32-bit offset=8 size=4, 64=bit offset=12 size=4 */ - void* __toku_dummy0[1]; - char __toku_dummy1[8]; - u_int32_t flags; /* 32-bit offset=24 size=4, 64=bit offset=32 size=4 */ - /* 4 more bytes of alignment in the 64-bit case. 
*/ -}; -typedef struct __toku_descriptor { - DBT dbt; -} *DESCRIPTOR, DESCRIPTOR_S; -//One header is included in 'data' -//One header is included in 'additional for checkpoint' -typedef struct __toku_db_fragmentation { - uint64_t file_size_bytes; //Total file size in bytes - uint64_t data_bytes; //Compressed User Data in bytes - uint64_t data_blocks; //Number of blocks of compressed User Data - uint64_t checkpoint_bytes_additional; //Additional bytes used for checkpoint system - uint64_t checkpoint_blocks_additional; //Additional blocks used for checkpoint system - uint64_t unused_bytes; //Unused space in file - uint64_t unused_blocks; //Number of contiguous regions of unused space - uint64_t largest_unused_block; //Size of largest contiguous unused space -} *TOKU_DB_FRAGMENTATION, TOKU_DB_FRAGMENTATION_S; -struct __toku_db { - struct __toku_db_internal *i; -#define db_struct_i(x) ((x)->i) - int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); - int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *); - char __toku_dummy0[8]; - void *app_private; /* 32-bit offset=20 size=4, 64=bit offset=32 size=8 */ - DB_ENV *dbenv; /* 32-bit offset=24 size=4, 64=bit offset=40 size=8 */ - int (*pre_acquire_table_lock)(DB*, DB_TXN*); - int (*pre_acquire_fileops_lock)(DB*, DB_TXN*); - const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; - const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; - void (*get_max_row_size) (DB*, u_int32_t *max_key_size, u_int32_t *max_row_size); - DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; - int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. 
Available only while db is open */; - int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; - int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; - int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); - int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); - int (*get_readpagesize)(DB*,u_int32_t*); - int (*set_readpagesize)(DB*,u_int32_t); - int (*set_indexer)(DB*, DB_INDEXER*); - void (*get_indexer)(DB*, DB_INDEXER**); - int (*verify_with_progress)(DB *, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra, int verbose, int keep_going); - int (*update)(DB *, DB_TXN*, const DBT *key, const DBT *extra, u_int32_t flags); - int (*update_broadcast)(DB *, DB_TXN*, const DBT *extra, u_int32_t flags); - void* __toku_dummy1[23]; - char __toku_dummy2[80]; - void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */ - void* __toku_dummy3[5]; - int (*close) (DB*, u_int32_t); /* 32-bit offset=300 size=4, 64=bit offset=512 size=8 */ - void* __toku_dummy4[1]; - int (*cursor) (DB *, DB_TXN *, DBC **, u_int32_t); /* 32-bit offset=308 size=4, 64=bit offset=528 size=8 */ - int (*del) (DB *, DB_TXN *, DBT *, u_int32_t); /* 32-bit offset=312 size=4, 64=bit offset=536 size=8 */ - void* __toku_dummy5[3]; - int (*fd) (DB *, int *); /* 32-bit offset=328 size=4, 64=bit offset=568 size=8 */ - int (*get) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); /* 32-bit offset=332 size=4, 64=bit offset=576 size=8 */ - void* __toku_dummy6[8]; - int (*get_flags) (DB *, u_int32_t *); /* 32-bit offset=368 size=4, 64=bit offset=648 size=8 */ - void* __toku_dummy7[7]; - int (*get_pagesize) (DB *, u_int32_t *); /* 32-bit offset=400 size=4, 
64=bit offset=712 size=8 */ - void* __toku_dummy8[9]; - int (*key_range) (DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t); /* 32-bit offset=440 size=4, 64=bit offset=792 size=8 */ - int (*open) (DB *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int); /* 32-bit offset=444 size=4, 64=bit offset=800 size=8 */ - void* __toku_dummy9[1]; - int (*put) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t); /* 32-bit offset=452 size=4, 64=bit offset=816 size=8 */ - int (*remove) (DB *, const char *, const char *, u_int32_t); /* 32-bit offset=456 size=4, 64=bit offset=824 size=8 */ - int (*rename) (DB *, const char *, const char *, const char *, u_int32_t); /* 32-bit offset=460 size=4, 64=bit offset=832 size=8 */ - void* __toku_dummy10[9]; - void (*set_errfile) (DB *, FILE*); /* 32-bit offset=500 size=4, 64=bit offset=912 size=8 */ - void* __toku_dummy11[2]; - int (*set_flags) (DB *, u_int32_t); /* 32-bit offset=512 size=4, 64=bit offset=936 size=8 */ - void* __toku_dummy12[7]; - int (*set_pagesize) (DB *, u_int32_t); /* 32-bit offset=544 size=4, 64=bit offset=1000 size=8 */ - void* __toku_dummy13[7]; - int (*stat) (DB *, void *, u_int32_t); /* 32-bit offset=576 size=4, 64=bit offset=1064 size=8 */ - void* __toku_dummy14[2]; - int (*truncate) (DB *, DB_TXN *, u_int32_t *, u_int32_t); /* 32-bit offset=588 size=4, 64=bit offset=1088 size=8 */ - void* __toku_dummy15[1]; - int (*verify) (DB *, const char *, const char *, FILE *, u_int32_t); /* 32-bit offset=596 size=4, 64=bit offset=1104 size=8 */ - void* __toku_dummy16[5]; /* Padding at the end */ - char __toku_dummy17[16]; /* Padding at the end */ -}; -struct __toku_db_txn_active { - u_int32_t txnid; /* 32-bit offset=0 size=4, 64=bit offset=0 size=4 */ - void* __toku_dummy0[2]; - char __toku_dummy1[4]; - DB_LSN lsn; /* 32-bit offset=16 size=8, 64=bit offset=24 size=8 */ - char __toku_dummy2[200]; /* Padding at the end */ -}; -typedef struct __toku_txn_progress { - uint64_t entries_total; - uint64_t entries_processed; - 
uint8_t is_commit; - uint8_t stalled_on_checkpoint; -} *TOKU_TXN_PROGRESS, TOKU_TXN_PROGRESS_S; -typedef void(*TXN_PROGRESS_POLL_FUNCTION)(TOKU_TXN_PROGRESS, void*); -struct txn_stat { - u_int64_t rollback_raw_count; -}; -struct __toku_db_txn { - DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - DB_TXN *parent; /* 32-bit offset=4 size=4, 64=bit offset=8 size=8 */ - int (*txn_stat)(DB_TXN *, struct txn_stat **); - struct toku_list open_txns; - int (*commit_with_progress)(DB_TXN*, uint32_t, TXN_PROGRESS_POLL_FUNCTION, void*); - int (*abort_with_progress)(DB_TXN*, TXN_PROGRESS_POLL_FUNCTION, void*); - void* __toku_dummy0[14]; - char __toku_dummy1[8]; - void *api_internal; /* 32-bit offset=88 size=4, 64=bit offset=168 size=8 */ - void* __toku_dummy2[2]; - int (*abort) (DB_TXN *); /* 32-bit offset=100 size=4, 64=bit offset=192 size=8 */ - int (*commit) (DB_TXN*, u_int32_t); /* 32-bit offset=104 size=4, 64=bit offset=200 size=8 */ - void* __toku_dummy3[2]; - u_int32_t (*id) (DB_TXN *); /* 32-bit offset=116 size=4, 64=bit offset=224 size=8 */ - int (*prepare) (DB_TXN*, u_int8_t gid[DB_GID_SIZE]); /* 32-bit offset=120 size=4, 64=bit offset=232 size=8 */ - void* __toku_dummy4[4]; /* Padding at the end */ -}; -struct __toku_db_txn_stat { - void* __toku_dummy0[2]; - char __toku_dummy1[28]; - u_int32_t st_nactive; /* 32-bit offset=36 size=4, 64=bit offset=44 size=4 */ - void* __toku_dummy2[1]; - char __toku_dummy3[8]; - DB_TXN_ACTIVE *st_txnarray; /* 32-bit offset=52 size=4, 64=bit offset=64 size=8 */ - void* __toku_dummy4[1]; /* Padding at the end */ - char __toku_dummy5[8]; /* Padding at the end */ -}; -struct __toku_dbc { - DB *dbp; /* 32-bit offset=0 size=4, 64=bit offset=0 size=8 */ - int (*c_getf_first)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_last)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_next)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int 
(*c_getf_prev)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_current)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_current_binding)(DBC *, u_int32_t, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set_range)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_getf_set_range_reverse)(DBC *, u_int32_t, DBT *, YDB_CALLBACK_FUNCTION, void *); - int (*c_pre_acquire_range_lock)(DBC*, const DBT*, const DBT*); - void* __toku_dummy0[24]; - char __toku_dummy1[104]; - int (*c_close) (DBC *); /* 32-bit offset=244 size=4, 64=bit offset=384 size=8 */ - int (*c_count) (DBC *, db_recno_t *, u_int32_t); /* 32-bit offset=248 size=4, 64=bit offset=392 size=8 */ - int (*c_del) (DBC *, u_int32_t); /* 32-bit offset=252 size=4, 64=bit offset=400 size=8 */ - void* __toku_dummy2[1]; - int (*c_get) (DBC *, DBT *, DBT *, u_int32_t); /* 32-bit offset=260 size=4, 64=bit offset=416 size=8 */ - void* __toku_dummy3[10]; /* Padding at the end */ -}; -#ifdef _TOKUDB_WRAP_H -#define txn_begin txn_begin_tokudb -#endif -int db_env_create(DB_ENV **, u_int32_t) __attribute__((__visibility__("default"))); -int db_create(DB **, DB_ENV *, u_int32_t) __attribute__((__visibility__("default"))); -char *db_strerror(int) __attribute__((__visibility__("default"))); -const char *db_version(int*,int *,int *) __attribute__((__visibility__("default"))); -int log_compare (const DB_LSN*, const DB_LSN *) __attribute__((__visibility__("default"))); -int db_env_set_func_fsync (int (*)(int)) __attribute__((__visibility__("default"))); -int toku_set_trace_file (char *fname) __attribute__((__visibility__("default"))); -int toku_close_trace_file (void) __attribute__((__visibility__("default"))); -int db_env_set_func_free (void (*)(void*)) __attribute__((__visibility__("default"))); -int db_env_set_func_malloc (void *(*)(size_t)) __attribute__((__visibility__("default"))); -int 
db_env_set_func_realloc (void *(*)(void*, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_full_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_write (ssize_t (*)(int, const void *, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_full_write (ssize_t (*)(int, const void *, size_t)) __attribute__((__visibility__("default"))); -int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visibility__("default"))); -int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default"))); -int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default"))); -int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default"))); -int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default"))); -void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default"))); -void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_recover_callback (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_recover_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default"))); -void db_env_set_loader_size_factor (uint32_t) __attribute__((__visibility__("default"))); -void db_env_set_mvcc_garbage_collection_verification(u_int32_t) __attribute__((__visibility__("default"))); -void db_env_enable_engine_status(u_int32_t) __attribute__((__visibility__("default"))); -void db_env_set_flusher_thread_callback (void (*)(int, 
void*), void*) __attribute__((__visibility__("default"))); -#if defined(__cplusplus) -} -#endif -#endif diff --git a/buildheader/make_db_h.c b/buildheader/make_db_h.c index 3661352be6b..a5ca2c5cd0c 100644 --- a/buildheader/make_db_h.c +++ b/buildheader/make_db_h.c @@ -599,7 +599,6 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__ "DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */", "int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. Available only while db is open */", "int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */", - "int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */", "int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */", "int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra)", "int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION)", diff --git a/buildheader/tdb.h b/buildheader/tdb.h index 96b808ef2d1..ded74364a02 100644 --- a/buildheader/tdb.h +++ b/buildheader/tdb.h @@ -308,7 +308,6 @@ struct __toku_db { DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. 
Amortized (happens during flattening) */; int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); diff --git a/include/db.h b/include/db.h index 96b808ef2d1..ded74364a02 100644 --- a/include/db.h +++ b/include/db.h @@ -308,7 +308,6 @@ struct __toku_db { DESCRIPTOR descriptor /* saved row/dictionary descriptor for aiding in comparisons */; int (*change_descriptor) (DB*, DB_TXN*, const DBT* descriptor, u_int32_t) /* change row/dictionary descriptor for a db. Available only while db is open */; int (*getf_set)(DB*, DB_TXN*, u_int32_t, DBT*, YDB_CALLBACK_FUNCTION, void*) /* same as DBC->c_getf_set without a persistent cursor) */; - int (*flatten)(DB*, DB_TXN*) /* Flatten a dictionary, similar to (but faster than) a table scan */; int (*optimize)(DB*) /* Run garbage collecion and promote all transactions older than oldest. Amortized (happens during flattening) */; int (*hot_optimize)(DB*, int (*progress_callback)(void *progress_extra, float progress), void *progress_extra); int (*get_fragmentation)(DB*,TOKU_DB_FRAGMENTATION); diff --git a/newbrt/brt.c b/newbrt/brt.c index ed698b22437..1a754843a50 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -5276,13 +5276,6 @@ brt_search_node( return r; } -// When this is called, the cachetable lock is held -static void -unlock_root_tree_lock (void *v) { - struct brt_header* h = v; - toku_brtheader_release_treelock(h); -} - static int toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v, BRT_CURSOR brtcursor, BOOL can_bulk_fetch) // Effect: Perform a search. Associate cursor with a leaf if possible. 
@@ -5332,7 +5325,6 @@ try_again: u_int32_t fullhash; CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash); - fill_bfe_for_subset_read( &bfe, brt->h, @@ -5343,22 +5335,15 @@ try_again: brtcursor->right_is_pos_infty, brtcursor->disable_prefetching ); - struct unlockers root_unlockers = { - .locked = TRUE, - .f = unlock_root_tree_lock, - .extra = brt->h, - .next = NULL - }; - r = toku_pin_brtnode(brt, *rootp, fullhash,&root_unlockers,(ANCESTORS)NULL, &infinite_bounds, &bfe, TRUE, &node); - assert(r==0 || r== TOKUDB_TRY_AGAIN); - if (r == TOKUDB_TRY_AGAIN) { - // unlock_root_tree_lock will have released tree_lock of header - assert(!root_unlockers.locked); - root_tries++; - goto try_again; - } - assert(root_unlockers.locked); - + toku_pin_brtnode_off_client_thread( + brt->h, + *rootp, + fullhash, + &bfe, + 0, + NULL, + &node + ); toku_brtheader_release_treelock(brt->h); } @@ -5500,31 +5485,6 @@ toku_brt_cursor_current(BRT_CURSOR cursor, int op, BRT_GET_CALLBACK_FUNCTION get return getf(cursor->key.size, cursor->key.data, cursor->val.size, cursor->val.data, getf_v, false); // brt_cursor_copyout(cursor, outkey, outval); } -static int -brt_flatten_getf(ITEMLEN UU(keylen), bytevec UU(key), - ITEMLEN UU(vallen), bytevec UU(val), - void *UU(v), bool UU(lock_only)) { - return DB_NOTFOUND; -} - -int -toku_brt_flatten(BRT brt, TOKUTXN ttxn) -{ - BRT_CURSOR tmp_cursor; - int r = toku_brt_cursor(brt, &tmp_cursor, ttxn, FALSE, FALSE); - if (r!=0) return r; - brt_search_t search; brt_search_init(&search, brt_cursor_compare_one, BRT_SEARCH_LEFT, 0, tmp_cursor->brt); - r = brt_cursor_search(tmp_cursor, &search, brt_flatten_getf, NULL, FALSE); - brt_search_finish(&search); - if (r==DB_NOTFOUND) r = 0; - { - //Cleanup temporary cursor - int r2 = toku_brt_cursor_close(tmp_cursor); - if (r==0) r = r2; - } - return r; -} - int toku_brt_cursor_first(BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v) { @@ -5939,24 +5899,15 @@ toku_brt_keyrange (BRT brt, DBT 
*key, u_int64_t *less_p, u_int64_t *equal_p, u_i u_int32_t fullhash; CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt->h, &fullhash); - - - { - struct unlockers root_unlockers = { - .locked = TRUE, - .f = unlock_root_tree_lock, - .extra = brt->h, - .next = NULL - }; - int r = toku_pin_brtnode(brt, *rootp, fullhash, &root_unlockers,(ANCESTORS)NULL, &infinite_bounds, &bfe, FALSE, &node); - assert(r == 0 || r == TOKUDB_TRY_AGAIN); - if (r == TOKUDB_TRY_AGAIN) { - assert(!root_unlockers.locked); - goto try_again; - } - assert(root_unlockers.locked); - } - + toku_pin_brtnode_off_client_thread( + brt->h, + *rootp, + fullhash, + &bfe, + 0, + NULL, + &node + ); toku_brtheader_release_treelock(brt->h); } diff --git a/newbrt/brt.h b/newbrt/brt.h index e7744c656e7..c5e7a9dbc57 100644 --- a/newbrt/brt.h +++ b/newbrt/brt.h @@ -197,7 +197,6 @@ void toku_brt_cursor_set_range_lock(BRT_CURSOR, const DBT *, const DBT *, BOOL, // get is deprecated in favor of the individual functions below int toku_brt_cursor_get (BRT_CURSOR cursor, DBT *key, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v, int get_flags) __attribute__ ((warn_unused_result)); -int toku_brt_flatten(BRT, TOKUTXN ttxn) __attribute__ ((warn_unused_result)); int toku_brt_cursor_first(BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result)); int toku_brt_cursor_last(BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result)); int toku_brt_cursor_next(BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v) __attribute__ ((warn_unused_result)); diff --git a/newbrt/cachetable.c b/newbrt/cachetable.c index 4ae7ffd7e9d..cfd98a9f096 100644 --- a/newbrt/cachetable.c +++ b/newbrt/cachetable.c @@ -2492,10 +2492,10 @@ int toku_cachetable_get_and_pin_nonblocking ( note_hash_count(count); // - // In Dr. No, the ydb lock ensures that only one client may be successfully - // doing a query on a dictionary at any given time. 
This function - // is called with the ydb lock held. This implies that only ONE client can ever be - // in get_and_pin_nonblocking while the ydb lock is held. + // In Doofenshmirts, we keep the root to leaf path pinned + // as we perform a query on a dictionary at any given time. + // This implies that only ONE query client can ever be + // in get_and_pin_nonblocking for this dictionary. // So, if there is a write lock grabbed // on the PAIR that we want to lock, then some expensive operation // MUST be happening (read from disk, write to disk, flush, etc...), @@ -2516,13 +2516,9 @@ int toku_cachetable_get_and_pin_nonblocking ( if (partial_fetch_required) { p->state = CTPAIR_READING; run_unlockers(unlockers); // The contract says the unlockers are run with the ct lock being held. - if (ct->ydb_unlock_callback) ct->ydb_unlock_callback(); - // Now wait for the I/O to occur. - + // Now wait for the I/O to occur. do_partial_fetch(ct, cf, p, pf_callback, read_extraargs, FALSE); - cachetable_unlock(ct); - if (ct->ydb_lock_callback) ct->ydb_lock_callback(); return TOKUDB_TRY_AGAIN; } pair_touch(p); @@ -2535,7 +2531,6 @@ int toku_cachetable_get_and_pin_nonblocking ( } else { run_unlockers(unlockers); // The contract says the unlockers are run with the ct lock being held. - if (ct->ydb_unlock_callback) ct->ydb_unlock_callback(); // Now wait for the I/O to occur. // We need to obtain the lock (waiting for the write to finish), but then we only waited so we could wake up again if (p->checkpoint_pending) { @@ -2578,7 +2573,6 @@ int toku_cachetable_get_and_pin_nonblocking ( } } cachetable_unlock(ct); - if (ct->ydb_lock_callback) ct->ydb_lock_callback(); return TOKUDB_TRY_AGAIN; } } @@ -2600,13 +2594,11 @@ int toku_cachetable_get_and_pin_nonblocking ( assert(p); nb_mutex_write_lock(&p->nb_mutex, ct->mutex); run_unlockers(unlockers); // we hold the ct mutex.
- if (ct->ydb_unlock_callback) ct->ydb_unlock_callback(); u_int64_t t0 = get_tnow(); cachetable_fetch_pair(ct, cf, p, fetch_callback, read_extraargs, FALSE); cachetable_miss++; cachetable_misstime += get_tnow() - t0; cachetable_unlock(ct); - if (ct->ydb_lock_callback) ct->ydb_lock_callback(); return TOKUDB_TRY_AGAIN; } diff --git a/src/Makefile b/src/Makefile index 5744ae0d414..d34e084004e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -21,6 +21,11 @@ endif OBJS_RAW = \ ydb_lib \ ydb \ + ydb_cursor \ + ydb_row_lock \ + ydb_env_func \ + ydb_write \ + ydb_db \ errors \ dlmalloc \ loader \ diff --git a/src/elocks.c b/src/elocks.c index 0cba0d88c08..bca0d1bd220 100644 --- a/src/elocks.c +++ b/src/elocks.c @@ -131,9 +131,4 @@ toku_ydb_unlock_and_yield(unsigned long useconds) { ydb_unlock_internal(useconds); } -toku_pthread_mutex_t * -toku_ydb_mutex(void) { - return &ydb_big_lock.lock; -} - #undef STATUS_VALUE diff --git a/src/indexer-undo-do.c b/src/indexer-undo-do.c index 81366703360..049427d10d2 100644 --- a/src/indexer-undo-do.c +++ b/src/indexer-undo-do.c @@ -20,6 +20,7 @@ #include "leafentry.h" #include "ule.h" #include "xids.h" +#include "ydb_row_lock.h" #include "indexer-internal.h" diff --git a/src/loader.c b/src/loader.c index 2d33d8fdf32..bef38bbf6c6 100644 --- a/src/loader.c +++ b/src/loader.c @@ -20,6 +20,7 @@ #include "ydb_load.h" #include "checkpoint.h" #include "brt-internal.h" +#include "ydb_db.h" #define lazy_assert(a) assert(a) // indicates code is incomplete @@ -239,7 +240,7 @@ int toku_loader_create_loader(DB_ENV *env, } // time to open the big kahuna - if ( loader->i->loader_flags & LOADER_USE_PUTS ) { + if ( FALSE && (loader->i->loader_flags & LOADER_USE_PUTS) ) { XCALLOC_N(loader->i->N, loader->i->ekeys); XCALLOC_N(loader->i->N, loader->i->evals); for (int i=0; ii->loader_flags & LOADER_USE_PUTS ) { + if ( FALSE && (loader->i->loader_flags & LOADER_USE_PUTS) ) { r = loader->i->env->put_multiple(loader->i->env, loader->i->src_db, // src_db 
loader->i->txn, @@ -389,7 +390,7 @@ int toku_loader_close(DB_LOADER *loader) if ( loader->i->error_callback != NULL ) { loader->i->error_callback(loader->i->dbs[loader->i->err_i], loader->i->err_i, loader->i->err_errno, &loader->i->err_key, &loader->i->err_val, loader->i->error_extra); } - if ( !(loader->i->loader_flags & LOADER_USE_PUTS ) ) { + if (TRUE || !(loader->i->loader_flags & LOADER_USE_PUTS ) ) { r = toku_brt_loader_abort(loader->i->brt_loader, TRUE); } else { @@ -397,7 +398,7 @@ int toku_loader_close(DB_LOADER *loader) } } else { // no error outstanding - if ( !(loader->i->loader_flags & LOADER_USE_PUTS ) ) { + if (TRUE || !(loader->i->loader_flags & LOADER_USE_PUTS ) ) { // use the bulk loader // in case you've been looking - here is where the real work is done! r = toku_brt_loader_close(loader->i->brt_loader, @@ -436,7 +437,7 @@ int toku_loader_abort(DB_LOADER *loader) } } - if ( !(loader->i->loader_flags & LOADER_USE_PUTS) ) { + if (TRUE || !(loader->i->loader_flags & LOADER_USE_PUTS) ) { r = toku_brt_loader_abort(loader->i->brt_loader, TRUE); } toku_ydb_lock(); diff --git a/src/lock_tree/locktree.c b/src/lock_tree/locktree.c index 5fa45f569be..c4be3d9cf96 100644 --- a/src/lock_tree/locktree.c +++ b/src/lock_tree/locktree.c @@ -14,6 +14,7 @@ #include #include #include +#include /* TODO: Yoni should check that all asserts make sense instead of panic, and all early returns make sense instead of panic, @@ -86,11 +87,16 @@ toku_ltm_get_status(toku_ltm* mgr, LTM_STATUS statp) { *statp = mgr->status; } - - static inline int lt_panic(toku_lock_tree *tree, int r) { - return tree->panic(tree->db, r); + return tree->mgr->panic(tree->db, r); } + +// forward defs of lock request tree functions +static void toku_lock_request_tree_init(toku_lock_tree *tree); +static void toku_lock_request_tree_destroy(toku_lock_tree *tree); +static void toku_lock_request_tree_insert(toku_lock_tree *tree, toku_lock_request *lock_request); +static void 
toku_lock_request_tree_delete(toku_lock_tree *tree, toku_lock_request *lock_request); +static toku_lock_request *toku_lock_request_tree_find(toku_lock_tree *tree, TXNID id); const uint32_t __toku_default_buflen = 2; @@ -100,40 +106,6 @@ static const DBT __toku_lt_neg_infinity; const DBT* const toku_lt_infinity = &__toku_lt_infinity; const DBT* const toku_lt_neg_infinity = &__toku_lt_neg_infinity; -static toku_pthread_mutex_t * -toku_ltm_get_mutex(toku_ltm *ltm) { - toku_pthread_mutex_t *lock = ltm->use_lock; - if (lock == NULL) - lock = &ltm->lock; - return lock; -} - -void -toku_ltm_set_mutex(toku_ltm *ltm, toku_pthread_mutex_t *use_lock) { - ltm->use_lock = use_lock; -} - -static void -toku_ltm_init_mutex(toku_ltm *ltm) { - int r = toku_pthread_mutex_init(&ltm->lock, NULL); assert_zero(r); - ltm->use_lock = NULL; -} - -static void -toku_ltm_destroy_mutex(toku_ltm *ltm) { - int r = toku_pthread_mutex_destroy(&ltm->lock); assert_zero(r); -} - -void -toku_ltm_lock_mutex(toku_ltm *ltm) { - int r = toku_pthread_mutex_lock(toku_ltm_get_mutex(ltm)); assert_zero(r); -} - -void -toku_ltm_unlock_mutex(toku_ltm *ltm) { - int r = toku_pthread_mutex_unlock(toku_ltm_get_mutex(ltm)); assert_zero(r); -} - char* toku_lt_strerror(TOKU_LT_ERROR r) { if (r >= 0) @@ -143,6 +115,7 @@ toku_lt_strerror(TOKU_LT_ERROR r) { } return "Unknown error in locking data structures.\n"; } + /* Compare two payloads assuming that at least one of them is infinite */ static inline int infinite_compare(const DBT* a, const DBT* b) { @@ -224,8 +197,8 @@ toku_lt_point_cmp(const toku_point* x, const toku_point* y) { return infinite_compare(x->key_payload, y->key_payload); } return x->lt->compare_fun(x->lt->db, - recreate_DBT(&point_1, x->key_payload, x->key_len), - recreate_DBT(&point_2, y->key_payload, y->key_len)); + recreate_DBT(&point_1, x->key_payload, x->key_len), + recreate_DBT(&point_2, y->key_payload, y->key_len)); } /* Lock tree manager functions begin here */ @@ -233,60 +206,70 @@ int
toku_ltm_create(toku_ltm** pmgr, uint32_t locks_limit, uint64_t lock_memory_limit, - int (*panic)(DB*, int), - toku_dbt_cmp (*get_compare_fun_from_db)(DB*)) { + int (*panic)(DB*, int)) { int r = ENOSYS; - toku_ltm* tmp_mgr = NULL; + toku_ltm* mgr = NULL; if (!pmgr || !locks_limit) { r = EINVAL; goto cleanup; } - assert(panic && get_compare_fun_from_db); + assert(panic); - tmp_mgr = (toku_ltm*) toku_malloc(sizeof(*tmp_mgr)); - if (!tmp_mgr) { + mgr = (toku_ltm*) toku_malloc(sizeof(*mgr)); + if (!mgr) { r = ENOMEM; goto cleanup; } - memset(tmp_mgr, 0, sizeof(toku_ltm)); + memset(mgr, 0, sizeof(toku_ltm)); - r = toku_ltm_set_max_locks(tmp_mgr, locks_limit); + r = toku_ltm_set_max_locks(mgr, locks_limit); if (r != 0) goto cleanup; - r = toku_ltm_set_max_lock_memory(tmp_mgr, lock_memory_limit); + r = toku_ltm_set_max_lock_memory(mgr, lock_memory_limit); if (r != 0) goto cleanup; - tmp_mgr->panic = panic; - tmp_mgr->get_compare_fun_from_db = get_compare_fun_from_db; + mgr->panic = panic; - r = toku_lth_create(&tmp_mgr->lth); + r = toku_lth_create(&mgr->lth); if (r != 0) goto cleanup; - if (!tmp_mgr->lth) { + if (!mgr->lth) { r = ENOMEM; goto cleanup; } - r = toku_idlth_create(&tmp_mgr->idlth); + r = toku_idlth_create(&mgr->idlth); if (r != 0) goto cleanup; - if (!tmp_mgr->idlth) { + if (!mgr->idlth) { r = ENOMEM; goto cleanup; } - toku_ltm_init_mutex(tmp_mgr); + toku_mutex_init(&mgr->mutex, NULL); + DRD_IGNORE_VAR(mgr->status); r = 0; - *pmgr = tmp_mgr; + *pmgr = mgr; cleanup: if (r != 0) { - if (tmp_mgr) { - if (tmp_mgr->lth) - toku_lth_close(tmp_mgr->lth); - if (tmp_mgr->idlth) - toku_idlth_close(tmp_mgr->idlth); - toku_free(tmp_mgr); + if (mgr) { + if (mgr->lth) + toku_lth_close(mgr->lth); + if (mgr->idlth) + toku_idlth_close(mgr->idlth); + toku_free(mgr); } } return r; } +// For now, ltm_open does nothing. 
+int +toku_ltm_open(toku_ltm *mgr) { + int r; + if (!mgr) + r = EINVAL; + else + r = 0; + return r; +} + int toku_ltm_close(toku_ltm* mgr) { int r = ENOSYS; @@ -305,7 +288,8 @@ toku_ltm_close(toku_ltm* mgr) { } toku_lth_close(mgr->lth); toku_idlth_close(mgr->idlth); - toku_ltm_destroy_mutex(mgr); + toku_mutex_destroy(&mgr->mutex); + DRD_STOP_IGNORING_VAR(mgr->status); assert(mgr->curr_locks == 0 && mgr->curr_lock_memory == 0); toku_free(mgr); @@ -314,7 +298,6 @@ cleanup: return r; } - int toku_ltm_get_max_locks(toku_ltm* mgr, uint32_t* locks_limit) { if (!mgr || !locks_limit) @@ -354,15 +337,15 @@ toku_ltm_set_max_lock_memory(toku_ltm* mgr, uint64_t lock_memory_limit) { static inline void ltm_incr_locks(toku_ltm* tree_mgr, uint32_t replace_locks) { assert(replace_locks <= tree_mgr->curr_locks); - tree_mgr->curr_locks -= replace_locks; - tree_mgr->curr_locks += 1; + (void) __sync_fetch_and_sub(&tree_mgr->curr_locks, replace_locks); + (void) __sync_fetch_and_add(&tree_mgr->curr_locks, 1); } static inline void ltm_decr_locks(toku_ltm* tree_mgr, uint32_t locks) { assert(tree_mgr); assert(tree_mgr->curr_locks >= locks); - tree_mgr->curr_locks -= locks; + (void) __sync_fetch_and_sub(&tree_mgr->curr_locks, locks); } static int @@ -375,7 +358,7 @@ ltm_out_of_locks(toku_ltm *mgr) { static void ltm_incr_lock_memory(toku_ltm *mgr, size_t s) { - mgr->curr_lock_memory += s; + (void) __sync_add_and_fetch(&mgr->curr_lock_memory, s); } void @@ -387,7 +370,7 @@ toku_ltm_incr_lock_memory(void *extra, size_t s) { static void ltm_decr_lock_memory(toku_ltm *mgr, size_t s) { assert(mgr->curr_lock_memory >= s); - mgr->curr_lock_memory -= s; + (void) __sync_sub_and_fetch(&mgr->curr_lock_memory, s); } void @@ -468,21 +451,8 @@ toku_lt_ifexist_selfwrite(toku_lock_tree* tree, TXNID txn) { static inline int lt_add_locked_txn(toku_lock_tree* tree, TXNID txn) { - int r = ENOSYS; - bool half_done = FALSE; - /* Neither selfread nor selfwrite exist. 
*/ - r = toku_rth_insert(tree->rth, txn); - if (r != 0) - goto cleanup; - r = toku_rth_insert(tree->txns_still_locked, txn); - if (r != 0) { - half_done = TRUE; goto cleanup; - } - r = 0; -cleanup: - if (half_done) - toku_rth_delete(tree->rth, txn); + int r = toku_rth_insert(tree->rth, txn); return r; } @@ -1128,21 +1098,16 @@ r_backwards(toku_interval* range) { (toku_lt_point_cmp(left, right) > 0)); } -static inline int lt_unlock_deferred_txns(toku_lock_tree* tree); - static inline void lt_set_comparison_functions(toku_lock_tree* tree, DB* db) { - assert(!tree->db && !tree->compare_fun); + assert(!tree->db); tree->db = db; - tree->compare_fun = tree->get_compare_fun_from_db(tree->db); - assert(tree->compare_fun); } static inline void lt_clear_comparison_functions(toku_lock_tree* tree) { assert(tree); tree->db = NULL; - tree->compare_fun = NULL; } /* Preprocess step for acquire functions. */ @@ -1155,10 +1120,6 @@ lt_preprocess(toku_lock_tree* tree, DB* db, toku_interval* query) { int r = ENOSYS; - if (!tree || !db || !key_left || !key_right) { - r = EINVAL; goto cleanup; - } - /* Verify that NULL keys have payload and size that are mutually consistent*/ if ((r = lt_verify_null_key(key_left)) != 0) @@ -1179,11 +1140,7 @@ lt_preprocess(toku_lock_tree* tree, DB* db, r = 0; cleanup: if (r == 0) { - assert(tree->db && tree->compare_fun); - /* Cleanup all existing deleted transactions */ - if (!toku_rth_is_empty(tree->txns_to_unlock)) { - r = lt_unlock_deferred_txns(tree); - } + assert(tree->db); } return r; } @@ -1359,22 +1316,19 @@ lt_borderwrite_insert(toku_lock_tree* tree, toku_interval* query, toku_range* to /* TODO: Investigate better way of passing comparison functions. 
*/ int toku_lt_create(toku_lock_tree** ptree, - int (*panic)(DB*, int), toku_ltm* mgr, - toku_dbt_cmp (*get_compare_fun_from_db)(DB*)) { + toku_dbt_cmp compare_fun) { int r = ENOSYS; toku_lock_tree* tmp_tree = NULL; - if (!ptree || !mgr || - !get_compare_fun_from_db || !panic) { + if (!ptree || !mgr || !compare_fun) { r = EINVAL; goto cleanup; } tmp_tree = (toku_lock_tree*)toku_malloc(sizeof(*tmp_tree)); if (!tmp_tree) { r = ENOMEM; goto cleanup; } memset(tmp_tree, 0, sizeof(toku_lock_tree)); - tmp_tree->panic = panic; tmp_tree->mgr = mgr; - tmp_tree->get_compare_fun_from_db = get_compare_fun_from_db; + tmp_tree->compare_fun = compare_fun; tmp_tree->lock_escalation_allowed = TRUE; r = toku_rt_create(&tmp_tree->borderwrite, toku_lt_point_cmp, lt_txn_cmp, FALSE, toku_ltm_incr_lock_memory, toku_ltm_decr_lock_memory, mgr); @@ -1384,12 +1338,7 @@ toku_lt_create(toku_lock_tree** ptree, r = toku_rth_create(&tmp_tree->rth); if (r != 0) goto cleanup; - r = toku_rth_create(&tmp_tree->txns_to_unlock); - if (r != 0) - goto cleanup; - r = toku_rth_create(&tmp_tree->txns_still_locked); - if (r != 0) - goto cleanup; + tmp_tree->buflen = __toku_default_buflen; tmp_tree->buf = (toku_range*) toku_malloc(tmp_tree->buflen * sizeof(toku_range)); @@ -1404,7 +1353,7 @@ toku_lt_create(toku_lock_tree** ptree, if (r != 0) goto cleanup; toku_lock_request_tree_init(tmp_tree); - + toku_mutex_init(&tmp_tree->mutex, NULL); tmp_tree->ref_count = 1; *ptree = tmp_tree; r = 0; @@ -1415,8 +1364,6 @@ cleanup: toku_rt_close(tmp_tree->borderwrite); if (tmp_tree->rth) toku_rth_close(tmp_tree->rth); - if (tmp_tree->txns_to_unlock) - toku_rth_close(tmp_tree->txns_to_unlock); if (tmp_tree->buf) toku_free(tmp_tree->buf); if (tmp_tree->bw_buf) @@ -1433,13 +1380,14 @@ void toku_ltm_invalidate_lt(toku_ltm* mgr, DICTIONARY_ID dict_id) { assert(mgr && dict_id.dictid != DICTIONARY_ID_NONE.dictid); toku_lt_map* map = NULL; + toku_mutex_lock(&mgr->mutex); map = toku_idlth_find(mgr->idlth, dict_id); if (map) { 
toku_idlth_delete(mgr->idlth, dict_id); } + toku_mutex_unlock(&mgr->mutex); } - static inline void toku_lt_set_dict_id(toku_lock_tree* lt, DICTIONARY_ID dict_id) { assert(lt && dict_id.dictid != DICTIONARY_ID_NONE.dictid); @@ -1450,8 +1398,7 @@ static void lt_add_db(toku_lock_tree* tree, DB *db); static void lt_remove_db(toku_lock_tree* tree, DB *db); int -toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, - DICTIONARY_ID dict_id, DB *db) { +toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, DICTIONARY_ID dict_id, DB *db, toku_dbt_cmp compare_fun) { /* first look in hash table to see if lock tree exists for that db, if so return it */ int r = ENOSYS; @@ -1461,6 +1408,7 @@ toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, bool added_to_idlth = FALSE; bool added_extant_db = FALSE; + toku_mutex_lock(&mgr->mutex); map = toku_idlth_find(mgr->idlth, dict_id); if (map != NULL) { /* Load already existing lock tree. */ @@ -1473,7 +1421,7 @@ toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, goto cleanup; } /* Must create new lock tree for this dict_id*/ - r = toku_lt_create(&tree, mgr->panic, mgr, mgr->get_compare_fun_from_db); + r = toku_lt_create(&tree, mgr, compare_fun); if (r != 0) goto cleanup; toku_lt_set_dict_id(tree, dict_id); @@ -1500,6 +1448,7 @@ toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, *ptree = tree; r = 0; cleanup: + toku_mutex_unlock(&mgr->mutex); if (r == 0) { mgr->STATUS_VALUE(LTM_LT_CREATE)++; mgr->STATUS_VALUE(LTM_LT_NUM)++; @@ -1508,6 +1457,7 @@ cleanup: } else { if (tree != NULL) { + toku_mutex_lock(&mgr->mutex); if (added_to_ltm) toku_ltm_remove_lt(mgr, tree); if (added_to_idlth) @@ -1515,6 +1465,7 @@ cleanup: if (added_extant_db) lt_remove_db(tree, db); toku_lt_close(tree); + toku_mutex_unlock(&mgr->mutex); } mgr->STATUS_VALUE(LTM_LT_CREATE_FAIL)++; } @@ -1547,10 +1498,8 @@ toku_lt_close(toku_lock_tree* tree) { first_error = r; } toku_rth_close(tree->rth); - toku_rth_close(tree->txns_to_unlock); - 
toku_rth_close(tree->txns_still_locked); toku_omt_destroy(&tree->dbs); - + toku_mutex_destroy(&tree->mutex); toku_free(tree->buf); toku_free(tree->bw_buf); toku_free(tree->verify_buf); @@ -1560,13 +1509,6 @@ cleanup: return r; } -// toku_lt_acquire_read_lock() used only by test programs -int -toku_lt_acquire_read_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key) { - return toku_lt_acquire_range_read_lock(tree, db, txn, key, key); -} - - static int lt_try_acquire_range_read_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { int r; @@ -1871,8 +1813,8 @@ cleanup: return r; } -int -toku_lt_acquire_range_read_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { +static int +toku_lt_acquire_range_read_lock_internal(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { int r = ENOSYS; r = lt_try_acquire_range_read_lock(tree, db, txn, @@ -1904,6 +1846,24 @@ toku_lt_acquire_range_read_lock(toku_lock_tree* tree, DB* db, TXNID txn, const D return r; } +int +toku_lt_acquire_range_read_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT *key_right) { + int r = 0; + if (!tree || !db || !key_left || !key_right) + r = EINVAL; + if (r == 0) { + toku_mutex_lock(&tree->mutex); + r = toku_lt_acquire_range_read_lock_internal(tree, db, txn, key_left, key_right); + toku_mutex_unlock(&tree->mutex); + } + return r; +} + +int +toku_lt_acquire_read_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key) { + return toku_lt_acquire_range_read_lock(tree, db, txn, key, key); +} + static int lt_try_acquire_range_write_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { int r; @@ -1990,8 +1950,8 @@ cleanup: return r; } -int -toku_lt_acquire_range_write_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { +static int 
+toku_lt_acquire_range_write_lock_internal(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { int r = ENOSYS; r = lt_try_acquire_range_write_lock(tree, db, txn, key_left, key_right); @@ -2021,6 +1981,19 @@ toku_lt_acquire_range_write_lock(toku_lock_tree* tree, DB* db, TXNID txn, const return r; } +int +toku_lt_acquire_range_write_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key_left, const DBT* key_right) { + int r = 0; + if (!tree || !db || !key_left || !key_right) + r = EINVAL; + if (r == 0) { + toku_mutex_lock(&tree->mutex); + r = toku_lt_acquire_range_write_lock_internal(tree, db, txn, key_left, key_right); + toku_mutex_unlock(&tree->mutex); + } + return r; +} + int toku_lt_acquire_write_lock(toku_lock_tree* tree, DB* db, TXNID txn, const DBT* key) { return toku_lt_acquire_range_write_lock(tree, db, txn, key, key); @@ -2130,24 +2103,6 @@ lt_border_delete(toku_lock_tree* tree, toku_range_tree* rt) { return 0; } -static inline int -lt_defer_unlocking_txn(toku_lock_tree* tree, TXNID txnid) { - int r = ENOSYS; - - rt_forest* forest = toku_rth_find(tree->txns_to_unlock, txnid); - /* Should not be unlocking a transaction twice. 
*/ - assert(!forest); - r = toku_rth_insert(tree->txns_to_unlock, txnid); - if (r != 0) - goto cleanup; - if (toku_rth_find(tree->txns_still_locked, txnid) != NULL) { - toku_rth_delete(tree->txns_still_locked, txnid); - } - r = 0; -cleanup: - return r; -} - static inline int lt_unlock_txn(toku_lock_tree* tree, TXNID txn) { if (!tree) @@ -2169,7 +2124,17 @@ lt_unlock_txn(toku_lock_tree* tree, TXNID txn) { if (selfwrite) { uint32_t size = toku_rt_get_size(selfwrite); ranges += size; + + DB *db = NULL; + if (toku_omt_size(tree->dbs) > 0) { + OMTVALUE dbv; + r = toku_omt_fetch(tree->dbs, 0, &dbv); + assert_zero(r); + db = dbv; + } + lt_set_comparison_functions(tree, db); r = lt_border_delete(tree, selfwrite); + lt_clear_comparison_functions(tree); if (r != 0) return lt_panic(tree, r); r = lt_free_contents(tree, selfwrite); @@ -2185,69 +2150,22 @@ lt_unlock_txn(toku_lock_tree* tree, TXNID txn) { return 0; } -static inline int -lt_unlock_deferred_txns(toku_lock_tree* tree) { - int r = ENOSYS; - toku_rth_start_scan(tree->txns_to_unlock); - rt_forest* forest = NULL; - while ((forest = toku_rth_next(tree->txns_to_unlock)) != NULL) { - /* This can only fail with a panic so it is fine to quit immediately. 
*/ - r = lt_unlock_txn(tree, forest->hash_key); - if (r != 0) - goto cleanup; - } - toku_rth_clear(tree->txns_to_unlock); - r = 0; -cleanup: - return r; -} - -static inline void -lt_clear(toku_lock_tree* tree) { - int r; - - assert(tree); - toku_rt_clear(tree->borderwrite); - - toku_rth_start_scan(tree->rth); - rt_forest* forest; - uint32_t ranges = 0; - while ((forest = toku_rth_next(tree->rth)) != NULL) { - if (forest->self_read) { - ranges += toku_rt_get_size(forest->self_read); - r = lt_free_contents(tree, forest->self_read); - assert_zero(r); - } - if (forest->self_write) { - ranges += toku_rt_get_size(forest->self_write); - r = lt_free_contents(tree, forest->self_write); - assert_zero(r); - } - - } - toku_rth_clear(tree->rth); - toku_rth_clear(tree->txns_to_unlock); - /* tree->txns_still_locked is already empty, so we do not clear it. */ - ltm_decr_locks(tree->mgr, ranges); -} +static void lt_retry_lock_requests(toku_lock_tree *tree); int -toku_lt_unlock(toku_lock_tree* tree, TXNID txn) { - int r = ENOSYS; - if (!tree) { - r = EINVAL; goto cleanup; - } +toku_lt_unlock_txn(toku_lock_tree* tree, TXNID txn) { #if TOKU_LT_DEBUG if (toku_lt_debug) printf("%s:%u %lu\n", __FUNCTION__, __LINE__, txn); #endif - r = lt_defer_unlocking_txn(tree, txn); - if (r != 0) - goto cleanup; - if (toku_rth_is_empty(tree->txns_still_locked)) - lt_clear(tree); - toku_lt_retry_lock_requests_locked(tree); - r = 0; + int r = 0; + if (!tree) { + r = EINVAL; goto cleanup; + } + toku_mutex_lock(&tree->mutex); + lt_unlock_txn(tree, txn); + lt_retry_lock_requests(tree); + toku_mutex_unlock(&tree->mutex); cleanup: return r; } @@ -2256,16 +2174,18 @@ void toku_lt_add_ref(toku_lock_tree* tree) { assert(tree); assert(tree->ref_count > 0); - tree->ref_count++; + (void) __sync_add_and_fetch(&tree->ref_count, 1); } static void toku_ltm_stop_managing_lt(toku_ltm* mgr, toku_lock_tree* tree) { + toku_mutex_lock(&mgr->mutex); toku_ltm_remove_lt(mgr, tree); toku_lt_map* map = 
toku_idlth_find(mgr->idlth, tree->dict_id); if (map && map->tree == tree) { toku_idlth_delete(mgr->idlth, tree->dict_id); } + toku_mutex_unlock(&mgr->mutex); } int @@ -2273,8 +2193,8 @@ toku_lt_remove_ref(toku_lock_tree* tree) { int r = ENOSYS; assert(tree); assert(tree->ref_count > 0); - tree->ref_count--; - if (tree->ref_count > 0) { + uint32_t ref_count = __sync_sub_and_fetch(&tree->ref_count, 1); + if (ref_count > 0) { r = 0; goto cleanup; } assert(tree->dict_id.dictid != DICTIONARY_ID_NONE.dictid); @@ -2302,6 +2222,7 @@ find_db (OMTVALUE v, void *dbv) { static void lt_add_db(toku_lock_tree* tree, DB *db) { + toku_mutex_lock(&tree->mutex); if (db != NULL) { int r; OMTVALUE get_dbv = NULL; @@ -2311,10 +2232,12 @@ lt_add_db(toku_lock_tree* tree, DB *db) { r = toku_omt_insert_at(tree->dbs, db, index); assert_zero(r); } + toku_mutex_unlock(&tree->mutex); } static void lt_remove_db(toku_lock_tree* tree, DB *db) { + toku_mutex_lock(&tree->mutex); if (db != NULL) { int r; OMTVALUE get_dbv = NULL; @@ -2325,13 +2248,13 @@ lt_remove_db(toku_lock_tree* tree, DB *db) { r = toku_omt_delete_at(tree->dbs, index); assert_zero(r); } + toku_mutex_unlock(&tree->mutex); } void toku_lt_remove_db_ref(toku_lock_tree* tree, DB *db) { - int r; lt_remove_db(tree, db); - r = toku_lt_remove_ref(tree); + int r = toku_lt_remove_ref(tree); assert_zero(r); } @@ -2384,8 +2307,12 @@ toku_lock_request_init(toku_lock_request *lock_request, DB *db, TXNID txnid, con void toku_lock_request_destroy(toku_lock_request *lock_request) { - if (lock_request->state == LOCK_REQUEST_PENDING) + if (lock_request->state == LOCK_REQUEST_PENDING) { + toku_lock_tree *tree = lock_request->tree; + toku_mutex_lock(&tree->mutex); toku_lock_request_tree_delete(lock_request->tree, lock_request); + toku_mutex_unlock(&tree->mutex); + } toku_lock_request_destroy_wait(lock_request); toku_free(lock_request->key_left_copy.data); toku_free(lock_request->key_right_copy.data); @@ -2399,8 +2326,8 @@ 
toku_lock_request_complete(toku_lock_request *lock_request, int complete_r) { static const struct timeval max_timeval = { ~0, 0 }; -int -toku_lock_request_wait(toku_lock_request *lock_request, toku_lock_tree *tree, struct timeval *wait_time) { +static int +toku_lock_request_wait_internal(toku_lock_request *lock_request, toku_lock_tree *tree, struct timeval *wait_time, bool tree_locked) { #if TOKU_LT_DEBUG if (toku_lt_debug) printf("%s:%u %lu\n", __FUNCTION__, __LINE__, lock_request->txnid); @@ -2414,43 +2341,56 @@ toku_lock_request_wait(toku_lock_request *lock_request, toku_lock_tree *tree, st long int d_sec = usec / 1000000; long int d_usec = usec % 1000000; struct timespec ts = { sec + d_sec, d_usec * 1000 }; + if (!tree_locked) toku_mutex_lock(&tree->mutex); while (lock_request->state == LOCK_REQUEST_PENDING) { toku_lock_request_init_wait(lock_request); - r = pthread_cond_timedwait(&lock_request->wait, toku_ltm_get_mutex(tree->mgr), &ts); + r = pthread_cond_timedwait(&lock_request->wait, &tree->mutex, &ts); assert(r == 0 || r == ETIMEDOUT); if (r == ETIMEDOUT && lock_request->state == LOCK_REQUEST_PENDING) { toku_lock_request_tree_delete(tree, lock_request); toku_lock_request_complete(lock_request, DB_LOCK_NOTGRANTED); } } + if (!tree_locked) toku_mutex_unlock(&tree->mutex); } else { + if (!tree_locked) toku_mutex_lock(&tree->mutex); while (lock_request->state == LOCK_REQUEST_PENDING) { toku_lock_request_init_wait(lock_request); - r = toku_pthread_cond_wait(&lock_request->wait, toku_ltm_get_mutex(tree->mgr)); assert_zero(r); + r = toku_pthread_cond_wait(&lock_request->wait, &tree->mutex); assert_zero(r); } + if (!tree_locked) toku_mutex_unlock(&tree->mutex); } assert(lock_request->state == LOCK_REQUEST_COMPLETE); return lock_request->complete_r; } -int -toku_lock_request_wait_with_default_timeout(toku_lock_request *lock_request, toku_lock_tree *tree) { - return toku_lock_request_wait(lock_request, tree, &tree->mgr->lock_wait_time); +int 
+toku_lock_request_wait(toku_lock_request *lock_request, toku_lock_tree *tree, struct timeval *wait_time) { + return toku_lock_request_wait_internal(lock_request, tree, wait_time, false); } -void +int +toku_lock_request_wait_with_default_timeout(toku_lock_request *lock_request, toku_lock_tree *tree) { + return toku_lock_request_wait_internal(lock_request, tree, &tree->mgr->lock_wait_time, false); +} + +void toku_lock_request_wakeup(toku_lock_request *lock_request, toku_lock_tree *tree UU()) { if (lock_request->wait_initialized) { int r = toku_pthread_cond_broadcast(&lock_request->wait); assert_zero(r); } } -void +// a lock request tree contains pending lock requests. +// initialize a lock request tree. +static void toku_lock_request_tree_init(toku_lock_tree *tree) { int r = toku_omt_create(&tree->lock_requests); assert_zero(r); } -void +// destroy a lock request tree. +// the tree must be empty when destroyed. +static void toku_lock_request_tree_destroy(toku_lock_tree *tree) { assert(toku_omt_size(tree->lock_requests) == 0); toku_omt_destroy(&tree->lock_requests); @@ -2467,7 +2407,8 @@ compare_lock_request(OMTVALUE a, void *b) { return 0; } -void +// insert a lock request into the tree. +static void toku_lock_request_tree_insert(toku_lock_tree *tree, toku_lock_request *lock_request) { lock_request->tree = tree; int r; @@ -2477,7 +2418,8 @@ toku_lock_request_tree_insert(toku_lock_tree *tree, toku_lock_request *lock_requ r = toku_omt_insert_at(tree->lock_requests, lock_request, idx); assert_zero(r); } -void +// delete a lock request from the tree. +static void toku_lock_request_tree_delete(toku_lock_tree *tree, toku_lock_request *lock_request) { int r; OMTVALUE v; @@ -2488,7 +2430,8 @@ toku_lock_request_tree_delete(toku_lock_tree *tree, toku_lock_request *lock_requ } } -toku_lock_request * +// find a lock request for a given transaction id. 
+static toku_lock_request * toku_lock_request_tree_find(toku_lock_tree *tree, TXNID id) { int r; OMTVALUE v; @@ -2531,14 +2474,16 @@ static void print_key(const char *sp, const DBT *k) { } #endif -int +static void toku_lt_check_deadlock(toku_lock_tree *tree, toku_lock_request *a_lock_request); + +static int toku_lock_request_start_locked(toku_lock_request *lock_request, toku_lock_tree *tree, bool copy_keys_if_not_granted) { int r; assert(lock_request->state == LOCK_REQUEST_INIT); if (lock_request->type == LOCK_REQUEST_READ) { - r = toku_lt_acquire_range_read_lock(tree, lock_request->db, lock_request->txnid, lock_request->key_left, lock_request->key_right); + r = toku_lt_acquire_range_read_lock_internal(tree, lock_request->db, lock_request->txnid, lock_request->key_left, lock_request->key_right); } else if (lock_request->type == LOCK_REQUEST_WRITE) { - r = toku_lt_acquire_range_write_lock(tree, lock_request->db, lock_request->txnid, lock_request->key_left, lock_request->key_right); + r = toku_lt_acquire_range_write_lock_internal(tree, lock_request->db, lock_request->txnid, lock_request->key_left, lock_request->key_right); } else assert(0); #if TOKU_LT_DEBUG @@ -2572,43 +2517,36 @@ toku_lock_request_start_locked(toku_lock_request *lock_request, toku_lock_tree * int toku_lock_request_start(toku_lock_request *lock_request, toku_lock_tree *tree, bool copy_keys_if_not_granted) { - toku_ltm_lock_mutex(tree->mgr); + toku_mutex_lock(&tree->mutex); int r = toku_lock_request_start_locked(lock_request, tree, copy_keys_if_not_granted); - toku_ltm_unlock_mutex(tree->mgr); + toku_mutex_unlock(&tree->mutex); return r; } -int +static int toku_lt_acquire_lock_request_with_timeout_locked(toku_lock_tree *tree, toku_lock_request *lock_request, struct timeval *wait_time) { int r = toku_lock_request_start_locked(lock_request, tree, false); if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait(lock_request, tree, wait_time); + r = toku_lock_request_wait_internal(lock_request, tree, 
wait_time, true); return r; } int toku_lt_acquire_lock_request_with_timeout(toku_lock_tree *tree, toku_lock_request *lock_request, struct timeval *wait_time) { - toku_ltm_lock_mutex(tree->mgr); + toku_mutex_lock(&tree->mutex); int r = toku_lt_acquire_lock_request_with_timeout_locked(tree, lock_request, wait_time); - toku_ltm_unlock_mutex(tree->mgr); + toku_mutex_unlock(&tree->mutex); return r; } -int -toku_lt_acquire_lock_request_with_default_timeout_locked(toku_lock_tree *tree, toku_lock_request *lock_request) { - return toku_lt_acquire_lock_request_with_timeout_locked(tree, lock_request, &tree->mgr->lock_wait_time); -} - -int +int toku_lt_acquire_lock_request_with_default_timeout(toku_lock_tree *tree, toku_lock_request *lock_request) { - toku_ltm_lock_mutex(tree->mgr); - int r = toku_lt_acquire_lock_request_with_timeout_locked(tree, lock_request, &tree->mgr->lock_wait_time); - toku_ltm_unlock_mutex(tree->mgr); + int r = toku_lt_acquire_lock_request_with_timeout(tree, lock_request, &tree->mgr->lock_wait_time); return r; } -void -toku_lt_retry_lock_requests_locked(toku_lock_tree *tree) { +static void +lt_retry_lock_requests(toku_lock_tree *tree) { int r; for (uint32_t i = 0; i < toku_omt_size(tree->lock_requests); ) { OMTVALUE v; @@ -2627,6 +2565,13 @@ toku_lt_retry_lock_requests_locked(toku_lock_tree *tree) { } } +void +toku_lt_retry_lock_requests(toku_lock_tree *tree) { + toku_mutex_lock(&tree->mutex); + lt_retry_lock_requests(tree); + toku_mutex_unlock(&tree->mutex); +} + #include #include "wfg.h" @@ -2653,7 +2598,7 @@ build_wfg_for_a_lock_request(toku_lock_tree *tree, struct wfg *wfg, toku_lock_re } // check if a given lock request could deadlock with any granted locks. 
-void +static void toku_lt_check_deadlock(toku_lock_tree *tree, toku_lock_request *a_lock_request) { // init the wfg struct wfg wfg_static; @@ -2821,9 +2766,11 @@ lt_verify(toku_lock_tree *lt) { void toku_lt_verify(toku_lock_tree *lt, DB *db) { + toku_mutex_lock(<->mutex); lt_set_comparison_functions(lt, db); lt_verify(lt); lt_clear_comparison_functions(lt); + toku_mutex_unlock(<->mutex); } #undef STATUS_VALUE diff --git a/src/lock_tree/locktree.h b/src/lock_tree/locktree.h index f222ca291db..361f871400d 100644 --- a/src/lock_tree/locktree.h +++ b/src/lock_tree/locktree.h @@ -59,26 +59,26 @@ typedef struct __toku_ltm toku_ltm; /** \brief The lock tree structure */ struct __toku_lock_tree { + /** Lock tree manager */ + toku_ltm* mgr; /** The database for which this locktree will be handling locks */ DB* db; toku_range_tree* borderwrite; /**< See design document */ toku_rth* rth; /**< Stores local(read|write)set tables */ - /** - Stores a list of transactions to unlock when it is safe. - When we get a PUT or a GET, the comparison function is valid - and we can delete locks held in txns_to_unlock, even if txns_still_locked - is nonempty. - */ - toku_rth* txns_to_unlock; - /** Stores a list of transactions that hold locks. txns_still_locked = rth - txns_to_unlock - rth != txns_still_locked + txns_to_unlock, we may get an unlock call for a txn that has - no locks in rth. - When txns_still_locked becomes empty, we can throw away the contents of the lock tree - quickly. */ - toku_rth* txns_still_locked; + /** Whether lock escalation is allowed. */ + bool lock_escalation_allowed; + /** Function to retrieve the key compare function from the database. */ + toku_dbt_cmp compare_fun; + /** The number of references held by DB instances and transactions to this lock tree*/ + uint32_t ref_count; + /** DICTIONARY_ID associated with the lock tree */ + DICTIONARY_ID dict_id; + OMT dbs; //The extant dbs using this lock tree. 
+ OMT lock_requests; + toku_pthread_mutex_t mutex; + /** A temporary area where we store the results of various find on the range trees that this lock tree owns - Memory ownership: - tree->buf is an array of toku_range's, which the lt owns The contents of tree->buf are volatile (this is a buffer space @@ -98,26 +98,8 @@ struct __toku_lock_tree { uint32_t bw_buflen; toku_range* verify_buf; uint32_t verify_buflen; - /** Whether lock escalation is allowed. */ - bool lock_escalation_allowed; - /** Lock tree manager */ - toku_ltm* mgr; - /** Function to retrieve the key compare function from the database. */ - toku_dbt_cmp (*get_compare_fun_from_db)(DB*); - /** The key compare function */ - toku_dbt_cmp compare_fun; - /** The panic function */ - int (*panic)(DB*, int); - /** The number of references held by DB instances and transactions to this lock tree*/ - uint32_t ref_count; - /** DICTIONARY_ID associated with the lock tree */ - DICTIONARY_ID dict_id; - OMT dbs; //The extant dbs using this lock tree. - - OMT lock_requests; }; - typedef enum { LTM_LOCKS_LIMIT, // number of locks allowed (obsolete) LTM_LOCKS_CURR, // number of locks in existence @@ -162,13 +144,10 @@ struct __toku_ltm { is retrieved from this list, otherwise, a new lock tree is created and the new mapping of DB and Lock tree is stored here */ toku_idlth* idlth; - /** Function to retrieve the key compare function from the database. */ - toku_dbt_cmp (*get_compare_fun_from_db)(DB*); /** The panic function */ int (*panic)(DB*, int); - toku_pthread_mutex_t lock; - toku_pthread_mutex_t *use_lock; + toku_pthread_mutex_t mutex; struct timeval lock_wait_time; }; @@ -203,10 +182,6 @@ typedef struct __toku_point toku_point; Create a lock tree. Should be called only inside DB->open. \param ptree We set *ptree to the newly allocated tree. - \param get_compare_fun_from_db Accessor for the key compare function. - \param panic The function to cause the db to panic. 
- i.e., godzilla_rampage() - \param payload_capacity The maximum amount of memory to use for dbt payloads. \return - 0 Success @@ -222,17 +197,8 @@ typedef struct __toku_point toku_point; instead. */ int toku_lt_create(toku_lock_tree** ptree, - int (*panic)(DB*, int), toku_ltm* mgr, - toku_dbt_cmp (*get_compare_fun_from_db)(DB*)); - -/** - Gets a lock tree for a given DB with id dict_id -*/ -int toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, - DICTIONARY_ID dict_id, DB *db); - -void toku_ltm_invalidate_lt(toku_ltm* mgr, DICTIONARY_ID dict_id); + toku_dbt_cmp compare_fun); /** Closes and frees a lock tree. @@ -398,11 +364,28 @@ int toku_lt_acquire_range_write_lock(toku_lock_tree* tree, DB* db, TXNID txn, - EINVAL If (tree == NULL || txn == NULL). - EINVAL If panicking. */ -int toku_lt_unlock(toku_lock_tree* tree, TXNID txn); +int toku_lt_unlock_txn(toku_lock_tree* tree, TXNID txn); + +void toku_lt_retry_lock_requests(toku_lock_tree *tree); + +void toku_lt_add_ref(toku_lock_tree* tree); + +int toku_lt_remove_ref(toku_lock_tree* tree); + +void toku_lt_remove_db_ref(toku_lock_tree* tree, DB *db); + +toku_range_tree* toku_lt_ifexist_selfread(toku_lock_tree* tree, TXNID txn); + +toku_range_tree* toku_lt_ifexist_selfwrite(toku_lock_tree* tree, TXNID txn); + +void toku_lt_verify(toku_lock_tree *tree, DB *db); + +int toku_lt_point_cmp(const toku_point* x, const toku_point* y); /* Lock tree manager functions begin here */ + /** - Creates a lock tree manager.. + Creates a lock tree manager. \param pmgr A buffer for the new lock tree manager. \param locks_limit The maximum number of locks. @@ -415,8 +398,10 @@ int toku_lt_unlock(toku_lock_tree* tree, TXNID txn); int toku_ltm_create(toku_ltm** pmgr, uint32_t locks_limit, uint64_t lock_memory_limit, - int (*panic)(DB*, int), - toku_dbt_cmp (*get_compare_fun_from_db)(DB*)); + int (*panic)(DB*, int)); + +/** Open the lock tree manager */ +int toku_ltm_open(toku_ltm *mgr); /** Closes and frees a lock tree manager.. 
@@ -444,31 +429,30 @@ int toku_ltm_close(toku_ltm* mgr); */ int toku_ltm_set_max_locks(toku_ltm* mgr, uint32_t locks_limit); -int toku_ltm_get_max_lock_memory(toku_ltm* mgr, uint64_t* lock_memory_limit); +int toku_ltm_get_max_locks(toku_ltm* mgr, uint32_t* locks_limit); int toku_ltm_set_max_lock_memory(toku_ltm* mgr, uint64_t lock_memory_limit); +int toku_ltm_get_max_lock_memory(toku_ltm* mgr, uint64_t* lock_memory_limit); + void toku_ltm_get_status(toku_ltm* mgr, LTM_STATUS s); -int toku_ltm_get_max_locks(toku_ltm* mgr, uint32_t* locks_limit); +// set the default lock timeout. units are milliseconds +void toku_ltm_set_lock_wait_time(toku_ltm *mgr, uint64_t lock_wait_time_msec); + +// get the default lock timeout +void toku_ltm_get_lock_wait_time(toku_ltm *mgr, uint64_t *lock_wait_time_msec); + +/** + Gets a lock tree for a given DB with id dict_id +*/ +int toku_ltm_get_lt(toku_ltm* mgr, toku_lock_tree** ptree, DICTIONARY_ID dict_id, DB *dbp, toku_dbt_cmp compare_fun); + +void toku_ltm_invalidate_lt(toku_ltm* mgr, DICTIONARY_ID dict_id); void toku_ltm_incr_lock_memory(void *extra, size_t s); void toku_ltm_decr_lock_memory(void *extra, size_t s); -void toku_lt_add_ref(toku_lock_tree* tree); - -int toku_lt_remove_ref(toku_lock_tree* tree); - -void toku_lt_remove_db_ref(toku_lock_tree* tree, DB *db); - -int toku_lt_point_cmp(const toku_point* x, const toku_point* y); - -toku_range_tree* toku_lt_ifexist_selfread(toku_lock_tree* tree, TXNID txn); - -toku_range_tree* toku_lt_ifexist_selfwrite(toku_lock_tree* tree, TXNID txn); - -void toku_lt_verify(toku_lock_tree *tree, DB *db); - typedef enum { LOCK_REQUEST_INIT = 0, LOCK_REQUEST_PENDING = 1, @@ -524,17 +508,10 @@ void toku_lock_request_destroy(toku_lock_request *lock_request); // returns 0 (success), DB_LOCK_NOTGRANTED, DB_LOCK_DEADLOCK int toku_lock_request_start(toku_lock_request *lock_request, toku_lock_tree *tree, bool copy_keys_if_not_granted); -// try to acquire a lock described by a lock request. 
-// if the lock is not granted and copy_keys_if_not_granted is true, then make a copy of the keys in the key range. -// this is necessary when used in the ydb cursor callbacks where the keys are only valid when in the callback function. -// called with the lock tree already locked. -int toku_lock_request_start_locked(toku_lock_request *lock_request, toku_lock_tree *tree, bool copy_keys_if_not_granted); - // sleep on the lock request until it becomes resolved or the wait time occurs. // if the wait time is not specified, then wait for as long as it takes. int toku_lock_request_wait(toku_lock_request *lock_request, toku_lock_tree *tree, struct timeval *wait_time); -// use the default timeouts set in the ltm int toku_lock_request_wait_with_default_timeout(toku_lock_request *lock_request, toku_lock_tree *tree); // wakeup any threads that are waiting on a lock request. @@ -543,46 +520,14 @@ void toku_lock_request_wakeup(toku_lock_request *lock_request, toku_lock_tree *t // returns the lock request state toku_lock_request_state toku_lock_request_get_state(toku_lock_request *lock_request); -// a lock request tree contains pending lock requests. -// initialize a lock request tree. -void toku_lock_request_tree_init(toku_lock_tree *tree); - -// destroy a lock request tree. -// the tree must be empty when destroyed. -void toku_lock_request_tree_destroy(toku_lock_tree *tree); - -// insert a lock request into the tree. -void toku_lock_request_tree_insert(toku_lock_tree *tree, toku_lock_request *lock_request); - -// delete a lock request from the tree. -void toku_lock_request_tree_delete(toku_lock_tree *tree, toku_lock_request *lock_request); - -// find a lock request for a given transaction id. -toku_lock_request *toku_lock_request_tree_find(toku_lock_tree *tree, TXNID id); - -// retry all pending lock requests. -// for all lock requests, if the lock request is resolved, then wakeup any threads waiting on the lock request. -// called with the lock tree already locked. 
-void toku_lt_retry_lock_requests_locked(toku_lock_tree *tree); - // try to acquire a lock described by a lock request. if the lock is granted then return success. // otherwise wait on the lock request until the lock request is resolved (either granted or // deadlocks), or the given timer has expired. // returns 0 (success), DB_LOCK_NOTGRANTED int toku_lt_acquire_lock_request_with_timeout(toku_lock_tree *tree, toku_lock_request *lock_request, struct timeval *wait_time); -// called with the lock tree already locked -int toku_lt_acquire_lock_request_with_timeout_locked(toku_lock_tree *tree, toku_lock_request *lock_request, struct timeval *wait_time); - -// call acquire_lock_request_with_timeout with the default lock wait timeout int toku_lt_acquire_lock_request_with_default_timeout(toku_lock_tree *tree, toku_lock_request *lock_request); -// called with the lock tree already locked -int toku_lt_acquire_lock_request_with_default_timeout_locked (toku_lock_tree *tree, toku_lock_request *lock_request); - -// check if a given lock request could deadlock with any granted locks. -void toku_lt_check_deadlock(toku_lock_tree *tree, toku_lock_request *lock_request); - #include "txnid_set.h" // internal function that finds all transactions that conflict with a given lock request @@ -595,23 +540,6 @@ void toku_lt_check_deadlock(toku_lock_tree *tree, toku_lock_request *lock_reques // returns an error code (0 == success) int toku_lt_get_lock_request_conflicts(toku_lock_tree *tree, toku_lock_request *lock_request, txnid_set *conflicts); -// set the ltm mutex (used to override the internal mutex) and use a user supplied mutex instead to protect the -// lock tree). the first use is to use the ydb mutex to protect the lock tree. eventually, the ydb code will -// be refactored to use the ltm mutex instead. 
-void toku_ltm_set_mutex(toku_ltm *ltm, toku_pthread_mutex_t *use_lock); - -// lock the lock tree -void toku_ltm_lock_mutex(toku_ltm *mgr); - -// unlock the lock tree -void toku_ltm_unlock_mutex(toku_ltm *mgr); - -// set the default lock timeout. units are milliseconds -void toku_ltm_set_lock_wait_time(toku_ltm *mgr, uint64_t lock_wait_time_msec); - -// get the default lock timeout -void toku_ltm_get_lock_wait_time(toku_ltm *mgr, uint64_t *lock_wait_time_msec); - #if defined(__cplusplus) } #endif diff --git a/src/lock_tree/tests/bench_point_write_locks.c b/src/lock_tree/tests/bench_point_write_locks.c index 3e4db28ac41..8814b79f4f6 100644 --- a/src/lock_tree/tests/bench_point_write_locks.c +++ b/src/lock_tree/tests/bench_point_write_locks.c @@ -38,11 +38,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DB *db_a = (DB *) 2; @@ -55,7 +55,7 @@ int main(int argc, const char *argv[]) { } // release the locks - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/stress_point_write_locks.c b/src/lock_tree/tests/stress_point_write_locks.c index a01c27e47b6..474e1bc9256 100644 --- a/src/lock_tree/tests/stress_point_write_locks.c +++ b/src/lock_tree/tests/stress_point_write_locks.c @@ -47,7 +47,7 @@ struct test_arg { uint64_t iterations; }; -static void runtest(DB *db, TXNID txn, toku_ltm *ltm, toku_lock_tree *lt, uint64_t locks_per_txn, uint64_t nrows, uint64_t iterations) { +static void runtest(DB *db, TXNID txn, toku_ltm *ltm UU(), toku_lock_tree *lt, uint64_t locks_per_txn, uint64_t nrows, 
uint64_t iterations) { int r; uint64_t notgranted = 0, deadlocked = 0; @@ -60,9 +60,7 @@ static void runtest(DB *db, TXNID txn, toku_ltm *ltm, toku_lock_tree *lt, uint64 DBT key = { .data = &keys[i], .size = sizeof keys[i] }; toku_lock_request lr; toku_lock_request_init(&lr, db, txn, &key, &key, LOCK_REQUEST_WRITE); - toku_ltm_lock_mutex(ltm); - r = toku_lt_acquire_lock_request_with_default_timeout_locked(lt, &lr); - toku_ltm_unlock_mutex(ltm); + r = toku_lt_acquire_lock_request_with_default_timeout(lt, &lr); if (r == 0) { get_lock(keys[i], txn); continue; @@ -80,9 +78,7 @@ static void runtest(DB *db, TXNID txn, toku_ltm *ltm, toku_lock_tree *lt, uint64 // usleep(random() % 1000); release_locks(keys, i, txn); - toku_ltm_lock_mutex(ltm); - r = toku_lt_unlock(lt, txn); assert(r == 0); - toku_ltm_unlock_mutex(ltm); + r = toku_lt_unlock_txn(lt, txn); assert(r == 0); if ((iter % 10000) == 0) printf("%lu %lu %lu\n", (long unsigned) iter, (long unsigned) notgranted, (long unsigned) deadlocked); @@ -143,11 +139,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DB *fake_db = (DB *) 1; diff --git a/src/lock_tree/tests/test.h b/src/lock_tree/tests/test.h index 88971d75971..821437a49cc 100644 --- a/src/lock_tree/tests/test.h +++ b/src/lock_tree/tests/test.h @@ -33,12 +33,6 @@ static inline int dbcmp (DB *db __attribute__((__unused__)), const DBT *a, const return toku_keycompare(a->data, a->size, b->data, b->size); } -toku_dbt_cmp compare_fun = dbcmp; - -static inline toku_dbt_cmp get_compare_fun_from_db(__attribute__((unused)) DB* db) { - return compare_fun; -} - bool panicked = false; static inline int dbpanic(DB* db, int r) { 
diff --git a/src/lock_tree/tests/test_00000_createclose.c b/src/lock_tree/tests/test_00000_createclose.c index 0c7b9bd9304..3e953ccb2f1 100644 --- a/src/lock_tree/tests/test_00000_createclose.c +++ b/src/lock_tree/tests/test_00000_createclose.c @@ -7,13 +7,11 @@ int main(void) { uint32_t max_locks = 1000; uint64_t max_lock_memory = max_locks*64; - r = toku_ltm_create(&mgr, max_locks, max_lock_memory, dbpanic, - get_compare_fun_from_db); + r = toku_ltm_create(&mgr, max_locks, max_lock_memory, dbpanic); CKERR(r); { - r = toku_lt_create(<, dbpanic, mgr, - get_compare_fun_from_db); + r = toku_lt_create(<, mgr, dbcmp); CKERR(r); assert(lt); r = toku_lt_close(lt); diff --git a/src/lock_tree/tests/test_00010_parameter_errors.c b/src/lock_tree/tests/test_00010_parameter_errors.c index d4990e4f3a7..1d94b2e078c 100644 --- a/src/lock_tree/tests/test_00010_parameter_errors.c +++ b/src/lock_tree/tests/test_00010_parameter_errors.c @@ -22,7 +22,7 @@ static void do_range_test(int (*acquire)(toku_lock_tree*, DB*, TXNID, DBT* key_l = &_key_l; DBT* key_r = &_key_r; { - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); @@ -64,11 +64,11 @@ static void do_point_test(int (*acquire)(toku_lock_tree*, DB*, TXNID, /* Point read tests. 
*/ key = &_key; { - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); - r = toku_lt_unlock(NULL, (TXNID)1); + r = toku_lt_unlock_txn(NULL, (TXNID)1); CKERR2(r, EINVAL); r = acquire(NULL, db, txn, key); @@ -91,18 +91,18 @@ int main(int argc, const char *argv[]) { int r; toku_lock_tree* lt = NULL; - r = toku_ltm_create(NULL, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(NULL, max_locks, max_lock_memory, dbpanic); CKERR2(r, EINVAL); assert(ltm == NULL); - r = toku_ltm_create(<m, 0, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, 0, max_lock_memory, dbpanic); CKERR2(r, EINVAL); assert(ltm == NULL); - r = toku_ltm_create(<m, max_locks, 0, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, 0, dbpanic); CKERR2(r, EINVAL); assert(ltm == NULL); /* Actually create it. */ - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); @@ -144,17 +144,12 @@ int main(int argc, const char *argv[]) { /* create tests. */ { - r = toku_lt_create(NULL, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(NULL, ltm, dbcmp); CKERR2(r, EINVAL); - r = toku_lt_create(<, NULL, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, NULL, dbcmp); CKERR2(r, EINVAL); - r = toku_lt_create(<, dbpanic, NULL, get_compare_fun_from_db); - CKERR2(r, EINVAL); - - r = toku_lt_create(<, dbpanic, ltm, NULL); - CKERR2(r, EINVAL); } /* Close tests. 
*/ diff --git a/src/lock_tree/tests/test_00020_read.c b/src/lock_tree/tests/test_00020_read.c index a66f70c47cf..4cc12635b14 100644 --- a/src/lock_tree/tests/test_00020_read.c +++ b/src/lock_tree/tests/test_00020_read.c @@ -42,17 +42,17 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); } static void close_tree(void) { - r = toku_lt_unlock(lt, txn); CKERR(r); + r = toku_lt_unlock_txn(lt, txn); CKERR(r); assert(lt && ltm); r = toku_lt_close(lt); CKERR(r); r = toku_ltm_close(ltm); CKERR(r); @@ -103,22 +103,9 @@ static void setup_payload_len(void** payload, uint32_t* len, int val) { } } -static void temporarily_fake_comparison_functions(void) { - assert(!lt->db && !lt->compare_fun); - lt->db = db; - lt->compare_fun = get_compare_fun_from_db(db); -} - -static void stop_fake_comparison_functions(void) { - assert(lt->db && lt->compare_fun); - lt->db = NULL; - lt->compare_fun = NULL; -} - static void lt_find(toku_range_tree* rt, unsigned k, int key_l, int key_r, TXNID find_txn) { -temporarily_fake_comparison_functions(); r = toku_rt_find(rt, &query, 0, &buf, &buflen, &numfound); CKERR(r); assert(numfound==k); @@ -136,9 +123,8 @@ temporarily_fake_comparison_functions(); } assert(false); //Crash since we didn't find it. 
cleanup: - stop_fake_comparison_functions(); -} - + return; +} static void insert_1(int key_l, int key_r, const void* kl, const void* kr) { diff --git a/src/lock_tree/tests/test_00040_write.c b/src/lock_tree/tests/test_00040_write.c index f9640c38109..1492de407f1 100644 --- a/src/lock_tree/tests/test_00040_write.c +++ b/src/lock_tree/tests/test_00040_write.c @@ -36,10 +36,10 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); @@ -102,7 +102,7 @@ static void lt_insert_write(int r_expect, char txn, int key_l) { static void lt_unlock(char ctxn) { int retval; - retval = toku_lt_unlock(lt, (TXNID) (size_t) ctxn); + retval = toku_lt_unlock_txn(lt, (TXNID) (size_t) ctxn); CKERR(retval); } diff --git a/src/lock_tree/tests/test_00060_lock_escalation.c b/src/lock_tree/tests/test_00060_lock_escalation.c index 57caa8a7c4b..d31ae8a3309 100644 --- a/src/lock_tree/tests/test_00060_lock_escalation.c +++ b/src/lock_tree/tests/test_00060_lock_escalation.c @@ -37,12 +37,12 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); //ask ltm for lock tree DICTIONARY_ID dict_id = {0x1234}; - r = toku_ltm_get_lt(ltm, <, dict_id, db); + r = toku_ltm_get_lt(ltm, <, dict_id, db, intcmp); CKERR(r); assert(lt); @@ -108,7 +108,7 @@ static void lt_insert_write(int r_expect, char txn, int key_l) { } static void lt_unlock(char ctxn) { - int retval = toku_lt_unlock(lt, (TXNID) (size_t) ctxn); CKERR(retval); + int retval = toku_lt_unlock_txn(lt, (TXNID) 
(size_t) ctxn); CKERR(retval); } static void run_escalation_test(void) { @@ -370,7 +370,6 @@ static void init_test(void) { buflen = 64; buf = (toku_range*) toku_malloc(buflen*sizeof(toku_range)); - compare_fun = intcmp; } static void close_test(void) { diff --git a/src/lock_tree/tests/test_00070_ltm.c b/src/lock_tree/tests/test_00070_ltm.c index 7803be493e5..211f2c4bb3c 100644 --- a/src/lock_tree/tests/test_00070_ltm.c +++ b/src/lock_tree/tests/test_00070_ltm.c @@ -17,14 +17,14 @@ int nums[10000]; static void setup_ltm(void) { assert(!ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); } static void setup_tree(size_t index, DICTIONARY_ID dict_id) { assert(!lt[index] && ltm); - r = toku_ltm_get_lt(ltm, <[index], dict_id, NULL); + r = toku_ltm_get_lt(ltm, <[index], dict_id, NULL, intcmp); CKERR(r); assert(lt[index]); } @@ -67,7 +67,6 @@ static void run_test(void) { int main(int argc, const char *argv[]) { parse_args(argc, argv); - compare_fun = intcmp; r = system("rm -rf " TESTDIR); CKERR(r); diff --git a/src/lock_tree/tests/test_00080_lt_refcount.c b/src/lock_tree/tests/test_00080_lt_refcount.c index 8c528e00682..1fdf544f6a6 100644 --- a/src/lock_tree/tests/test_00080_lt_refcount.c +++ b/src/lock_tree/tests/test_00080_lt_refcount.c @@ -20,7 +20,7 @@ int nums[10000]; static void setup_ltm(void) { assert(!ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); } @@ -30,7 +30,7 @@ static void db_open_tree(size_t index, size_t db_id_index) { (lt_refs[index] > 0 && lts[index])); assert(ltm); lt_refs[index]++; - r = toku_ltm_get_lt(ltm, <s[index], dict_ids[db_id_index], NULL); + r = toku_ltm_get_lt(ltm, <s[index], dict_ids[db_id_index], NULL, intcmp); CKERR(r); assert(lts[index]); } @@ -136,7 +136,6 @@ static void 
close_test(void) { int main(int argc, const char *argv[]) { parse_args(argc, argv); - compare_fun = intcmp; r = system("rm -rf " TESTDIR); CKERR(r); diff --git a/src/lock_tree/tests/test_borderwrite_merge.c b/src/lock_tree/tests/test_borderwrite_merge.c index ade95b10f8d..180be5487b3 100644 --- a/src/lock_tree/tests/test_borderwrite_merge.c +++ b/src/lock_tree/tests/test_borderwrite_merge.c @@ -36,10 +36,10 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); @@ -87,7 +87,7 @@ static void lt_insert_write_range(int r_expect, char txn, int key_l, int key_r) } static void lt_unlock(TXNID txnid) { - r= toku_lt_unlock(lt, txnid); CKERR(r); + r= toku_lt_unlock_txn(lt, txnid); CKERR(r); } static void runtest(void) { diff --git a/src/lock_tree/tests/test_conflict_read_table_write.c b/src/lock_tree/tests/test_conflict_read_table_write.c index 5e3c9bd3b28..0434792d547 100644 --- a/src/lock_tree/tests/test_conflict_read_table_write.c +++ b/src/lock_tree/tests/test_conflict_read_table_write.c @@ -43,11 +43,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DBT key_l; dbt_init(&key_l, "L", 1); @@ -90,7 +90,7 @@ int main(int argc, const char *argv[]) { assert(txnid_set_get(&conflicts, 1) == txn_b); txnid_set_destroy(&conflicts); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = 
toku_lt_unlock_txn(lt, txn_a); assert(r == 0); assert(c_w_l.state == LOCK_REQUEST_PENDING); txnid_set_init(&conflicts); r = toku_lt_get_lock_request_conflicts(lt, &c_w_l, &conflicts); @@ -99,10 +99,10 @@ int main(int argc, const char *argv[]) { assert(txnid_set_get(&conflicts, 0) == txn_b); txnid_set_destroy(&conflicts); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); assert(c_w_l.state == LOCK_REQUEST_COMPLETE && c_w_l.complete_r == 0); toku_lock_request_destroy(&c_w_l); - r = toku_lt_unlock(lt, txn_c); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_conflict_read_write.c b/src/lock_tree/tests/test_conflict_read_write.c index 7878b0ee36f..f7738325161 100644 --- a/src/lock_tree/tests/test_conflict_read_write.c +++ b/src/lock_tree/tests/test_conflict_read_write.c @@ -43,11 +43,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DBT key_l; dbt_init(&key_l, "L", 1); @@ -91,7 +91,7 @@ int main(int argc, const char *argv[]) { assert(txnid_set_get(&conflicts, 1) == txn_b); txnid_set_destroy(&conflicts); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); assert(c_w_l.state == LOCK_REQUEST_PENDING); txnid_set_init(&conflicts); r = toku_lt_get_lock_request_conflicts(lt, &c_w_l, &conflicts); @@ -100,10 +100,10 @@ int main(int argc, const char *argv[]) { assert(txnid_set_get(&conflicts, 0) == txn_b); txnid_set_destroy(&conflicts); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); 
assert(r == 0); assert(c_w_l.state == LOCK_REQUEST_COMPLETE && c_w_l.complete_r == 0); toku_lock_request_destroy(&c_w_l); - r = toku_lt_unlock(lt, txn_c); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_conflict_write_read.c b/src/lock_tree/tests/test_conflict_write_read.c index cee52ca69b9..a5b866f5782 100644 --- a/src/lock_tree/tests/test_conflict_write_read.c +++ b/src/lock_tree/tests/test_conflict_write_read.c @@ -33,11 +33,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -68,10 +68,10 @@ int main(int argc, const char *argv[]) { assert(txnid_set_get(&conflicts, 0) == txn_a); txnid_set_destroy(&conflicts); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); assert(b_r_l.state == LOCK_REQUEST_COMPLETE && b_r_l.complete_r == 0); toku_lock_request_destroy(&b_r_l); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_conflict_write_table_read.c b/src/lock_tree/tests/test_conflict_write_table_read.c index 853ebaf105a..1a98ab5b377 100644 --- a/src/lock_tree/tests/test_conflict_write_table_read.c +++ b/src/lock_tree/tests/test_conflict_write_table_read.c @@ -33,11 +33,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r 
== 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DBT key_l; dbt_init(&key_l, "L", 1); @@ -70,8 +70,8 @@ int main(int argc, const char *argv[]) { txnid_set_destroy(&conflicts); toku_lock_request_destroy(&b_r_l); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_conflict_write_write.c b/src/lock_tree/tests/test_conflict_write_write.c index 5d1bb7c57e5..8d3dfc48db1 100644 --- a/src/lock_tree/tests/test_conflict_write_write.c +++ b/src/lock_tree/tests/test_conflict_write_write.c @@ -33,11 +33,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DBT key_l; dbt_init(&key_l, "L", 1); @@ -71,7 +71,7 @@ int main(int argc, const char *argv[]) { toku_lock_request_destroy(&b_w_l); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_default_lock_timeout.c b/src/lock_tree/tests/test_default_lock_timeout.c index 3be84849f9d..f343ed3d8d9 100644 --- a/src/lock_tree/tests/test_default_lock_timeout.c +++ b/src/lock_tree/tests/test_default_lock_timeout.c @@ -50,11 +50,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = 
toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -68,7 +68,7 @@ int main(int argc, const char *argv[]) { r = write_lock(lt, txn_b, "L"); assert(r == DB_LOCK_NOTGRANTED); } - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_default_timeout.c b/src/lock_tree/tests/test_default_timeout.c index a3cabbb0c98..86eb74669a3 100644 --- a/src/lock_tree/tests/test_default_timeout.c +++ b/src/lock_tree/tests/test_default_timeout.c @@ -30,7 +30,7 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); uint64_t target_wait_time, the_wait_time; diff --git a/src/lock_tree/tests/test_footprint_point_write.c b/src/lock_tree/tests/test_footprint_point_write.c index a006ebfc382..ff7a8ebc545 100644 --- a/src/lock_tree/tests/test_footprint_point_write.c +++ b/src/lock_tree/tests/test_footprint_point_write.c @@ -93,7 +93,7 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); struct my_ltm_status s; @@ -104,7 +104,7 @@ int main(int argc, const char *argv[]) { assert(s.curr_lock_memory == 0); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DB *db_a = (DB *) 2; @@ -135,7 +135,7 @@ int main(int argc, const char *argv[]) { // release the locks - r = 
toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); my_ltm_get_status(ltm, &s); assert(s.curr_locks == 0); diff --git a/src/lock_tree/tests/test_footprint_range_write.c b/src/lock_tree/tests/test_footprint_range_write.c index 481722da05d..9555c142008 100644 --- a/src/lock_tree/tests/test_footprint_range_write.c +++ b/src/lock_tree/tests/test_footprint_range_write.c @@ -95,7 +95,7 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); struct my_ltm_status s; @@ -106,7 +106,7 @@ int main(int argc, const char *argv[]) { assert(s.curr_lock_memory == 0); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); DB *db_a = (DB *) 2; @@ -139,7 +139,7 @@ int main(int argc, const char *argv[]) { // release the locks - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); my_ltm_get_status(ltm, &s); assert(s.curr_locks == 0); diff --git a/src/lock_tree/tests/test_global_write_lock.c b/src/lock_tree/tests/test_global_write_lock.c index acdd4340c72..38c56be2a27 100644 --- a/src/lock_tree/tests/test_global_write_lock.c +++ b/src/lock_tree/tests/test_global_write_lock.c @@ -36,10 +36,10 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); @@ -103,7 +103,7 @@ static void lt_insert_write_range(int r_expect, char txn, int key_l, int key_r) } static void lt_unlock(TXNID 
txnid) { - r = toku_lt_unlock(lt, txnid); CKERR(r); + r = toku_lt_unlock_txn(lt, txnid); CKERR(r); } static void runtest(void) { diff --git a/src/lock_tree/tests/test_lock_timeout.c b/src/lock_tree/tests/test_lock_timeout.c index ff20823f0ad..d4293d8062b 100644 --- a/src/lock_tree/tests/test_lock_timeout.c +++ b/src/lock_tree/tests/test_lock_timeout.c @@ -50,11 +50,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -68,7 +68,7 @@ int main(int argc, const char *argv[]) { r = write_lock(lt, txn_b, "L", &wait_time); assert(r == DB_LOCK_NOTGRANTED); } - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_ltm_get_status.c b/src/lock_tree/tests/test_ltm_get_status.c index 04f0085fda4..0e7a7c54f9e 100644 --- a/src/lock_tree/tests/test_ltm_get_status.c +++ b/src/lock_tree/tests/test_ltm_get_status.c @@ -20,7 +20,7 @@ int main(int argc, const char *argv[]) { int r; toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, MAX_LOCKS, MAX_LOCK_MEMORY, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, MAX_LOCKS, MAX_LOCK_MEMORY, dbpanic); CKERR(r); do_ltm_status(ltm); #if 0 @@ -63,17 +63,17 @@ int main(int argc, const char *argv[]) { /* create tests. 
*/ { r = toku_lt_create(NULL, dbpanic, ltm, - get_compare_fun_from_db, + dbcmp, toku_malloc, toku_free, toku_realloc); CKERR2(r, EINVAL); r = toku_lt_create(<, NULL, ltm, - get_compare_fun_from_db, + dbcmp, toku_malloc, toku_free, toku_realloc); CKERR2(r, EINVAL); r = toku_lt_create(<, dbpanic, NULL, - get_compare_fun_from_db, + dbcmp, toku_malloc, toku_free, toku_realloc); CKERR2(r, EINVAL); @@ -83,15 +83,15 @@ int main(int argc, const char *argv[]) { CKERR2(r, EINVAL); r = toku_lt_create(<, dbpanic, ltm, - get_compare_fun_from_db, + dbcmp, NULL, toku_free, toku_realloc); CKERR2(r, EINVAL); r = toku_lt_create(<, dbpanic, ltm, - get_compare_fun_from_db, + dbcmp, toku_malloc, NULL, toku_realloc); CKERR2(r, EINVAL); r = toku_lt_create(<, dbpanic, ltm, - get_compare_fun_from_db, + dbcmp, toku_malloc, toku_free, NULL); CKERR2(r, EINVAL); } diff --git a/src/lock_tree/tests/test_read_notgranted.c b/src/lock_tree/tests/test_read_notgranted.c index 6d9543f0185..560298f9b6b 100644 --- a/src/lock_tree/tests/test_read_notgranted.c +++ b/src/lock_tree/tests/test_read_notgranted.c @@ -49,11 +49,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -65,11 +65,11 @@ int main(int argc, const char *argv[]) { const TXNID txn_c = 3; r = read_lock(lt, txn_c, "L"); assert(r == DB_LOCK_NOTGRANTED); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); r = read_lock(lt, txn_b, "L"); assert(r == 0); r = read_lock(lt, txn_c, "L"); assert(r == 0); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); - r = toku_lt_unlock(lt, txn_c); assert(r == 0); + r = toku_lt_unlock_txn(lt, 
txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_read_out_of_locks.c b/src/lock_tree/tests/test_read_out_of_locks.c index 30860d4b0ea..f2b9a4e3bcd 100644 --- a/src/lock_tree/tests/test_read_out_of_locks.c +++ b/src/lock_tree/tests/test_read_out_of_locks.c @@ -38,11 +38,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -62,13 +62,13 @@ int main(int argc, const char *argv[]) { r = toku_lock_request_start(&c_w_l, lt, false); assert(r != 0); assert(c_w_l.state == LOCK_REQUEST_PENDING); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); assert(b_w_l.state == LOCK_REQUEST_COMPLETE && b_w_l.complete_r == 0); assert(c_w_l.state == LOCK_REQUEST_COMPLETE && c_w_l.complete_r == TOKUDB_OUT_OF_LOCKS); toku_lock_request_destroy(&b_w_l); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); toku_lock_request_destroy(&c_w_l); - r = toku_lt_unlock(lt, txn_c); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_read_request_blocked.c b/src/lock_tree/tests/test_read_request_blocked.c index fe2e8972d8e..9944aef8404 100644 --- a/src/lock_tree/tests/test_read_request_blocked.c +++ b/src/lock_tree/tests/test_read_request_blocked.c @@ -37,11 +37,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, 
get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -61,13 +61,13 @@ int main(int argc, const char *argv[]) { r = toku_lock_request_start(&c_w_l, lt, false); assert(r != 0); assert(c_w_l.state == LOCK_REQUEST_PENDING); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); assert(b_w_l.state == LOCK_REQUEST_COMPLETE && b_w_l.complete_r == 0); assert(c_w_l.state == LOCK_REQUEST_COMPLETE && c_w_l.complete_r == 0);; toku_lock_request_destroy(&b_w_l); toku_lock_request_destroy(&c_w_l); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); - r = toku_lt_unlock(lt, txn_c); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_c); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_set_mutex.c b/src/lock_tree/tests/test_set_mutex.c deleted file mode 100644 index 29baaa06eb5..00000000000 --- a/src/lock_tree/tests/test_set_mutex.c +++ /dev/null @@ -1,84 +0,0 @@ -// verify that a user supplied mutex works -// T(A) gets W(L) -// T(B) tries W(L), gets lock request blocked -// T(B) lock request W(L) times out -// T(A) releases locks -// T(B) releases locks - -#include "test.h" - -int -main(int argc, const char *argv[]) { - int r; - - uint32_t max_locks = 2; - uint64_t max_lock_memory = 4096; - - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) { - if (verbose > 0) verbose++; - continue; - } - if (strcmp(argv[i], "-q") == 0 || strcmp(argv[i], "--quiet") == 0) { - if (verbose > 0) verbose--; - continue; - } - if (strcmp(argv[i], "--max_locks") == 0 && i+1 < argc) { - max_locks = atoi(argv[++i]); - continue; - } - if (strcmp(argv[i], "--max_lock_memory") 
== 0 && i+1 < argc) { - max_lock_memory = atoi(argv[++i]); - continue; - } - assert(0); - } - - // setup - toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); - assert(r == 0 && ltm); - - toku_ltm_set_lock_wait_time(ltm, 5000); - - toku_pthread_mutex_t my_mutex = TOKU_PTHREAD_MUTEX_INITIALIZER; - toku_ltm_set_mutex(ltm, &my_mutex); - - toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); - assert(r == 0 && lt); - - const TXNID txn_a = 1; - const TXNID txn_b = 2; - - DBT key_l; dbt_init(&key_l, "L", 1); - toku_lock_request a_w_l; toku_lock_request_init(&a_w_l, (DB *)1, txn_a, &key_l, &key_l, LOCK_REQUEST_WRITE); - toku_ltm_lock_mutex(ltm); - r = toku_lock_request_start_locked(&a_w_l, lt, false); assert(r == 0); - toku_ltm_unlock_mutex(ltm); - assert(a_w_l.state == LOCK_REQUEST_COMPLETE && a_w_l.complete_r == 0); - - toku_lock_request b_w_l; toku_lock_request_init(&b_w_l, (DB *)1, txn_b, &key_l, &key_l, LOCK_REQUEST_WRITE); - toku_ltm_lock_mutex(ltm); - r = toku_lock_request_start_locked(&b_w_l, lt, false); assert(r != 0); - toku_ltm_unlock_mutex(ltm); - assert(b_w_l.state == LOCK_REQUEST_PENDING); - - toku_ltm_lock_mutex(ltm); - r = toku_lock_request_wait_with_default_timeout(&b_w_l, lt); - toku_ltm_unlock_mutex(ltm); - assert(r == DB_LOCK_NOTGRANTED); - assert(b_w_l.state == LOCK_REQUEST_COMPLETE); - - toku_lock_request_destroy(&a_w_l); - toku_lock_request_destroy(&b_w_l); - - r = toku_lt_unlock(lt, txn_a); assert(r == 0); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); - - // shutdown - r = toku_lt_close(lt); assert(r == 0); - r = toku_ltm_close(ltm); assert(r == 0); - - return 0; -} diff --git a/src/lock_tree/tests/test_set_mutex_default_lock_timeout.c b/src/lock_tree/tests/test_set_mutex_default_lock_timeout.c deleted file mode 100644 index 35832df8fa0..00000000000 --- a/src/lock_tree/tests/test_set_mutex_default_lock_timeout.c +++ /dev/null @@ -1,87 +0,0 @@ -// 
T(A) gets W(L) -// T(B) tries W(L) with timeout, gets DB_LOCK_NOTGRANTED -// T(B) releases locks - -#include "test.h" - -static int read_lock(toku_ltm *ltm, toku_lock_tree *lt, TXNID txnid, char *k) { - DBT key; dbt_init(&key, k, strlen(k)); - toku_lock_request lr; - toku_lock_request_init(&lr, (DB*)1, txnid, &key, &key, LOCK_REQUEST_READ); - toku_ltm_lock_mutex(ltm); - int r = toku_lt_acquire_lock_request_with_default_timeout_locked(lt, &lr); - toku_ltm_unlock_mutex(ltm); - toku_lock_request_destroy(&lr); - return r; -} - -static int write_lock(toku_ltm *ltm, toku_lock_tree *lt, TXNID txnid, char *k) { - DBT key; dbt_init(&key, k, strlen(k)); - toku_lock_request lr; - toku_lock_request_init(&lr, (DB*)1, txnid, &key, &key, LOCK_REQUEST_WRITE); - toku_ltm_lock_mutex(ltm); - int r = toku_lt_acquire_lock_request_with_default_timeout_locked(lt, &lr); - toku_ltm_unlock_mutex(ltm); - toku_lock_request_destroy(&lr); - return r; -} - -int main(int argc, const char *argv[]) { - int r; - - uint32_t max_locks = 1; - uint64_t max_lock_memory = 4096; - - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) { - verbose++; - continue; - } - if (strcmp(argv[i], "-q") == 0 || strcmp(argv[i], "--quiet") == 0) { - if (verbose > 0) verbose--; - continue; - } - if (strcmp(argv[i], "--max_locks") == 0 && i+1 < argc) { - max_locks = atoi(argv[++i]); - continue; - } - if (strcmp(argv[i], "--max_lock_memory") == 0 && i+1 < argc) { - max_lock_memory = atoi(argv[++i]); - continue; - } - assert(0); - } - - // setup - toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); - assert(r == 0 && ltm); - - toku_pthread_mutex_t my_mutex = TOKU_PTHREAD_MUTEX_INITIALIZER; - toku_ltm_set_mutex(ltm, &my_mutex); - - toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); - assert(r == 0 && lt); - - const TXNID txn_a = 1; - const TXNID txn_b = 2; - - r = 
write_lock(ltm, lt, txn_a, "L"); assert(r == 0); - for (int t = 1; t < 10; t++) { - toku_ltm_set_lock_wait_time(ltm, t * 1000); - r = read_lock(ltm, lt, txn_b, "L"); - assert(r == DB_LOCK_NOTGRANTED); - r = write_lock(ltm, lt, txn_b, "L"); - assert(r == DB_LOCK_NOTGRANTED); - } - toku_ltm_lock_mutex(ltm); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); - toku_ltm_unlock_mutex(ltm); - - // shutdown - r = toku_lt_close(lt); assert(r == 0); - r = toku_ltm_close(ltm); assert(r == 0); - - return 0; -} diff --git a/src/lock_tree/tests/test_simple_deadlock.c b/src/lock_tree/tests/test_simple_deadlock.c index 286cbaa7725..7ae907418c3 100644 --- a/src/lock_tree/tests/test_simple_deadlock.c +++ b/src/lock_tree/tests/test_simple_deadlock.c @@ -36,11 +36,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -65,14 +65,14 @@ int main(int argc, const char *argv[]) { r = toku_lock_request_start(&b_w_l, lt, false); assert(r == DB_LOCK_DEADLOCK); assert(b_w_l.state == LOCK_REQUEST_COMPLETE && b_w_l.complete_r == DB_LOCK_DEADLOCK); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); toku_lock_request_destroy(&b_w_l); assert(a_w_m.state == LOCK_REQUEST_COMPLETE && a_w_m.complete_r == 0 ); toku_lock_request_destroy(&a_w_m); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_update_deadlock.c b/src/lock_tree/tests/test_update_deadlock.c index 6b66482f2fb..a38689e7e9c 100644 --- a/src/lock_tree/tests/test_update_deadlock.c +++ 
b/src/lock_tree/tests/test_update_deadlock.c @@ -36,11 +36,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -64,13 +64,13 @@ int main(int argc, const char *argv[]) { r = toku_lock_request_start(&b_w_l, lt, false); assert(r == DB_LOCK_DEADLOCK); assert(b_w_l.state == LOCK_REQUEST_COMPLETE && b_w_l.complete_r == DB_LOCK_DEADLOCK); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); toku_lock_request_destroy(&b_w_l); assert(a_w_l.state == LOCK_REQUEST_COMPLETE && a_w_l.complete_r == 0); toku_lock_request_destroy(&a_w_l); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_update_deadlock_copy_keys.c b/src/lock_tree/tests/test_update_deadlock_copy_keys.c index 784eeaec0b7..6601d56f7e6 100644 --- a/src/lock_tree/tests/test_update_deadlock_copy_keys.c +++ b/src/lock_tree/tests/test_update_deadlock_copy_keys.c @@ -36,11 +36,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -64,13 +64,13 @@ int main(int argc, const char *argv[]) { r = toku_lock_request_start(&b_w_l, lt, true); assert(r == DB_LOCK_DEADLOCK); assert(b_w_l.state == 
LOCK_REQUEST_COMPLETE && b_w_l.complete_r == DB_LOCK_DEADLOCK); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); toku_lock_request_destroy(&b_w_l); assert(a_w_l.state == LOCK_REQUEST_COMPLETE && a_w_l.complete_r == 0); toku_lock_request_destroy(&a_w_l); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_write_conflict_with_threads.c b/src/lock_tree/tests/test_write_conflict_with_threads.c index 92379e8fdcd..fd79e0beff2 100644 --- a/src/lock_tree/tests/test_write_conflict_with_threads.c +++ b/src/lock_tree/tests/test_write_conflict_with_threads.c @@ -27,7 +27,7 @@ static void *writer_thread(void *arg) { int r = write_lock(writer_arg->lt, writer_arg->id, writer_arg->name); assert(r == 0); printf("%lu locked\n", writer_arg->id); sleep(1); - toku_lt_unlock(writer_arg->lt, writer_arg->id); + toku_lt_unlock_txn(writer_arg->lt, writer_arg->id); printf("%lu unlocked\n", writer_arg->id); return arg; } @@ -65,11 +65,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -84,7 +84,7 @@ int main(int argc, const char *argv[]) { r = toku_pthread_create(&tids[i], NULL, writer_thread, writer_arg); assert(r == 0); } sleep(10); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); printf("main unlocked\n"); for (int i = 0; i < max_threads; i++) { diff --git a/src/lock_tree/tests/test_write_notgranted.c b/src/lock_tree/tests/test_write_notgranted.c index d2698bc397b..167db5959e6 
100644 --- a/src/lock_tree/tests/test_write_notgranted.c +++ b/src/lock_tree/tests/test_write_notgranted.c @@ -40,11 +40,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -52,9 +52,9 @@ int main(int argc, const char *argv[]) { r = write_lock(lt, txn_a, "L"); assert(r == 0); r = write_lock(lt, txn_b, "L"); assert(r == DB_LOCK_NOTGRANTED); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); r = write_lock(lt, txn_b, "L"); assert(r == 0); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_write_range.c b/src/lock_tree/tests/test_write_range.c index 63c4b0b3776..8bcfab454f7 100644 --- a/src/lock_tree/tests/test_write_range.c +++ b/src/lock_tree/tests/test_write_range.c @@ -36,10 +36,10 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); @@ -125,7 +125,7 @@ static void lt_insert_write_range(int r_expect, char txn, int key_l, int key_r) static void lt_unlock(char ctxn) UU(); static void lt_unlock(char ctxn) { int retval; - retval = toku_lt_unlock(lt, (TXNID) (size_t) ctxn); + retval = toku_lt_unlock_txn(lt, (TXNID) (size_t) ctxn); CKERR(retval); } diff --git 
a/src/lock_tree/tests/test_write_range_conflict_read.c b/src/lock_tree/tests/test_write_range_conflict_read.c index 1f85740aa51..def4b49f305 100644 --- a/src/lock_tree/tests/test_write_range_conflict_read.c +++ b/src/lock_tree/tests/test_write_range_conflict_read.c @@ -36,10 +36,10 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); @@ -101,7 +101,7 @@ static void lt_insert_write_range(int r_expect, char txn, int key_l, int key_r) static void lt_unlock(char ctxn) UU(); static void lt_unlock(char ctxn) { int retval; - retval = toku_lt_unlock(lt, (TXNID) (size_t) ctxn); + retval = toku_lt_unlock_txn(lt, (TXNID) (size_t) ctxn); CKERR(retval); } diff --git a/src/lock_tree/tests/test_write_range_conflict_write.c b/src/lock_tree/tests/test_write_range_conflict_write.c index ff08271ee13..7b58b533737 100644 --- a/src/lock_tree/tests/test_write_range_conflict_write.c +++ b/src/lock_tree/tests/test_write_range_conflict_write.c @@ -36,10 +36,10 @@ static void init_query(void) { static void setup_tree(void) { assert(!lt && !ltm); - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); CKERR(r); assert(ltm); - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); CKERR(r); assert(lt); init_query(); @@ -102,7 +102,7 @@ static void lt_insert_write_range(int r_expect, char txn, int key_l, int key_r) static void lt_unlock(char ctxn) UU(); static void lt_unlock(char ctxn) { int retval; - retval = toku_lt_unlock(lt, (TXNID) (size_t) ctxn); + retval = toku_lt_unlock_txn(lt, (TXNID) (size_t) ctxn); 
CKERR(retval); } diff --git a/src/lock_tree/tests/test_write_request_blocked.c b/src/lock_tree/tests/test_write_request_blocked.c index 206185da3d6..196ff120f92 100644 --- a/src/lock_tree/tests/test_write_request_blocked.c +++ b/src/lock_tree/tests/test_write_request_blocked.c @@ -34,11 +34,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -54,10 +54,10 @@ int main(int argc, const char *argv[]) { r = toku_lock_request_start(&b_w_l, lt, false); assert(r != 0); assert(b_w_l.state == LOCK_REQUEST_PENDING); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); assert(b_w_l.state == LOCK_REQUEST_COMPLETE && b_w_l.complete_r == 0); toku_lock_request_destroy(&b_w_l); - r = toku_lt_unlock(lt, txn_b); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_b); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/lock_tree/tests/test_wrw.c b/src/lock_tree/tests/test_wrw.c index 85c0f3b8e9d..b290580596f 100644 --- a/src/lock_tree/tests/test_wrw.c +++ b/src/lock_tree/tests/test_wrw.c @@ -33,11 +33,11 @@ int main(int argc, const char *argv[]) { // setup toku_ltm *ltm = NULL; - r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic, get_compare_fun_from_db); + r = toku_ltm_create(<m, max_locks, max_lock_memory, dbpanic); assert(r == 0 && ltm); toku_lock_tree *lt = NULL; - r = toku_lt_create(<, dbpanic, ltm, get_compare_fun_from_db); + r = toku_lt_create(<, ltm, dbcmp); assert(r == 0 && lt); const TXNID txn_a = 1; @@ -57,7 +57,7 @@ int main(int argc, const char *argv[]) { assert(a_w_l_2.state == LOCK_REQUEST_COMPLETE && 
a_w_l_2.complete_r == 0); toku_lock_request_destroy(&a_w_l_2); - r = toku_lt_unlock(lt, txn_a); assert(r == 0); + r = toku_lt_unlock_txn(lt, txn_a); assert(r == 0); // shutdown r = toku_lt_close(lt); assert(r == 0); diff --git a/src/range_tree/log_nooverlap.c b/src/range_tree/log_nooverlap.c index 3e7bbc1f101..4753cc2aa2d 100644 --- a/src/range_tree/log_nooverlap.c +++ b/src/range_tree/log_nooverlap.c @@ -215,8 +215,6 @@ toku_rt_insert(toku_range_tree* tree, toku_range* range) { insert_range = toku_xmalloc(sizeof *insert_range); *insert_range = *range; size_t start_omt_size = toku_omt_memory_size(tree->i.omt); - static int count = 0; - count++; r = toku_omt_insert_at(tree->i.omt, insert_range, index); assert_zero(r); size_t end_omt_size = toku_omt_memory_size(tree->i.omt); diff --git a/src/tests/Makefile b/src/tests/Makefile index 57ad83dedfd..0d6f632cecc 100644 --- a/src/tests/Makefile +++ b/src/tests/Makefile @@ -178,6 +178,7 @@ BDB_DONTRUN_TESTS = \ perf_malloc_free \ perf_nop \ perf_ptquery \ + perf_ptquery2 \ perf_xmalloc_free \ prelock-read-read \ prelock-read-write \ diff --git a/src/tests/blocking-c-del-deadlock.c b/src/tests/blocking-c-del-deadlock.c deleted file mode 100644 index 9da0c10e63b..00000000000 --- a/src/tests/blocking-c-del-deadlock.c +++ /dev/null @@ -1,215 +0,0 @@ -// verify that cursor deletes without write locks can detect deadlocks. 
- -#include "test.h" -#include "toku_pthread.h" - -static void populate(DB_ENV *db_env, DB *db, uint64_t nrows) { - int r; - - DB_TXN *txn = NULL; - r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0); - - for (uint64_t i = 0; i < nrows; i++) { - - uint64_t k = htonl(i); - uint64_t v = i; - DBT key = { .data = &k, .size = sizeof k }; - DBT val = { .data = &v, .size = sizeof v }; - r = db->put(db, txn, &key, &val, 0); assert(r == 0); - } - - r = txn->commit(txn, 0); assert(r == 0); -} - -struct my_callback_context { - DBT key; - DBT val; -}; - -#if TOKUDB -static void copy_dbt(DBT *dest, DBT const *src) { - assert(dest->flags == DB_DBT_REALLOC); - dest->size = src->size; - dest->data = toku_xrealloc(dest->data, dest->size); - memcpy(dest->data, src->data, dest->size); -} - -static int blocking_c_del_callback(DBT const *a UU(), DBT const *b UU(), void *e UU()) { - DBT const *found_key = a; - DBT const *found_val = b; - struct my_callback_context *context = (struct my_callback_context *) e; - copy_dbt(&context->key, found_key); - copy_dbt(&context->val, found_val); - return 0; -} -#endif - -static void blocking_c_del(DB_ENV *db_env, DB *db, uint64_t nrows, long sleeptime) { - int r; - - struct my_callback_context context; - context.key = (DBT) { .data = NULL, .size = 0, .flags = DB_DBT_REALLOC }; - context.val = (DBT) { .data = NULL, .size = 0, .flags = DB_DBT_REALLOC }; - - for (uint64_t i = 0; i < nrows; i++) { - DB_TXN *txn = NULL; - r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0); - - DBC *cursor = NULL; - r = db->cursor(db, txn, &cursor, 0); assert(r == 0); - - uint64_t k = htonl(i); - DBT key = { .data = &k, .size = sizeof k }; -#if TOKUDB - r = cursor->c_getf_set(cursor, 0, &key, blocking_c_del_callback, &context); -#else - r = cursor->c_get(cursor, &key, &context.val, DB_SET); -#endif - assert(r == 0 || r == DB_NOTFOUND); - - if (r == 0) { - - usleep(sleeptime); - - if (verbose) { - uint64_t kk; -#if TOKUDB - assert(context.key.size == 
sizeof kk); - memcpy(&kk, context.key.data, sizeof kk); -#else - assert(key.size == sizeof kk); - memcpy(&kk, key.data, sizeof kk); -#endif - printf("%lu deleting %lu\n", toku_pthread_self(), (long unsigned) htonl(kk)); - } - r = cursor->c_del(cursor, 0); - assert(r == 0 || r == DB_LOCK_DEADLOCK); - } - - { int rr = cursor->c_close(cursor); assert(rr == 0); } - - if (r == 0) { - if (verbose) printf("%lu commit\n", toku_pthread_self()); - r = txn->commit(txn, 0); - } else { - if (verbose) printf("%lu abort\n", toku_pthread_self()); - r = txn->abort(txn); - } - assert(r == 0); - if (verbose) - printf("%lu %lu\n", toku_pthread_self(), i); - } - - toku_free(context.key.data); - toku_free(context.val.data); -} - -struct blocking_c_del_args { - DB_ENV *db_env; - DB *db; - uint64_t nrows; - long sleeptime; -}; - -static void *blocking_c_del_thread(void *arg) { - struct blocking_c_del_args *a = (struct blocking_c_del_args *) arg; - blocking_c_del(a->db_env, a->db, a->nrows, a->sleeptime); - return arg; -} - -static void run_test(DB_ENV *db_env, DB *db, int nthreads, uint64_t nrows, long sleeptime) { - int r; - toku_pthread_t tids[nthreads]; - struct blocking_c_del_args a = { db_env, db, nrows, sleeptime }; - for (int i = 0; i < nthreads-1; i++) { - r = toku_pthread_create(&tids[i], NULL, blocking_c_del_thread, &a); assert(r == 0); - } - blocking_c_del(db_env, db, nrows, sleeptime); - for (int i = 0; i < nthreads-1; i++) { - void *ret; - r = toku_pthread_join(tids[i], &ret); assert(r == 0); - } -} - -int test_main(int argc, char * const argv[]) { - uint64_t cachesize = 0; - uint32_t pagesize = 0; - uint64_t nrows = 10; - int nthreads = 2; - long sleeptime = 100000; -#if defined(USE_TDB) - char *db_env_dir = "dir." __FILE__ ".tokudb"; -#elif defined(USE_BDB) - char *db_env_dir = "dir." 
__FILE__ ".bdb"; -#else -#error -#endif - char *db_filename = "test.db"; - int db_env_open_flags = DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOCK | DB_INIT_LOG | DB_THREAD; - - // parse_args(argc, argv); - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) { - verbose++; - continue; - } - if (strcmp(argv[i], "-q") == 0 || strcmp(argv[i], "--quiet") == 0) { - if (verbose > 0) - verbose--; - continue; - } - if (strcmp(argv[i], "--nrows") == 0 && i+1 < argc) { - nrows = atoll(argv[++i]); - continue; - } - if (strcmp(argv[i], "--nthreads") == 0 && i+1 < argc) { - nthreads = atoi(argv[++i]); - continue; - } - if (strcmp(argv[i], "--sleeptime") == 0 && i+1 < argc) { - sleeptime = atol(argv[++i]); - continue; - } - assert(0); - } - - // setup env - int r; - char rm_cmd[strlen(db_env_dir) + strlen("rm -rf ") + 1]; - snprintf(rm_cmd, sizeof(rm_cmd), "rm -rf %s", db_env_dir); - r = system(rm_cmd); assert(r == 0); - - r = toku_os_mkdir(db_env_dir, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); assert(r == 0); - - DB_ENV *db_env = NULL; - r = db_env_create(&db_env, 0); assert(r == 0); - if (cachesize) { - const u_int64_t gig = 1 << 30; - r = db_env->set_cachesize(db_env, cachesize / gig, cachesize % gig, 1); assert(r == 0); - } - r = db_env->open(db_env, db_env_dir, db_env_open_flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0); -#if TOKUDB - r = db_env->set_lock_timeout(db_env, 30 * 1000); assert(r == 0); -#else - r = db_env->set_lk_detect(db_env, DB_LOCK_YOUNGEST); assert(r == 0); -#endif - - // create the db - DB *db = NULL; - r = db_create(&db, db_env, 0); assert(r == 0); - if (pagesize) { - r = db->set_pagesize(db, pagesize); assert(r == 0); - } - r = db->open(db, NULL, db_filename, NULL, DB_BTREE, DB_CREATE|DB_AUTO_COMMIT|DB_THREAD, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0); - - // populate the db - populate(db_env, db, nrows); - - run_test(db_env, db, nthreads, nrows, 
sleeptime); - - // close env - r = db->close(db, 0); assert(r == 0); db = NULL; - r = db_env->close(db_env, 0); assert(r == 0); db_env = NULL; - - return 0; -} diff --git a/src/tests/blocking-c-del.c b/src/tests/blocking-c-del.c deleted file mode 100644 index 62b3d902558..00000000000 --- a/src/tests/blocking-c-del.c +++ /dev/null @@ -1,213 +0,0 @@ -// verify that cursor deletes with write locking cause transactions with lock conflicts to -// suspend the conflicting threads. - -#include "test.h" -#include "toku_pthread.h" - -static void populate(DB_ENV *db_env, DB *db, uint64_t nrows) { - int r; - - DB_TXN *txn = NULL; - r = db_env->txn_begin(db_env, NULL, &txn, 0); assert(r == 0); - - for (uint64_t i = 0; i < nrows; i++) { - - uint64_t k = htonl(i); - uint64_t v = i; - DBT key = { .data = &k, .size = sizeof k }; - DBT val = { .data = &v, .size = sizeof v }; - r = db->put(db, txn, &key, &val, 0); assert(r == 0); - } - - r = txn->commit(txn, 0); assert(r == 0); -} - -struct my_callback_context { - DBT key; - DBT val; -}; - -#if TOKUDB -static void copy_dbt(DBT *dest, DBT const *src) { - assert(dest->flags == DB_DBT_REALLOC); - dest->size = src->size; - dest->data = toku_xrealloc(dest->data, dest->size); - memcpy(dest->data, src->data, dest->size); -} - -static int blocking_c_del_callback(DBT const *a UU(), DBT const *b UU(), void *e UU()) { - DBT const *found_key = a; - DBT const *found_val = b; - struct my_callback_context *context = (struct my_callback_context *) e; - copy_dbt(&context->key, found_key); - copy_dbt(&context->val, found_val); - return 0; -} -#endif - -static void blocking_c_del(DB_ENV *db_env, DB *db, uint64_t nrows, long sleeptime) { - int r; - - struct my_callback_context context; - context.key = (DBT) { .data = NULL, .size = 0, .flags = DB_DBT_REALLOC }; - context.val = (DBT) { .data = NULL, .size = 0, .flags = DB_DBT_REALLOC }; - - for (uint64_t i = 0; i < nrows; i++) { - DB_TXN *txn = NULL; - r = db_env->txn_begin(db_env, NULL, &txn, 0); 
assert(r == 0); - - DBC *cursor = NULL; - r = db->cursor(db, txn, &cursor, 0); assert(r == 0); - - uint64_t k = htonl(i); - DBT key = { .data = &k, .size = sizeof k }; -#if TOKUDB - r = cursor->c_getf_set(cursor, DB_RMW, &key, blocking_c_del_callback, &context); -#else - r = cursor->c_get(cursor, &key, &context.val, DB_SET + DB_RMW); -#endif - assert(r == 0 || r == DB_NOTFOUND); - - if (r == 0) { - - usleep(sleeptime); - - if (verbose) { - uint64_t kk; -#if TOKUDB - assert(context.key.size == sizeof kk); - memcpy(&kk, context.key.data, sizeof kk); -#else - assert(key.size == sizeof kk); - memcpy(&kk, key.data, sizeof kk); -#endif - printf("%lu deleting %lu\n", toku_pthread_self(), (long unsigned) htonl(kk)); - } - r = cursor->c_del(cursor, 0); - assert(r == 0 || r == DB_LOCK_DEADLOCK); - } - - { int rr = cursor->c_close(cursor); assert(rr == 0); } - - if (r == 0) - r = txn->commit(txn, 0); - else - r = txn->abort(txn); - assert(r == 0); - if (verbose) - printf("%lu %lu\n", toku_pthread_self(), i); - } - - toku_free(context.key.data); - toku_free(context.val.data); -} - -struct blocking_c_del_args { - DB_ENV *db_env; - DB *db; - uint64_t nrows; - long sleeptime; -}; - -static void *blocking_c_del_thread(void *arg) { - struct blocking_c_del_args *a = (struct blocking_c_del_args *) arg; - blocking_c_del(a->db_env, a->db, a->nrows, a->sleeptime); - return arg; -} - -static void run_test(DB_ENV *db_env, DB *db, int nthreads, uint64_t nrows, long sleeptime) { - int r; - toku_pthread_t tids[nthreads]; - struct blocking_c_del_args a = { db_env, db, nrows, sleeptime }; - for (int i = 0; i < nthreads-1; i++) { - r = toku_pthread_create(&tids[i], NULL, blocking_c_del_thread, &a); assert(r == 0); - } - blocking_c_del(db_env, db, nrows, sleeptime); - for (int i = 0; i < nthreads-1; i++) { - void *ret; - r = toku_pthread_join(tids[i], &ret); assert(r == 0); - } -} - -int test_main(int argc, char * const argv[]) { - uint64_t cachesize = 0; - uint32_t pagesize = 0; - uint64_t 
nrows = 10; - int nthreads = 2; - long sleeptime = 100000; -#if defined(USE_TDB) - char *db_env_dir = "dir." __FILE__ ".tokudb"; -#elif defined(USE_BDB) - char *db_env_dir = "dir." __FILE__ ".bdb"; -#else -#error -#endif - char *db_filename = "test.db"; - int db_env_open_flags = DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOCK | DB_INIT_LOG | DB_THREAD; - - // parse_args(argc, argv); - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) { - verbose++; - continue; - } - if (strcmp(argv[i], "-q") == 0 || strcmp(argv[i], "--quiet") == 0) { - if (verbose > 0) - verbose--; - continue; - } - if (strcmp(argv[i], "--nrows") == 0 && i+1 < argc) { - nrows = atoll(argv[++i]); - continue; - } - if (strcmp(argv[i], "--nthreads") == 0 && i+1 < argc) { - nthreads = atoi(argv[++i]); - continue; - } - if (strcmp(argv[i], "--sleeptime") == 0 && i+1 < argc) { - sleeptime = atol(argv[++i]); - continue; - } - assert(0); - } - - // setup env - int r; - char rm_cmd[strlen(db_env_dir) + strlen("rm -rf ") + 1]; - snprintf(rm_cmd, sizeof(rm_cmd), "rm -rf %s", db_env_dir); - r = system(rm_cmd); assert(r == 0); - - r = toku_os_mkdir(db_env_dir, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); assert(r == 0); - - DB_ENV *db_env = NULL; - r = db_env_create(&db_env, 0); assert(r == 0); - if (cachesize) { - const u_int64_t gig = 1 << 30; - r = db_env->set_cachesize(db_env, cachesize / gig, cachesize % gig, 1); assert(r == 0); - } - r = db_env->open(db_env, db_env_dir, db_env_open_flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0); -#if TOKUDB - r = db_env->set_lock_timeout(db_env, 30 * 1000); assert(r == 0); -#else - r = db_env->set_lk_detect(db_env, DB_LOCK_YOUNGEST); assert(r == 0); -#endif - - // create the db - DB *db = NULL; - r = db_create(&db, db_env, 0); assert(r == 0); - if (pagesize) { - r = db->set_pagesize(db, pagesize); assert(r == 0); - } - r = db->open(db, NULL, db_filename, NULL, DB_BTREE, 
DB_CREATE|DB_AUTO_COMMIT|DB_THREAD, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); assert(r == 0); - - // populate the db - populate(db_env, db, nrows); - - run_test(db_env, db, nthreads, nrows, sleeptime); - - // close env - r = db->close(db, 0); assert(r == 0); db = NULL; - r = db_env->close(db_env, 0); assert(r == 0); db_env = NULL; - - return 0; -} diff --git a/src/tests/bug627.c b/src/tests/bug627.c deleted file mode 100644 index b7a7f2f43f8..00000000000 --- a/src/tests/bug627.c +++ /dev/null @@ -1,75 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." -#include "test.h" -/* See #627. */ - -#include -#include - -static void -do_627 (void) { - int r; - DB_ENV *env; - DB *db; - r = system("rm -rf " ENVDIR); - CKERR(r); - r=toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); assert(r==0); - - r=db_env_create(&env, 0); assert(r==0); - env->set_errfile(env, stderr); - r=env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); - r=db_create(&db, env, 0); CKERR(r); - - DB_TXN *t1, *t2; - DBT a,b; - r=env->txn_begin(env, 0, &t1, 0); assert(r==0); - r=db->open(db, t1, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); - r=db->put(db, t1, dbt_init(&a, "a", 2), dbt_init(&b, "b", 2), 0); - r=t1->commit(t1, 0); assert(r==0); - - r=env->txn_begin(env, 0, &t1, 0); assert(r==0); - r=env->txn_begin(env, 0, &t2, 0); assert(r==0); - - DBC *c1,*c2; - - r=db->cursor(db, t1, &c1, 0); CKERR(r); - r=db->cursor(db, t2, &c2, 0); CKERR(r); - - r=c1->c_get(c1, dbt_init(&a, "a", 2), dbt_init_malloc(&b), DB_SET); CKERR(r); - toku_free(b.data); - - r=c2->c_get(c2, dbt_init(&a, "a", 2), dbt_init_malloc(&b), DB_SET); CKERR(r); - toku_free(b.data); - - // This causes all hell to break loose in BDB 4.6, so we just cannot run this under BDB. 
- // PANIC: Invalid argument - // Expected DB_LOCK_NOTGRANTED, got DB_RUNRECOVERY: Fatal error, run database recovery - // bug627.bdb: bug627.c:44: do_627: Assertion `r==(-30994)' failed. - // Aborted - r=c1->c_del(c1, 0); - if (r!=DB_LOCK_NOTGRANTED) { - fprintf(stderr, "Expected DB_LOCK_NOTGRANTED, got %s\n", db_strerror(r)); - } - assert(r==DB_LOCK_NOTGRANTED); - - r=c1->c_close(c1); CKERR(r); - r=t1->commit(t1, 0); assert(r==0); - - r=c2->c_del(c2, 0); CKERR(r); - r=c2->c_close(c2); CKERR(r); - - r=t2->commit(t2, 0); assert(r==0); - - r=db->close(db, 0); CKERR(r); - r=env->close(env, 0); CKERR(r); - - -} - -int -test_main (int argc, char * const argv[]) { - parse_args(argc, argv); - do_627(); - return 0; -} - diff --git a/src/tests/cursor-more-than-a-leaf-provdel.c b/src/tests/cursor-more-than-a-leaf-provdel.c index a0e55d0ed95..10897338ca9 100644 --- a/src/tests/cursor-more-than-a-leaf-provdel.c +++ b/src/tests/cursor-more-than-a-leaf-provdel.c @@ -66,7 +66,7 @@ doit (BOOL committed_provdels) { r = dbc->c_get(dbc, &key, &data, DB_NEXT); CKERR(r); assert(*(int*)key.data == i); assert(*(int*)data.data == j); - r = dbc->c_del(dbc, 0); CKERR(r); + r = db->del(db, txn, &key, DB_DELETE_ANY); CKERR(r); } r = dbc->c_get(dbc, &key, &data, DB_NEXT); CKERR2(r, DB_NOTFOUND); r = dbc->c_get(dbc, &key, &data, DB_FIRST); CKERR2(r, DB_NOTFOUND); diff --git a/src/tests/loader-cleanup-test.c b/src/tests/loader-cleanup-test.c index f47125bbc71..4562c7105c3 100644 --- a/src/tests/loader-cleanup-test.c +++ b/src/tests/loader-cleanup-test.c @@ -1004,8 +1004,8 @@ static void do_args(int argc, char * const argv[]) { } else if (strcmp(argv[0], "-c")==0) { CHECK_RESULTS = 1; } else if (strcmp(argv[0], "-p")==0) { - USE_PUTS = LOADER_USE_PUTS; - printf("Using puts\n"); + USE_PUTS = 0; + printf("DISABLED Using puts as part of #4503\n"); } else if (strcmp(argv[0], "-k")==0) { test_only_abort_via_poll = 1; printf("Perform only abort_via_poll test\n"); diff --git a/src/tests/perf_ptquery2.c 
b/src/tests/perf_ptquery2.c new file mode 100644 index 00000000000..7f954635c86 --- /dev/null +++ b/src/tests/perf_ptquery2.c @@ -0,0 +1,80 @@ +/* -*- mode: C; c-basic-offset: 4 -*- */ +#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." +#ident "$Id: test_stress1.c 39258 2012-01-27 13:51:58Z zardosht $" +#include "test.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "threaded_stress_test_helpers.h" + + +static int ptquery_op2(DB_TXN *txn, ARG arg, void* operation_extra) { + int db_index = *(int *)operation_extra; + DB* db = arg->dbp[db_index]; + return ptquery_and_maybe_check_op(db, txn, arg, TRUE); +} + + +// +// This test is a form of stress that does operations on a single dictionary: +// We create a dictionary bigger than the cachetable (around 4x greater). +// Then, we spawn a bunch of pthreads that do the following: +// - scan dictionary forward with bulk fetch +// - scan dictionary forward slowly +// - scan dictionary backward with bulk fetch +// - scan dictionary backward slowly +// - Grow the dictionary with insertions +// - do random point queries into the dictionary +// With the small cachetable, this should produce quite a bit of churn in reading in and evicting nodes. +// If the test runs to completion without crashing, we consider it a success. It also tests that snapshots +// work correctly by verifying that table scans sum their vals to 0. 
+// +// This does NOT test: +// - splits and merges +// - multiple DBs +// +// Variables that are interesting to tweak and run: +// - small cachetable +// - number of elements +// + +static void +stress_table(DB_ENV* env, DB** dbp, struct cli_args *cli_args) { + int n = cli_args->num_elements; + // + // the threads that we want: + // - some threads constantly updating random values + // - one thread doing table scan with bulk fetch + // - one thread doing table scan without bulk fetch + // - some threads doing random point queries + // + + if (verbose) printf("starting creation of pthreads\n"); + const int num_threads = cli_args->num_ptquery_threads; + struct arg myargs[num_threads]; + int thread_ids[num_threads]; + for (int i = 0; i < num_threads; i++) { + arg_init(&myargs[i], n, dbp, env, cli_args); + } + for (int i = 0; i < num_threads; i++) { + thread_ids[i] = i % cli_args->num_DBs; + myargs[i].operation = ptquery_op2; + myargs[i].operation_extra = &thread_ids[i]; + } + run_workers(myargs, num_threads, cli_args->time_of_test, false, cli_args); +} + +int +test_main(int argc, char *const argv[]) { + struct cli_args args = get_default_args_for_perf(); + parse_stress_test_args(argc, argv, &args); + stress_test_main(&args); + return 0; +} diff --git a/src/tests/queries_with_deletes.c b/src/tests/queries_with_deletes.c index afe6ca356b9..a7f1dbf239f 100644 --- a/src/tests/queries_with_deletes.c +++ b/src/tests/queries_with_deletes.c @@ -18,7 +18,11 @@ int test_main (int argc, char * const argv[]) { env->set_errfile(env, stderr); // set a cachetable size of 10K u_int32_t cachesize = 10*1024; - r = env->set_cachesize(env, 0, cachesize, 1); CKERR(r); + // as part of #4503, arbitrarily increasing size of cachetable + // the idea is to make it small enough such that all data + // cannot fit in the cachetable, but big enough such that + // we don't have cachetable pressure + r = env->set_cachesize(env, 0, 4*cachesize, 1); CKERR(r); r = env->set_default_bt_compare(env, 
int64_dbt_cmp); CKERR(r); r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); diff --git a/src/tests/test938.c b/src/tests/test938.c index d73d1fee418..d4a38e9dc64 100644 --- a/src/tests/test938.c +++ b/src/tests/test938.c @@ -84,7 +84,7 @@ run (int choice) { i=0; while (0==(r=(c->c_get(c, &kdbt, &vdbt, DB_FIRST)))) { i++; - r=c->c_del(c, 0); + r = db->del(db, txn, &kdbt, DB_DELETE_ANY); CKERR(r); } assert(r==DB_NOTFOUND); diff --git a/src/tests/test_789.c b/src/tests/test_789.c index f9c24416e5f..49a3e8c605e 100644 --- a/src/tests/test_789.c +++ b/src/tests/test_789.c @@ -86,7 +86,7 @@ test_789(void) { r = db->cursor(db, txn, &cursor, 0); assert(r == 0); DBT key, val; r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_NEXT); assert(r == 0); - r = cursor->c_del(cursor, 0); assert(r == 0); + r = db->del(db, txn, &key, DB_DELETE_ANY); assert(r == 0); r = cursor->c_close(cursor); assert(r == 0); toku_free(key.data); toku_free(val.data); r = txn->commit(txn, 0); assert(r == 0); @@ -121,7 +121,7 @@ test_789(void) { r = db->cursor(db, txn, &cursor, 0); assert(r == 0); DBT key, val; r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_NEXT); assert(r == 0); - r = cursor->c_del(cursor, 0); assert(r == 0); + r = db->del(db, txn, &key, DB_DELETE_ANY); assert(r == 0); r = cursor->c_close(cursor); assert(r == 0); toku_free(key.data); toku_free(val.data); r = txn->commit(txn, 0); assert(r == 0); diff --git a/src/tests/test_bulk_fetch.c b/src/tests/test_bulk_fetch.c index d56ea830332..7a94e9c3082 100644 --- a/src/tests/test_bulk_fetch.c +++ b/src/tests/test_bulk_fetch.c @@ -102,7 +102,10 @@ test_bulk_fetch (u_int64_t n, BOOL prelock, BOOL disable_prefetching) { DB_ENV *env; r = db_env_create(&env, 0); assert(r == 0); r=env->set_default_bt_compare(env, int64_dbt_cmp); CKERR(r); - r = env->set_cachesize(env, 0, (u_int32_t)n, 1); assert(r == 0); + // arbitrarily have cachetable size be 4*n + // goal is to make it 
small enough such that all of data + // does not fit in cachetable, but not so small that we get thrashing + r = env->set_cachesize(env, 0, (u_int32_t)4*n, 1); assert(r == 0); r = env->open(env, ENVDIR, DB_CREATE+DB_PRIVATE+DB_INIT_MPOOL, 0); assert(r == 0); DB *db; diff --git a/src/tests/test_cursor_db_current.c b/src/tests/test_cursor_db_current.c index e21fd3c4271..7da6e7d6b86 100644 --- a/src/tests/test_cursor_db_current.c +++ b/src/tests/test_cursor_db_current.c @@ -46,9 +46,6 @@ test_cursor_current (void) { DBT key, data; int kk, vv; - r = cursor->c_del(cursor, 0); - assert(r == EINVAL); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&data), DB_CURRENT); assert(r == EINVAL); @@ -70,17 +67,12 @@ test_cursor_current (void) { assert(data.size == sizeof vv); memcpy(&vv, data.data, data.size); assert(vv == v); + r = db->del(db, null_txn, &key, DB_DELETE_ANY); toku_free(key.data); toku_free(data.data); - r = cursor->c_del(cursor, 0); - CKERR(r); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&data), DB_CURRENT); CKERR2(r,DB_KEYEMPTY); - r = cursor->c_del(cursor, 0); - CKERR2(r,DB_KEYEMPTY); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&data), DB_CURRENT); CKERR2(r,DB_KEYEMPTY); diff --git a/src/tests/test_cursor_delete.c b/src/tests/test_cursor_delete.c deleted file mode 100644 index 47baa39afd0..00000000000 --- a/src/tests/test_cursor_delete.c +++ /dev/null @@ -1,103 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." 
-#include "test.h" - -#include -#include - -#include -#include -#include -#include -#include - - -static void -cursor_expect (DBC *cursor, int k, int v, int op) { - DBT key, val; - int r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), op); - assert(r == 0); - assert(key.size == sizeof k); - int kk; - memcpy(&kk, key.data, key.size); - assert(val.size == sizeof v); - int vv; - memcpy(&vv, val.data, val.size); - if (kk != k || vv != v) printf("expect key %u got %u - %u %u\n", (uint32_t)htonl(k), (uint32_t)htonl(kk), (uint32_t)htonl(v), (uint32_t)htonl(vv)); - assert(kk == k); - assert(vv == v); - - toku_free(key.data); - toku_free(val.data); -} - - -/* generate a multi-level tree and delete all entries with a cursor - verify that the pivot flags are toggled (currently by inspection) */ - -static void -test_cursor_delete (int dup_mode) { - if (verbose) printf("test_cursor_delete:%d\n", dup_mode); - - int pagesize = 4096; - int elementsize = 32; - int npp = pagesize/elementsize; - int n = 16*npp; /* build a 2 level tree */ - - DB_TXN * const null_txn = 0; - const char * const fname = "test.cursor.delete.brt"; - int r; - - r = system("rm -rf " ENVDIR); assert(r == 0); - r = toku_os_mkdir(ENVDIR, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0); - - /* create the dup database file */ - DB_ENV *env; - r = db_env_create(&env, 0); assert(r == 0); -#ifdef USE_TDB - r = env->set_redzone(env, 0); CKERR(r); -#endif - r = env->open(env, ENVDIR, DB_CREATE+DB_PRIVATE+DB_INIT_MPOOL, 0); assert(r == 0); - - DB *db; - r = db_create(&db, env, 0); assert(r == 0); - db->set_errfile(db,0); // Turn off those annoying errors - r = db->set_flags(db, dup_mode); assert(r == 0); - r = db->set_pagesize(db, pagesize); assert(r == 0); - r = db->open(db, null_txn, fname, "main", DB_BTREE, DB_CREATE, 0666); assert(r == 0); - - int i; - for (i=0; iput(db, null_txn, dbt_init(&key, &k, sizeof k), dbt_init(&val, &v, sizeof v), 0); assert(r == 0); - } - - /* verify the sort order with a 
cursor */ - DBC *cursor; - r = db->cursor(db, null_txn, &cursor, 0); assert(r == 0); - - for (i=0; ic_del(cursor, 0); assert(r == 0); - } - - r = cursor->c_close(cursor); assert(r == 0); - - r = db->close(db, 0); assert(r == 0); - r = env->close(env, 0); assert(r == 0); -} - -int -test_main(int argc, char *const argv[]) { - parse_args(argc, argv); - - test_cursor_delete(0); -#ifdef USE_BDB - test_cursor_delete(DB_DUP); -#endif - - return 0; -} diff --git a/src/tests/test_cursor_delete2a.c b/src/tests/test_cursor_delete2a.c deleted file mode 100644 index 82c299893da..00000000000 --- a/src/tests/test_cursor_delete2a.c +++ /dev/null @@ -1,76 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." -#include "test.h" - -#include -#include - -#include -#include -#include -#include -#include - - -static DB_ENV *dbenv; -static DB *db; -static DB_TXN * txn; -static DBC *cursor; - -static void -test_cursor_delete2 (void) { - int r; - DBT key,val; - - r = db_env_create(&dbenv, 0); CKERR(r); - r = dbenv->open(dbenv, ENVDIR, DB_PRIVATE|DB_INIT_MPOOL|DB_CREATE|DB_INIT_TXN, 0); CKERR(r); - - r = db_create(&db, dbenv, 0); CKERR(r); - r = dbenv->txn_begin(dbenv, 0, &txn, 0); CKERR(r); - r = db->open(db, txn, "primary.db", NULL, DB_BTREE, DB_CREATE, 0600); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - r = dbenv->txn_begin(dbenv, 0, &txn, 0); CKERR(r); - r = db->put(db, txn, dbt_init(&key, "a", 2), dbt_init(&val, "b", 2), 0); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - r = dbenv->txn_begin(dbenv, 0, &txn, 0); CKERR(r); - r = db->del(db, txn, dbt_init(&key, "a", 2), 0); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - r = dbenv->txn_begin(dbenv, 0, &txn, 0); CKERR(r); - r = db->put(db, txn, dbt_init(&key, "a", 2), dbt_init(&val, "c", 2), 0); CKERR(r); - - cursor=cursor; - - r = db->cursor(db, txn, &cursor, 0); CKERR(r); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_FIRST); CKERR(r); - 
assert(strcmp(key.data, "a")==0); toku_free(key.data); - assert(strcmp(val.data, "c")==0); toku_free(val.data); - r = cursor->c_del(cursor, 0); CKERR(r); - r = cursor->c_del(cursor, 0); assert(r==DB_KEYEMPTY); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_NEXT); assert(r==DB_NOTFOUND); - - r = cursor->c_close(cursor); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - - - r = db->close(db, 0); CKERR(r); - r = dbenv->close(dbenv, 0); CKERR(r); -} - -int -test_main(int argc, char *const argv[]) { - - parse_args(argc, argv); - - int r; - r = system("rm -rf " ENVDIR); - CKERR(r); - toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); - - test_cursor_delete2(); - - return 0; -} diff --git a/src/tests/test_cursor_delete_2119.c b/src/tests/test_cursor_delete_2119.c deleted file mode 100644 index 6046a85ee1b..00000000000 --- a/src/tests/test_cursor_delete_2119.c +++ /dev/null @@ -1,89 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." 
-#include "test.h" - -#include -#include - -#include -#include -#include -#include -#include - - -static DB_ENV *dbenv; -static DB *db; -static DB_TXN * txn; -static DBC *cursor; - -static void -test_cursor_delete_2119 (u_int32_t c_del_flags, u_int32_t txn_isolation_flags) { - int r; - r = system("rm -rf " ENVDIR); - CKERR(r); - r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); - CKERR(r); - DBT key,val; - - r = db_env_create(&dbenv, 0); CKERR(r); - r = dbenv->open(dbenv, ENVDIR, DB_PRIVATE|DB_INIT_MPOOL|DB_CREATE|DB_INIT_TXN|DB_INIT_LOCK, 0); CKERR(r); - - r = db_create(&db, dbenv, 0); CKERR(r); - r = dbenv->txn_begin(dbenv, 0, &txn, txn_isolation_flags); CKERR(r); - r = db->open(db, txn, "primary.db", NULL, DB_BTREE, DB_CREATE, 0600); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - r = dbenv->txn_begin(dbenv, 0, &txn, txn_isolation_flags); CKERR(r); - r = db->put(db, txn, dbt_init(&key, "a", 2), dbt_init(&val, "b", 2), 0); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - r = dbenv->txn_begin(dbenv, 0, &txn, txn_isolation_flags); CKERR(r); - r = db->del(db, txn, dbt_init(&key, "a", 2), 0); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - r = dbenv->txn_begin(dbenv, 0, &txn, txn_isolation_flags); CKERR(r); - r = db->put(db, txn, dbt_init(&key, "a", 2), dbt_init(&val, "c", 2), 0); CKERR(r); - - cursor=cursor; - - r = db->cursor(db, txn, &cursor, 0); CKERR(r); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_FIRST); CKERR(r); - assert(strcmp(key.data, "a")==0); toku_free(key.data); - assert(strcmp(val.data, "c")==0); toku_free(val.data); - r = cursor->c_del(cursor, c_del_flags); CKERR(r); - r = cursor->c_del(cursor, c_del_flags); assert(r==DB_KEYEMPTY); - r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_NEXT); assert(r==DB_NOTFOUND); - - r = cursor->c_close(cursor); CKERR(r); - r = txn->commit(txn, 0); CKERR(r); - - - - r = db->close(db, 0); CKERR(r); - r = dbenv->close(dbenv, 0); CKERR(r); -} - -int 
-test_main(int argc, char *const argv[]) { - - parse_args(argc, argv); - - int isolation; - int read_prelocked; - int write_prelocked; - for (isolation = 0; isolation < 2; isolation++) { - u_int32_t isolation_flag = isolation ? DB_READ_UNCOMMITTED : 0; - for (read_prelocked = 0; read_prelocked < 2; read_prelocked++) { - u_int32_t read_prelocked_flag = read_prelocked ? DB_PRELOCKED : 0; - for (write_prelocked = 0; write_prelocked < 2; write_prelocked++) { - u_int32_t write_prelocked_flag = write_prelocked ? DB_PRELOCKED_WRITE : 0; - test_cursor_delete_2119(read_prelocked_flag | write_prelocked_flag, - isolation_flag); - } - } - } - - - return 0; -} diff --git a/src/tests/test_cursor_delete_next.c b/src/tests/test_cursor_delete_next.c deleted file mode 100644 index 87fcf6de8da..00000000000 --- a/src/tests/test_cursor_delete_next.c +++ /dev/null @@ -1,82 +0,0 @@ -/* -*- mode: C; c-basic-offset: 4 -*- */ -#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." -#include "test.h" - -#include -#include - -#include -#include - - -// ENVDIR is defined in the Makefile - - -DB *db; -DB_ENV *env; -DBT key; -DBT value; -DBC *dbc; -DB_TXN *const null_txn = 0; - -static void -setup_db (char* name) { - int r; - - r = system("rm -rf " ENVDIR); - CKERR(r); - toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); - - r = db_env_create(&env, 0); CKERR(r); -#ifdef USE_TDB - r = env->set_redzone(env, 0); CKERR(r); -#endif - r = env->open(env, ENVDIR, DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL, 0666); CKERR(r); - r = db_create(&db, env, 0); CKERR(r); - r = db->set_pagesize(db, 4096); CKERR(r); - r = db->open(db, null_txn, name, "main", DB_BTREE, DB_CREATE, 0666); CKERR(r); -} - -static void -close_db (void) { - int r; - - r = db->close(db, 0); CKERR(r); - r = env->close(env, 0); CKERR(r); -} - -static void -insert (void) { - int r; - - dbt_init(&key, "key", sizeof("key")); - dbt_init(&value, "value1", sizeof("value1")); - r = db->put(db, null_txn, &key, &value, 0); CKERR(r); - - 
dbt_init(&key, "key2", sizeof("key2")); - dbt_init(&value, "value2", sizeof("value2")); - r = db->put(db, null_txn, &key, &value, 0); CKERR(r); -} - -static void -cursor_range_with_delete (u_int32_t flag) { - int r; - - r = db->cursor(db, null_txn, &dbc, 0); CKERR(r); - r = dbc->c_get(dbc, &key, &value, DB_FIRST); CKERR(r); - r = dbc->c_del(dbc, 0); CKERR(r); - r = dbc->c_get(dbc, &key, &value, flag); CKERR(r); - r = dbc->c_del(dbc, 0); CKERR(r); - r = dbc->c_close(dbc); CKERR(r); -} - -int -test_main(int argc, char *const argv[]) { - parse_args(argc, argv); - setup_db("next.db"); - insert(); - cursor_range_with_delete(DB_NEXT); - close_db(); - - return 0; -} diff --git a/src/tests/test_db_current_clobbers_db.c b/src/tests/test_db_current_clobbers_db.c index 83b1fdd7ef1..0f380f80aa3 100644 --- a/src/tests/test_db_current_clobbers_db.c +++ b/src/tests/test_db_current_clobbers_db.c @@ -60,7 +60,7 @@ test_main (int UU(argc), char UU(*const argv[])) { assert(*(int*)val.data == v1); // Will bring up valgrind error. - r = cursor->c_del(cursor, 0); + r = db->del(db, null_txn, &ckey, DB_DELETE_ANY); assert(r == 0); CKERR(r); assert(*(int*)val.data == v1); // Will bring up valgrind error. 
diff --git a/src/tests/test_db_txn_locks_nonheaviside.c b/src/tests/test_db_txn_locks_nonheaviside.c index 22754321835..52920bb548b 100644 --- a/src/tests/test_db_txn_locks_nonheaviside.c +++ b/src/tests/test_db_txn_locks_nonheaviside.c @@ -59,18 +59,6 @@ cget(BOOL success, BOOL find, char txn, int _key, int _data, else CKERR2s(r, DB_LOCK_DEADLOCK, DB_LOCK_NOTGRANTED); } -static void -cdel (BOOL success, BOOL find, char txn) { - int r; - - r = cursors[(int)txn]->c_del(cursors[(int)txn], 0); - if (success) { - if (find) CKERR(r); - else CKERR2(r, DB_NOTFOUND); - } - else CKERR2s(r, DB_LOCK_DEADLOCK, DB_LOCK_NOTGRANTED); -} - static void dbdel (BOOL success, BOOL find, char txn, int _key) { int r; @@ -457,30 +445,6 @@ test_prev (u_int32_t next_type) { close_dbs(); } -static void -test_cdel (void) { - /* ********************************************************************** */ - setup_dbs(); - put(TRUE, 'c', 1, 1); - early_commit('c'); - cget(TRUE, TRUE, 'a', 1, 1, 1, 1, DB_SET); - cdel(TRUE, TRUE, 'a'); - cget(FALSE, TRUE, 'b', 1, 1, 1, 1, DB_SET); - cget(FALSE, FALSE, 'b', 1, 2, 1, 2, DB_SET); - cget(FALSE, FALSE, 'b', 1, 0, 1, 0, DB_SET); - cget(TRUE, FALSE, 'b', 0, 0, 0, 0, DB_SET); - cget(TRUE, FALSE, 'b', 2, 10, 2, 10, DB_SET); - close_dbs(); - /* ********************************************************************** */ - setup_dbs(); - put(TRUE, 'c', 1, 1); - early_commit('c'); - cget(TRUE, TRUE, 'a', 1, 1, 1, 1, DB_SET); - cget(TRUE, TRUE, 'b', 1, 1, 1, 1, DB_SET); - cdel(FALSE, TRUE, 'a'); - close_dbs(); -} - static void test_dbdel (void) { /* If DB_DELETE_ANY changes to 0, then find is meaningful and @@ -524,8 +488,6 @@ test_current (void) { early_commit('a'); cget(TRUE, TRUE, 'b', 1, 1, 1, 1, DB_SET); cget(TRUE, TRUE, 'b', 1, 1, 1, 1, DB_CURRENT); - cdel(TRUE, TRUE, 'b'); - cget(TRUE, FALSE, 'b', 1, 1, 1, 1, DB_CURRENT); close_dbs(); } @@ -582,8 +544,6 @@ test (void) { test_prev( DB_PREV); test_prev( DB_PREV_NODUP); /* 
********************************************************************** */ - test_cdel(); - /* ********************************************************************** */ test_dbdel(); /* ********************************************************************** */ test_current(); diff --git a/src/tests/test_insert_cursor_delete_insert.c b/src/tests/test_insert_cursor_delete_insert.c index 2ea2f86dc53..60fbec76c1c 100644 --- a/src/tests/test_insert_cursor_delete_insert.c +++ b/src/tests/test_insert_cursor_delete_insert.c @@ -45,7 +45,7 @@ test_insert_delete_insert (void) { assert(r == 0); toku_free(val.data); - r = cursor->c_del(cursor, 0); + r = db->del(db, null_txn, &key, DB_DELETE_ANY); assert(r == 0); assert(r == 0); r = cursor->c_get(cursor, dbt_init_malloc(&key), dbt_init_malloc(&val), DB_CURRENT); diff --git a/src/tests/threaded_stress_test_helpers.h b/src/tests/threaded_stress_test_helpers.h index 1a33704572b..7cab3699a30 100644 --- a/src/tests/threaded_stress_test_helpers.h +++ b/src/tests/threaded_stress_test_helpers.h @@ -1234,12 +1234,10 @@ do_warm_cache(DB_ENV *env, DB **dbs, struct cli_args *args) scan_arg.operation_extra = &soe; scan_arg.operation = scan_op_no_check; scan_arg.lock_type = STRESS_LOCK_NONE; - struct worker_extra we; - we.thread_arg = &scan_arg; - we.operation_lock = NULL; - we.operation_lock_mutex = NULL; - we.num_operations_completed = 0; - worker(&we); + DB_TXN* txn = NULL; + int r = env->txn_begin(env, 0, &txn, 0); CKERR(r); + scan_op_no_check(txn, &scan_arg, &soe); + r = txn->commit(txn,0); CKERR(r); } static void diff --git a/src/ydb-internal.h b/src/ydb-internal.h index 4b7e7e43a04..5fb3c59a93d 100644 --- a/src/ydb-internal.h +++ b/src/ydb-internal.h @@ -67,10 +67,12 @@ struct __toku_db_env_internal { generate_row_for_put_func generate_row_for_put; generate_row_for_del_func generate_row_for_del; //void (*noticecall)(DB_ENV *, db_notices); + unsigned long cachetable_size; CACHETABLE cachetable; TOKULOGGER logger; toku_ltm* ltm; + int 
open_txns; // Number of open transactions DB *directory; // Maps dnames to inames DB *persistent_environment; // Stores environment settings, can be used for upgrade @@ -127,7 +129,6 @@ int toku_ydb_lock_destroy(void); void toku_ydb_lock(void); void toku_ydb_unlock(void); void toku_ydb_unlock_and_yield(unsigned long useconds); -toku_pthread_mutex_t *toku_ydb_mutex(void); void toku_ydb_lock_get_status(YDB_LOCK_STATUS statp); @@ -240,10 +241,28 @@ struct __toku_dbc_external { #define dbc_struct_i(x) (&((struct __toku_dbc_external *)x)->internal_part) +// needed in ydb_db.c +#define DB_ISOLATION_FLAGS (DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT | DB_SERIALIZABLE | DB_INHERIT_ISOLATION) -int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn, BOOL just_lock); -int toku_grab_write_lock(DB *db, DBT *key, TOKUTXN tokutxn); +static inline int +env_opened(DB_ENV *env) { + return env->i->cachetable != 0; +} +void env_note_zombie_db(DB_ENV *env, DB *db); +void env_panic(DB_ENV * env, int cause, char * msg); +void env_note_db_opened(DB_ENV *env, DB *db); +void env_note_db_closed(DB_ENV *env, DB *db); +void env_note_zombie_db_closed(DB_ENV *env, DB *db); +int toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags); +int toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags); + + +int toku_txn_begin_internal(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, bool internal, bool holds_ydb_lock); +int toku_txn_commit(DB_TXN * txn, u_int32_t flags, TXN_PROGRESS_POLL_FUNCTION, void*, bool release_multi_operation_client_lock); +int toku_txn_abort(DB_TXN * txn, TXN_PROGRESS_POLL_FUNCTION, void*, bool release_multi_operation_client_lock); +int locked_txn_commit(DB_TXN *txn, u_int32_t flags); +int locked_txn_abort(DB_TXN *txn); #if defined(__cplusplus) } diff --git a/src/ydb.c b/src/ydb.c index 933c8c1a638..1e3cc34d1a6 100644 --- a/src/ydb.c +++ 
b/src/ydb.c @@ -35,7 +35,11 @@ const char *toku_copyright_string = "Copyright (c) 2007-2009 Tokutek Inc. All r #include "ydb_load.h" #include "brtloader.h" #include "log_header.h" - +#include "ydb_cursor.h" +#include "ydb_row_lock.h" +#include "ydb_env_func.h" +#include "ydb_db.h" +#include "ydb_write.h" #ifdef TOKUTRACE #define DB_ENV_CREATE_FUN db_env_create_toku10 @@ -47,12 +51,11 @@ const char *toku_copyright_string = "Copyright (c) 2007-2009 Tokutek Inc. All r int toku_close_trace_file (void) { return 0; } #endif -#define DB_ISOLATION_FLAGS (DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT | DB_SERIALIZABLE | DB_INHERIT_ISOLATION) - // Set when env is panicked, never cleared. static int env_is_panicked = 0; -static void + +void env_panic(DB_ENV * env, int cause, char * msg) { if (cause == 0) cause = -1; // if unknown cause, at least guarantee panic @@ -75,32 +78,10 @@ typedef enum { YDB_LAYER_TIME_CREATION = 0, /* timestamp of environment creation, read from persistent environment */ YDB_LAYER_TIME_STARTUP, /* timestamp of system startup */ YDB_LAYER_TIME_NOW, /* timestamp of engine status query */ - YDB_LAYER_NUM_INSERTS, - YDB_LAYER_NUM_INSERTS_FAIL, - YDB_LAYER_NUM_DELETES, - YDB_LAYER_NUM_DELETES_FAIL, - YDB_LAYER_NUM_UPDATES, - YDB_LAYER_NUM_UPDATES_FAIL, - YDB_LAYER_NUM_UPDATES_BROADCAST, - YDB_LAYER_NUM_UPDATES_BROADCAST_FAIL, - YDB_LAYER_NUM_MULTI_INSERTS, - YDB_LAYER_NUM_MULTI_INSERTS_FAIL, - YDB_LAYER_NUM_MULTI_DELETES, - YDB_LAYER_NUM_MULTI_DELETES_FAIL, - YDB_LAYER_NUM_MULTI_UPDATES, - YDB_LAYER_NUM_MULTI_UPDATES_FAIL, - YDB_LAYER_NUM_POINT_QUERIES, - YDB_LAYER_NUM_SEQUENTIAL_QUERIES, YDB_LAYER_NUM_DB_OPEN, YDB_LAYER_NUM_DB_CLOSE, YDB_LAYER_NUM_OPEN_DBS, YDB_LAYER_MAX_OPEN_DBS, - YDB_LAYER_DIRECTORY_READ_LOCKS, /* total directory read locks taken */ - YDB_LAYER_DIRECTORY_READ_LOCKS_FAIL, /* total directory read locks unable to be taken */ - YDB_LAYER_DIRECTORY_WRITE_LOCKS, /* total directory write locks taken */ - 
YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL, /* total directory write locks unable to be taken */ - YDB_LAYER_LOGSUPPRESS, /* number of times logs are suppressed for empty table (2440) */ - YDB_LAYER_LOGSUPPRESS_FAIL, /* number of times unable to suppress logs for empty table (2440) */ #if 0 YDB_LAYER_ORIGINAL_ENV_VERSION, /* version of original environment, read from persistent environment */ YDB_LAYER_STARTUP_ENV_VERSION, /* version of environment at this startup, read from persistent environment (curr_env_ver_key) */ @@ -135,32 +116,10 @@ ydb_layer_status_init (void) { STATUS_INIT(YDB_LAYER_TIME_CREATION, UNIXTIME, "time of environment creation"); STATUS_INIT(YDB_LAYER_TIME_STARTUP, UNIXTIME, "time of engine startup"); STATUS_INIT(YDB_LAYER_TIME_NOW, UNIXTIME, "time now"); - STATUS_INIT(YDB_LAYER_NUM_INSERTS, UINT64, "dictionary inserts"); - STATUS_INIT(YDB_LAYER_NUM_INSERTS_FAIL, UINT64, "dictionary inserts fail"); - STATUS_INIT(YDB_LAYER_NUM_DELETES, UINT64, "dictionary deletes"); - STATUS_INIT(YDB_LAYER_NUM_DELETES_FAIL, UINT64, "dictionary deletes fail"); - STATUS_INIT(YDB_LAYER_NUM_UPDATES, UINT64, "dictionary updates"); - STATUS_INIT(YDB_LAYER_NUM_UPDATES_FAIL, UINT64, "dictionary updates fail"); - STATUS_INIT(YDB_LAYER_NUM_UPDATES_BROADCAST, UINT64, "dictionary broadcast updates"); - STATUS_INIT(YDB_LAYER_NUM_UPDATES_BROADCAST_FAIL, UINT64, "dictionary broadcast updates fail"); - STATUS_INIT(YDB_LAYER_NUM_MULTI_INSERTS, UINT64, "dictionary multi inserts"); - STATUS_INIT(YDB_LAYER_NUM_MULTI_INSERTS_FAIL, UINT64, "dictionary multi inserts fail"); - STATUS_INIT(YDB_LAYER_NUM_MULTI_DELETES, UINT64, "dictionary multi deletes"); - STATUS_INIT(YDB_LAYER_NUM_MULTI_DELETES_FAIL, UINT64, "dictionary multi deletes fail"); - STATUS_INIT(YDB_LAYER_NUM_MULTI_UPDATES, UINT64, "dictionary updates multi"); - STATUS_INIT(YDB_LAYER_NUM_MULTI_UPDATES_FAIL, UINT64, "dictionary updates multi fail"); - STATUS_INIT(YDB_LAYER_NUM_POINT_QUERIES, UINT64, "dictionary point queries"); - 
STATUS_INIT(YDB_LAYER_NUM_SEQUENTIAL_QUERIES, UINT64, "dictionary sequential queries"); STATUS_INIT(YDB_LAYER_NUM_DB_OPEN, UINT64, "db opens"); STATUS_INIT(YDB_LAYER_NUM_DB_CLOSE, UINT64, "db closes"); STATUS_INIT(YDB_LAYER_NUM_OPEN_DBS, UINT64, "num open dbs now"); STATUS_INIT(YDB_LAYER_MAX_OPEN_DBS, UINT64, "max open dbs"); - STATUS_INIT(YDB_LAYER_DIRECTORY_READ_LOCKS, UINT64, "directory read locks"); - STATUS_INIT(YDB_LAYER_DIRECTORY_READ_LOCKS_FAIL, UINT64, "directory read locks fail"); - STATUS_INIT(YDB_LAYER_DIRECTORY_WRITE_LOCKS, UINT64, "directory write locks"); - STATUS_INIT(YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL, UINT64, "directory write locks fail"); - STATUS_INIT(YDB_LAYER_LOGSUPPRESS, UINT64, "log suppress"); - STATUS_INIT(YDB_LAYER_LOGSUPPRESS_FAIL, UINT64, "log suppress fail"); STATUS_VALUE(YDB_LAYER_TIME_STARTUP) = time(NULL); ydb_layer_status.initialized = true; @@ -181,8 +140,6 @@ ydb_layer_get_status(YDB_LAYER_STATUS statp) { static DB_ENV * volatile most_recent_env; // most recently opened env, used for engine status on crash. Note there are likely to be races on this if you have multiple threads creating and closing environments in parallel. We'll declare it volatile since at least that helps make sure the compiler doesn't optimize away certain code (e.g., if while debugging, you write a code that spins on most_recent_env, you'd like to compiler not to optimize your code away.) 
-static uint32_t engine_status_enable = 1; // if zero, suppress engine status output on failed assert, for test programs only - const char * environmentdictionary = "tokudb.environment"; const char * fileopsdirectory = "tokudb.directory"; @@ -224,8 +181,8 @@ single_process_unlock(int *lockfd) { } /** The default maximum number of persistent locks in a lock tree */ -const u_int32_t __toku_env_default_max_locks = 0x7FFFFFFF; -const uint64_t __toku_env_default_max_lock_memory = 1000*1024; +static const u_int32_t __toku_env_default_locks_limit = 0x7FFFFFFF; +static const uint64_t __toku_env_default_lock_memory_limit = 1000*1024; static inline DBT* init_dbt_realloc(DBT *dbt) { @@ -275,11 +232,6 @@ static int toku_env_set_data_dir(DB_ENV * env, const char *dir); static int toku_env_set_lg_dir(DB_ENV * env, const char *dir); static int toku_env_set_tmp_dir(DB_ENV * env, const char *tmp_dir); -static inline int -env_opened(DB_ENV *env) { - return env->i->cachetable != 0; -} - static void env_init_open_txn(DB_ENV *env) { env->i->open_txns = 0; @@ -297,9 +249,6 @@ env_remove_open_txn(DB_ENV *UU(env), DB_TXN *txn UU()) { (void) __sync_fetch_and_sub(&env->i->open_txns, 1); } -static int toku_txn_abort(DB_TXN * txn, TXN_PROGRESS_POLL_FUNCTION, void*, - bool release_multi_operation_client_lock); - static void env_fs_report_in_yellow(DB_ENV *UU(env)) { char tbuf[26]; @@ -433,57 +382,6 @@ env_fs_destroy(DB_ENV *env) { } } -// Check if the available file system space is less than the reserve -// Returns ENOSPC if not enough space, othersize 0 -static inline int -env_check_avail_fs_space(DB_ENV *env) { - int r = env->i->fs_state == FS_RED ? 
ENOSPC : 0; - if (r) env->i->enospc_redzone_ctr++; - return r; -} - -int -toku_ydb_check_avail_fs_space(DB_ENV *env) { - int rval = env_check_avail_fs_space(env); - return rval; -} - -/* db methods */ -static inline int db_opened(DB *db) { - return db->i->opened != 0; -} - - -static int toku_db_put(DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags); -static int toku_db_update(DB *db, DB_TXN *txn, const DBT *key, const DBT *update_function_extra, u_int32_t flags); -static int toku_db_update_broadcast(DB *db, DB_TXN *txn, const DBT *update_function_extra, u_int32_t flags); -static int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags); -static int toku_db_cursor_internal(DB *db, DB_TXN * txn, DBC **c, u_int32_t flags, int is_temporary_cursor); - -/* lightweight cursor methods. */ -static int toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); - -static int toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); - -static int toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); - -static int toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); - -static int toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); -static int toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); - -static int toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra); -static int toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra); -static int toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra); - -// Effect: Lightweight cursor get - -/* cursor methods */ -static int toku_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag); -static int toku_c_del(DBC *c, u_int32_t flags); -static int toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags); -static int 
toku_c_close(DBC * c); - static void env_setup_real_dir(DB_ENV *env, char **real_dir, const char *nominal_dir) { toku_free(*real_dir); @@ -530,14 +428,8 @@ needs_recovery (DB_ENV *env) { return recovery_needed ? DB_RUNRECOVERY : 0; } -static int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags); -static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode); static int toku_env_txn_checkpoint(DB_ENV * env, u_int32_t kbyte, u_int32_t min, u_int32_t flags); -static int toku_db_close(DB * db, u_int32_t flags); static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags); -static int toku_txn_begin_internal(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, bool internal, bool holds_ydb_lock); -static int toku_txn_commit(DB_TXN * txn, u_int32_t flags, TXN_PROGRESS_POLL_FUNCTION, void*, bool release_multi_operation_client_lock); -static int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode); static void finalize_file_removal(DICTIONARY_ID dict_id, void * extra); @@ -646,26 +538,26 @@ maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn, LSN const uint32_t curr_env_ver_d = toku_htod32(BRT_LAYOUT_VERSION); toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key)); toku_fill_dbt(&val, &curr_env_ver_d, sizeof(curr_env_ver_d)); - r = toku_db_put(persistent_environment, txn, &key, &val, 0); - assert(r==0); + r = toku_db_put(persistent_environment, txn, &key, &val, 0, TRUE); + assert_zero(r); uint64_t last_lsn_of_v13_d = toku_htod64(last_lsn_of_clean_shutdown_read_from_log.lsn); toku_fill_dbt(&key, last_lsn_of_v13_key, strlen(last_lsn_of_v13_key)); toku_fill_dbt(&val, &last_lsn_of_v13_d, sizeof(last_lsn_of_v13_d)); - r = toku_db_put(persistent_environment, txn, &key, &val, 0); - assert(r==0); + r = toku_db_put(persistent_environment, txn, &key, &val, 0, TRUE); + assert_zero(r); time_t upgrade_v14_time_d = 
toku_htod64(time(NULL)); toku_fill_dbt(&key, upgrade_v14_time_key, strlen(upgrade_v14_time_key)); toku_fill_dbt(&val, &upgrade_v14_time_d, sizeof(upgrade_v14_time_d)); - r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE); - assert(r==0); + r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE, TRUE); + assert_zero(r); uint64_t upgrade_v14_footprint_d = toku_htod64(toku_log_upgrade_get_footprint()); toku_fill_dbt(&key, upgrade_v14_footprint_key, strlen(upgrade_v14_footprint_key)); toku_fill_dbt(&val, &upgrade_v14_footprint_d, sizeof(upgrade_v14_footprint_d)); - r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE); - assert(r==0); + r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE, TRUE); + assert_zero(r); } return r; } @@ -849,20 +741,21 @@ ydb_maybe_upgrade_env (DB_ENV *env, LSN * last_lsn_of_clean_shutdown_read_from_l return r; } - static void unlock_single_process(DB_ENV *env) { int r; r = single_process_unlock(&env->i->envdir_lockfd); - lazy_assert(r==0); + lazy_assert_zero(r); r = single_process_unlock(&env->i->datadir_lockfd); - lazy_assert(r==0); + lazy_assert_zero(r); r = single_process_unlock(&env->i->logdir_lockfd); - lazy_assert(r==0); + lazy_assert_zero(r); r = single_process_unlock(&env->i->tmpdir_lockfd); - lazy_assert(r==0); + lazy_assert_zero(r); } +static int toku_db_lt_panic(DB* db, int r); + // Open the environment. // If this is a new environment, then create the necessary files. // Return 0 on success, ENOENT if any of the expected necessary files are missing. @@ -1016,9 +909,12 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { } } else { r = toku_logger_close(&env->i->logger); // if no logging system, then kill the logger - assert(r==0); + assert_zero(r); } + r = toku_ltm_open(env->i->ltm); + assert_zero(r); + unused_flags &= ~DB_INIT_MPOOL; // we always init an mpool. 
unused_flags &= ~DB_CREATE; // we always do DB_CREATE unused_flags &= ~DB_INIT_LOCK; // we check this later (e.g. in db->open) @@ -1050,22 +946,22 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { toku_logger_set_cachetable(env->i->logger, env->i->cachetable); toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm); r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, create_new_rollback_file); - assert(r==0); + assert_zero(r); } DB_TXN *txn=NULL; if (using_txns) { r = toku_txn_begin_internal(env, 0, &txn, 0, 1, true); - assert(r==0); + assert_zero(r); } { r = toku_db_create(&env->i->persistent_environment, env, 0); - assert(r==0); + assert_zero(r); r = db_use_builtin_key_cmp(env->i->persistent_environment); - assert(r==0); + assert_zero(r); r = db_open_iname(env->i->persistent_environment, txn, environmentdictionary, DB_CREATE, mode); - assert(r==0); + assert_zero(r); if (newenv) { // create new persistent_environment DBT key, val; @@ -1074,41 +970,41 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) { toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key)); toku_fill_dbt(&val, &environment_version, sizeof(environment_version)); - r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0); - assert(r==0); + r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0, TRUE); + assert_zero(r); toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key)); toku_fill_dbt(&val, &environment_version, sizeof(environment_version)); - r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0); - assert(r==0); + r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0, TRUE); + assert_zero(r); time_t creation_time_d = toku_htod64(time(NULL)); toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key)); toku_fill_dbt(&val, &creation_time_d, sizeof(creation_time_d)); - r = toku_db_put(env->i->persistent_environment, txn, 
&key, &val, 0); - assert(r==0); + r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0, TRUE); + assert_zero(r); } else { r = maybe_upgrade_persistent_environment_dictionary(env, txn, last_lsn_of_clean_shutdown_read_from_log); - assert(r==0); + assert_zero(r); } capture_persistent_env_contents(env, txn); } { r = toku_db_create(&env->i->directory, env, 0); - assert(r==0); + assert_zero(r); r = db_use_builtin_key_cmp(env->i->directory); - assert(r==0); + assert_zero(r); r = db_open_iname(env->i->directory, txn, fileopsdirectory, DB_CREATE, mode); - assert(r==0); + assert_zero(r); } if (using_txns) { r = toku_txn_commit(txn, 0, NULL, NULL, false); - assert(r==0); + assert_zero(r); } toku_ydb_unlock(); r = toku_checkpoint(env->i->cachetable, env->i->logger, NULL, NULL, NULL, NULL, STARTUP_CHECKPOINT); - assert(r==0); + assert_zero(r); toku_ydb_lock(); env_fs_poller(env); // get the file system state at startup env_fs_init_minicron(env); @@ -1144,7 +1040,7 @@ toku_env_close(DB_ENV * env, u_int32_t flags) { r = toku_ydb_do_error(env, EINVAL, "%s", err_msg); goto panic_and_quit_early; } - { //Verify open dbs. Zombies are ok at this stage, fully open is not. + if (env->i->open_dbs) { //Verify open dbs. Zombies are ok at this stage, fully open is not. 
uint32_t size = toku_omt_size(env->i->open_dbs); assert(size == env->i->num_open_dbs + env->i->num_zombie_dbs); if (env->i->num_open_dbs > 0) { @@ -1235,7 +1131,10 @@ toku_env_close(DB_ENV * env, u_int32_t flags) { assert(env->i->panic_string==0); env_fs_destroy(env); - toku_ltm_close(env->i->ltm); + if (env->i->ltm) { + toku_ltm_close(env->i->ltm); + env->i->ltm = NULL; + } if (env->i->data_dir) toku_free(env->i->data_dir); if (env->i->lg_dir) @@ -1301,8 +1200,6 @@ toku_env_set_cachesize(DB_ENV * env, u_int32_t gbytes, u_int32_t bytes, int ncac return 0; } -static int toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags); - static int locked_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags) { toku_multi_operation_client_lock(); //Cannot begin checkpoint @@ -1313,8 +1210,6 @@ locked_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *db return r; } -static int toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags); - static int locked_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) { toku_multi_operation_client_lock(); //Cannot begin checkpoint @@ -1440,11 +1335,14 @@ toku_env_set_lk_detect(DB_ENV * env, u_int32_t detect) { } static int -toku_env_set_lk_max_locks(DB_ENV *dbenv, u_int32_t max) { - int r = ENOSYS; - HANDLE_PANICKED_ENV(dbenv); - if (env_opened(dbenv)) { return EINVAL; } - r = toku_ltm_set_max_locks(dbenv->i->ltm, max); +toku_env_set_lk_max_locks(DB_ENV *env, u_int32_t locks_limit) { + HANDLE_PANICKED_ENV(env); + int r; + if (env_opened(env)) { + r = EINVAL; + } else { + r = toku_ltm_set_max_locks(env->i->ltm, locks_limit); + } return r; } @@ -1456,51 +1354,73 @@ toku_env_set_lk_max(DB_ENV * env, u_int32_t lk_max) { static int locked_env_set_lk_max(DB_ENV * env, u_int32_t lk_max) { - toku_ydb_lock(); int r = 
toku_env_set_lk_max(env, lk_max); toku_ydb_unlock(); return r; + toku_ydb_lock(); + int r = toku_env_set_lk_max(env, lk_max); + toku_ydb_unlock(); + return r; } #endif static int -toku_env_get_lk_max_locks(DB_ENV *dbenv, u_int32_t *lk_maxp) { - HANDLE_PANICKED_ENV(dbenv); - return toku_ltm_get_max_locks(dbenv->i->ltm, lk_maxp); -} - -static int -locked_env_set_lk_max_locks(DB_ENV *dbenv, u_int32_t max) { - toku_ydb_lock(); int r = toku_env_set_lk_max_locks(dbenv, max); toku_ydb_unlock(); return r; -} - -static int -locked_env_get_lk_max_locks(DB_ENV *dbenv, u_int32_t *lk_maxp) { - toku_ydb_lock(); int r = toku_env_get_lk_max_locks(dbenv, lk_maxp); toku_ydb_unlock(); return r; -} - -static int -toku_env_set_lk_max_memory(DB_ENV *dbenv, uint64_t max) { - int r = ENOSYS; - HANDLE_PANICKED_ENV(dbenv); - if (env_opened(dbenv)) { return EINVAL; } - r = toku_ltm_set_max_lock_memory(dbenv->i->ltm, max); +toku_env_get_lk_max_locks(DB_ENV *env, u_int32_t *lk_maxp) { + HANDLE_PANICKED_ENV(env); + int r; + if (lk_maxp == NULL) + r = EINVAL; + else { + r = toku_ltm_get_max_locks(env->i->ltm, lk_maxp); + } return r; } static int -toku_env_get_lk_max_memory(DB_ENV *dbenv, uint64_t *lk_maxp) { - HANDLE_PANICKED_ENV(dbenv); - return toku_ltm_get_max_lock_memory(dbenv->i->ltm, lk_maxp); -} - -static int -locked_env_set_lk_max_memory(DB_ENV *dbenv, uint64_t max) { +locked_env_set_lk_max_locks(DB_ENV *env, u_int32_t max) { toku_ydb_lock(); - int r = toku_env_set_lk_max_memory(dbenv, max); + int r = toku_env_set_lk_max_locks(env, max); toku_ydb_unlock(); return r; } -static int locked_env_get_lk_max_memory(DB_ENV *dbenv, uint64_t *lk_maxp) { - toku_ydb_lock(); int r = toku_env_get_lk_max_memory(dbenv, lk_maxp); toku_ydb_unlock(); return r; +static int +locked_env_get_lk_max_locks(DB_ENV *env, u_int32_t *lk_maxp) { + toku_ydb_lock(); + int r = toku_env_get_lk_max_locks(env, lk_maxp); + toku_ydb_unlock(); + return r; +} + +static int +toku_env_set_lk_max_memory(DB_ENV *env, uint64_t 
lock_memory_limit) { + HANDLE_PANICKED_ENV(env); + int r; + if (env_opened(env)) { + r = EINVAL; + } else { + r = toku_ltm_set_max_lock_memory(env->i->ltm, lock_memory_limit); + } + return r; +} + +static int +toku_env_get_lk_max_memory(DB_ENV *env, uint64_t *lk_maxp) { + HANDLE_PANICKED_ENV(env); + int r = toku_ltm_get_max_lock_memory(env->i->ltm, lk_maxp); + return r; +} + +static int +locked_env_set_lk_max_memory(DB_ENV *env, uint64_t max) { + toku_ydb_lock(); + int r = toku_env_set_lk_max_memory(env, max); + toku_ydb_unlock(); + return r; +} + +static int locked_env_get_lk_max_memory(DB_ENV *env, uint64_t *lk_maxp) { + toku_ydb_lock(); + int r = toku_env_get_lk_max_memory(env, lk_maxp); + toku_ydb_unlock(); + return r; } //void toku__env_set_noticecall (DB_ENV *env, void (*noticecall)(DB_ENV *, db_notices)) { @@ -1529,14 +1449,6 @@ toku_env_set_verbose(DB_ENV * env, u_int32_t which, int onoff) { return 1; } -// For test purposes only. -// These callbacks are never used in production code, only as a way to test the system -// (for example, by causing crashes at predictable times). 
-static void (*checkpoint_callback_f)(void*) = NULL; -static void * checkpoint_callback_extra = NULL; -static void (*checkpoint_callback2_f)(void*) = NULL; -static void * checkpoint_callback2_extra = NULL; - static int toku_env_txn_checkpoint(DB_ENV * env, u_int32_t kbyte __attribute__((__unused__)), u_int32_t min __attribute__((__unused__)), u_int32_t flags __attribute__((__unused__))) { int r = toku_checkpoint(env->i->cachetable, env->i->logger, @@ -1863,53 +1775,6 @@ locked_env_set_generate_row_callback_for_del(DB_ENV *env, generate_row_for_del_f return r; } -static int env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array); - -static int env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, - const DBT *src_key, const DBT *src_val, - uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array); - -static int env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, - DBT *old_src_key, DBT *old_src_data, - DBT *new_src_key, DBT *new_src_data, - uint32_t num_dbs, DB **db_array, uint32_t* flags_array, - uint32_t num_keys, DBT *keys, - uint32_t num_vals, DBT *vals); - -static int -locked_env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array) { - int r = env_check_avail_fs_space(env); - if (r == 0) { - toku_ydb_lock(); - r = env_put_multiple(env, src_db, txn, src_key, src_val, num_dbs, db_array, keys, vals, flags_array); - toku_ydb_unlock(); - } - return r; -} - -static int -locked_env_del_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, DB **db_array, DBT *keys, uint32_t *flags_array) { - toku_ydb_lock(); - int r = env_del_multiple(env, src_db, txn, src_key, src_val, num_dbs, db_array, keys, flags_array); - toku_ydb_unlock(); - return r; -} - -static int 
-locked_env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, - DBT *old_src_key, DBT *old_src_data, - DBT *new_src_key, DBT *new_src_data, - uint32_t num_dbs, DB **db_array, uint32_t* flags_array, - uint32_t num_keys, DBT *keys, - uint32_t num_vals, DBT *vals) { - toku_ydb_lock(); - int r = env_update_multiple(env, src_db, txn, old_src_key, old_src_data, new_src_key, new_src_data, num_dbs, db_array, flags_array, num_keys, keys, num_vals, vals); - toku_ydb_unlock(); - return r; -} - static int env_set_redzone(DB_ENV *env, int redzone) { HANDLE_PANICKED_ENV(env); @@ -2121,6 +1986,8 @@ static int env_get_engine_status_num_rows (DB_ENV * UU(env), uint64_t * num_rowsp) { uint64_t num_rows = 0; num_rows += YDB_LAYER_STATUS_NUM_ROWS; + num_rows += YDB_C_LAYER_STATUS_NUM_ROWS; + num_rows += YDB_WRITE_LAYER_STATUS_NUM_ROWS; num_rows += YDB_LOCK_STATUS_NUM_ROWS; num_rows += LE_STATUS_NUM_ROWS; num_rows += CP_STATUS_NUM_ROWS; @@ -2179,7 +2046,20 @@ env_get_engine_status (DB_ENV * env, TOKU_ENGINE_STATUS_ROW engstat, uint64_t ma engstat[row++] = ydb_stat.status[i]; } } - + { + YDB_C_LAYER_STATUS_S ydb_c_stat; + ydb_c_layer_get_status(&ydb_c_stat); + for (int i = 0; i < YDB_C_LAYER_STATUS_NUM_ROWS && row < maxrows; i++) { + engstat[row++] = ydb_c_stat.status[i]; + } + } + { + YDB_WRITE_LAYER_STATUS_S ydb_write_stat; + ydb_write_layer_get_status(&ydb_write_stat); + for (int i = 0; i < YDB_WRITE_LAYER_STATUS_NUM_ROWS && row < maxrows; i++) { + engstat[row++] = ydb_write_stat.status[i]; + } + } { YDB_LOCK_STATUS_S ydb_lock_status; toku_ydb_lock_get_status(&ydb_lock_status); @@ -2419,9 +2299,20 @@ env_crash(DB_ENV * UU(db_env), const char* msg, const char * fun, const char* fi return -1; // placate compiler } -static int toku_db_lt_panic(DB* db, int r); +static int +toku_db_lt_panic(DB* db, int r) { + assert(r!=0); + assert(db && db->i && db->dbenv && db->dbenv->i); + DB_ENV* env = db->dbenv; + char * panic_string; -static toku_dbt_cmp toku_db_get_compare_fun(DB* db); + if (r 
< 0) panic_string = toku_lt_strerror((TOKU_LT_ERROR)r); + else panic_string = "Error in locktree.\n"; + + env_panic(env, r, panic_string); + + return toku_ydb_do_error(env, r, "%s", panic_string); +} static int toku_env_create(DB_ENV ** envp, u_int32_t flags) { @@ -2444,9 +2335,6 @@ toku_env_create(DB_ENV ** envp, u_int32_t flags) { SENV(set_update); SENV(set_generate_row_callback_for_put); SENV(set_generate_row_callback_for_del); - SENV(put_multiple); - SENV(del_multiple); - SENV(update_multiple); SENV(checkpointing_set_period); SENV(checkpointing_get_period); SENV(cleaner_set_period); @@ -2485,7 +2373,11 @@ toku_env_create(DB_ENV ** envp, u_int32_t flags) { SENV(get_lock_timeout); SENV(set_lock_timeout); #undef SENV - + // methods with locking done internally + result->put_multiple = env_put_multiple; + result->del_multiple = env_del_multiple; + result->update_multiple = env_update_multiple; + // unlocked methods result->txn_checkpoint = toku_env_txn_checkpoint; result->checkpointing_postpone = env_checkpointing_postpone; @@ -2512,37 +2404,29 @@ toku_env_create(DB_ENV ** envp, u_int32_t flags) { env_init_open_txn(result); env_fs_init(result); - r = toku_ltm_create(&result->i->ltm, - __toku_env_default_max_locks, __toku_env_default_max_lock_memory, - toku_db_lt_panic, - toku_db_get_compare_fun); - if (r!=0) { goto cleanup; } - toku_ltm_set_mutex(result->i->ltm, toku_ydb_mutex()); + result->i->bt_compare = toku_builtin_compare_fun; - { - r = toku_logger_create(&result->i->logger); - if (r!=0) { goto cleanup; } - assert(result->i->logger); - } - { - r = toku_omt_create(&result->i->open_dbs); - if (r!=0) goto cleanup; - assert(result->i->open_dbs); - } + r = toku_logger_create(&result->i->logger); + assert_zero(r); + assert(result->i->logger); + + r = toku_ltm_create(&result->i->ltm, + __toku_env_default_locks_limit, + __toku_env_default_lock_memory_limit, + toku_db_lt_panic); + assert_zero(r); + assert(result->i->ltm); + + r = toku_omt_create(&result->i->open_dbs); 
+ assert_zero(r); + assert(result->i->open_dbs); *envp = result; r = 0; cleanup: if (r!=0) { if (result) { - if (result->i) { - if (result->i->ltm) { - toku_ltm_close(result->i->ltm); - } - if (result->i->open_dbs) - toku_omt_destroy(&result->i->open_dbs); - toku_free(result->i); - } + toku_free(result->i); toku_free(result); } } @@ -2560,15 +2444,15 @@ DB_ENV_CREATE_FUN (DB_ENV ** envp, u_int32_t flags) { static int toku_txn_release_locks(DB_TXN* txn) { assert(txn); - toku_lth* lth = db_txn_struct_i(txn)->lth; + toku_lth* lth = db_txn_struct_i(txn)->lth; int r = ENOSYS; int first_error = 0; if (lth) { toku_lth_start_scan(lth); toku_lock_tree* next = toku_lth_next(lth); while (next) { - r = toku_lt_unlock(next, toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn)); + r = toku_lt_unlock_txn(next, toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn)); if (!first_error && r!=0) { first_error = r; } if (r == 0) { r = toku_lt_remove_ref(next); @@ -2594,7 +2478,7 @@ ydb_yield (voidfp f, void *fv, void *UU(v)) { toku_ydb_lock(); } -static int +int toku_txn_commit(DB_TXN * txn, u_int32_t flags, TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra, bool release_multi_operation_client_lock) { @@ -2644,7 +2528,7 @@ toku_txn_commit(DB_TXN * txn, u_int32_t flags, } //If panicked, we're done. 
HANDLE_PANICKED_ENV(txn->mgrp); - assert(r==0); + assert_zero(r); // Close the logger after releasing the locks r = toku_txn_release_locks(txn); @@ -2715,7 +2599,7 @@ toku_txn_id(DB_TXN * txn) { return -1; } -static int +int toku_txn_abort(DB_TXN * txn, TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra, bool release_multi_operation_client_lock) { @@ -2747,7 +2631,7 @@ toku_txn_abort(DB_TXN * txn, env_panic(txn->mgrp, r, "Error during abort.\n"); } HANDLE_PANICKED_ENV(txn->mgrp); - assert(r==0); + assert_zero(r); r = toku_txn_release_locks(txn); //toku_logger_txn_close(db_txn_struct_i(txn)->tokutxn); toku_txn_close_txn(db_txn_struct_i(txn)->tokutxn); @@ -2800,7 +2684,7 @@ locked_txn_commit_with_progress(DB_TXN *txn, u_int32_t flags, toku_ydb_lock(); int r = toku_unpin_inprogress_rollback_log(ttxn); toku_ydb_unlock(); - assert(r==0); + assert_zero(r); if (toku_txn_requires_checkpoint(ttxn)) { toku_checkpoint(txn->mgrp->i->cachetable, txn->mgrp->i->logger, NULL, NULL, NULL, NULL, TXN_COMMIT_CHECKPOINT); } @@ -2821,21 +2705,21 @@ locked_txn_abort_with_progress(DB_TXN *txn, return r; } -static int +int locked_txn_commit(DB_TXN *txn, u_int32_t flags) { int r; r = locked_txn_commit_with_progress(txn, flags, NULL, NULL); return r; } -static int +int locked_txn_abort(DB_TXN *txn) { int r; r = locked_txn_abort_with_progress(txn, NULL, NULL); return r; } -static int +int toku_txn_begin_internal(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags, bool internal, bool holds_ydb_lock) { HANDLE_PANICKED_ENV(env); HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, stxn); //Cannot create child while child already exists. 
@@ -2999,39 +2883,6 @@ log_compare(const DB_LSN * a, const DB_LSN * b) { return 0; } -static void env_note_zombie_db_closed(DB_ENV *env, DB *db); - -static int -db_close_before_brt(DB *db, u_int32_t UU(flags)) { - int r; - char *error_string = NULL; - - if (db_opened(db) && db->i->dname) { - // internal (non-user) dictionary has no dname - env_note_zombie_db_closed(db->dbenv, db); // tell env that this db is no longer a zombie (it is completely closed) - } - r = toku_close_brt(db->i->brt, &error_string); - if (r) { - if (!error_string) - error_string = "Closing file\n"; - // Panicking the whole environment may be overkill, but I'm not sure what else to do. - env_panic(db->dbenv, r, error_string); - toku_ydb_do_error(db->dbenv, r, "%s", error_string); - } - else { - if (db->i->lt) { - toku_lt_remove_db_ref(db->i->lt, db); - } - // printf("%s:%d %d=__toku_db_close(%p)\n", __FILE__, __LINE__, r, db); - toku_sdbt_cleanup(&db->i->skey); - toku_sdbt_cleanup(&db->i->sval); - if (db->i->dname) toku_free(db->i->dname); - toku_free(db->i); - toku_free(db); - } - return r; -} - // return 0 if v and dbv refer to same db (including same dname) // return <0 if v is earlier in omt than dbv // return >0 if v is later in omt than dbv @@ -3054,7 +2905,7 @@ find_db_by_db (OMTVALUE v, void *dbv) { } // Tell env that there is a new db handle (with non-unique dname in db->i-dname) -static void +void env_note_db_opened(DB_ENV *env, DB *db) { assert(db->i->dname); // internal (non-user) dictionary has no dname assert(!db->i->is_zombie); @@ -3069,10 +2920,10 @@ env_note_db_opened(DB_ENV *env, DB *db) { r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx); assert(r==DB_NOTFOUND); //Must not already be there. 
r = toku_omt_insert_at(env->i->open_dbs, db, idx); - assert(r==0); + assert_zero(r); } -static void +void env_note_db_closed(DB_ENV *env, DB *db) { assert(db->i->dname); assert(!db->i->is_zombie); @@ -3084,14 +2935,14 @@ env_note_db_closed(DB_ENV *env, DB *db) { STATUS_VALUE(YDB_LAYER_NUM_OPEN_DBS) = env->i->num_open_dbs; STATUS_VALUE(YDB_LAYER_NUM_DB_CLOSE)++; r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx); - assert(r==0); //Must already be there. + assert_zero(r); //Must already be there. assert((DB*)dbv == db); r = toku_omt_delete_at(env->i->open_dbs, idx); - assert(r==0); + assert_zero(r); } // Tell env that there is a new db handle (with non-unique dname in db->i-dname) -static void +void env_note_zombie_db(DB_ENV *env, DB *db) { assert(db->i->dname); // internal (non-user) dictionary has no dname assert(db->i->is_zombie); @@ -3102,10 +2953,10 @@ env_note_zombie_db(DB_ENV *env, DB *db) { r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx); assert(r==DB_NOTFOUND); //Must not already be there. r = toku_omt_insert_at(env->i->open_dbs, db, idx); - assert(r==0); + assert_zero(r); } -static void +void env_note_zombie_db_closed(DB_ENV *env, DB *db) { assert(db->i->dname); assert(db->i->is_zombie); @@ -3115,10 +2966,10 @@ env_note_zombie_db_closed(DB_ENV *env, DB *db) { uint32_t idx; env->i->num_zombie_dbs--; r = toku_omt_find_zero(env->i->open_dbs, find_db_by_db, db, &dbv, &idx); - assert(r==0); //Must already be there. + assert_zero(r); //Must already be there. 
assert((DB*)dbv == db); r = toku_omt_delete_at(env->i->open_dbs, idx); - assert(r==0); + assert_zero(r); } static int @@ -3192,2137 +3043,6 @@ env_get_zombie_db_with_dname(DB_ENV *env, const char *dname) { return rval; } -void -toku_db_add_ref(DB *db) { - db->i->refs++; -} - -void -toku_db_release_ref(DB *db){ - db->i->refs--; -} - -//DB->close() -static int -toku_db_close(DB * db, u_int32_t flags) { - int r = 0; - if (db->i->refs != 1) { - r = EBUSY; - } else { - db->i->refs = 0; - if (db_opened(db) && db->i->dname) { - // internal (non-user) dictionary has no dname - env_note_db_closed(db->dbenv, db); // tell env that this db is no longer in use by the user of this api (user-closed, may still be in use by fractal tree internals) - db->i->is_zombie = TRUE; - env_note_zombie_db(db->dbenv, db); // tell env that this db is a zombie - } - //Remove from transaction's list of 'must close' if necessary. - if (!toku_list_empty(&db->i->dbs_that_must_close_before_abort)) - toku_list_remove(&db->i->dbs_that_must_close_before_abort); - - r = toku_brt_db_delay_closed(db->i->brt, db, db_close_before_brt, flags); - } - return r; -} - - -//Get the main portion of a cursor flag (excluding the bitwise or'd components). 
-static int -get_main_cursor_flag(u_int32_t flags) { - return flags & DB_OPFLAGS_MASK; -} - -static int -get_nonmain_cursor_flags(u_int32_t flags) { - return flags & ~(DB_OPFLAGS_MASK); -} - -static inline BOOL -toku_c_uninitialized(DBC* c) { - return toku_brt_cursor_uninitialized(dbc_struct_i(c)->c); -} - -typedef struct query_context_wrapped_t { - DBT *key; - DBT *val; - struct simple_dbt *skey; - struct simple_dbt *sval; -} *QUERY_CONTEXT_WRAPPED, QUERY_CONTEXT_WRAPPED_S; - -static inline void -query_context_wrapped_init(QUERY_CONTEXT_WRAPPED context, DBC *c, DBT *key, DBT *val) { - context->key = key; - context->val = val; - context->skey = dbc_struct_i(c)->skey; - context->sval = dbc_struct_i(c)->sval; -} - -static int -c_get_wrapper_callback(DBT const *key, DBT const *val, void *extra) { - QUERY_CONTEXT_WRAPPED context = extra; - int r; - r = toku_dbt_set(key->size, key->data, context->key, context->skey); - if (r==0) r = toku_dbt_set(val->size, val->data, context->val, context->sval); - return r; -} - -static int -toku_c_get_current_unconditional(DBC* c, u_int32_t flags, DBT* key, DBT* val) { - int r; - QUERY_CONTEXT_WRAPPED_S context; - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_current_binding(c, flags, c_get_wrapper_callback, &context); - return r; -} - -static inline void -toku_swap_flag(u_int32_t* flag, u_int32_t* get_flag, u_int32_t new_flag) { - *flag -= *get_flag; - *get_flag = new_flag; - *flag += *get_flag; -} - -/* - Used for partial implementation of nested transactions. - Work is done by children as normal, but all locking is done by the - root of the nested txn tree. - This may hold extra locks, and will not work as expected when - a node has two non-completed txns at any time. 
-*/ -static inline DB_TXN* -toku_txn_ancestor(DB_TXN* txn) { - while (txn && txn->parent) txn = txn->parent; - - return txn; -} - -static int toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt); - -/* c_get has many subfunctions with lots of parameters - * this structure exists to simplify it. */ -typedef struct { - DBC* c; // The cursor - DB* db; // db the cursor is iterating over - DB_TXN* txn_anc; // The (root) ancestor of the transaction - TXNID id_anc; - DBT tmp_key; // Temporary key to protect out param - DBT tmp_val; // Temporary val to protect out param - u_int32_t flag; // The c_get flag - u_int32_t op; // The operation portion of the c_get flag - u_int32_t lock_flags; // The prelock flags. - BOOL cursor_is_write; // Whether op can change position of cursor - BOOL key_is_read; - BOOL key_is_write; - BOOL val_is_read; - BOOL val_is_write; - BOOL duplicates; - BOOL tmp_key_malloced; - BOOL tmp_val_malloced; -} C_GET_VARS; - - -static inline u_int32_t -get_prelocked_flags(u_int32_t flags) { - u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); - return lock_flags; -} - -static inline u_int32_t -get_cursor_prelocked_flags(u_int32_t flags, DBC* dbc) { - u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); - - //DB_READ_UNCOMMITTED and DB_READ_COMMITTED transactions 'own' all read locks for user-data dictionaries. - if (dbc_struct_i(dbc)->iso != TOKU_ISO_SERIALIZABLE) { - lock_flags |= DB_PRELOCKED; - } - return lock_flags; -} - -static int -toku_c_get(DBC* c, DBT* key, DBT* val, u_int32_t flag) { - //This function exists for legacy (test compatibility) purposes/parity with bdb. - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - u_int32_t main_flag = get_main_cursor_flag(flag); - u_int32_t remaining_flags = get_nonmain_cursor_flags(flag); - int r; - QUERY_CONTEXT_WRAPPED_S context; - //Passing in NULL for a key or val means that it is NOT an output. 
- // Both key and val are output: - // query_context_wrapped_init(&context, c, key, val); - // Val is output, key is not: - // query_context_wrapped_init(&context, c, NULL, val); - // Neither key nor val are output: - // query_context_wrapped_init(&context, c, NULL, NULL); // Used for DB_GET_BOTH - switch (main_flag) { - case (DB_FIRST): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_first(c, remaining_flags, c_get_wrapper_callback, &context); - break; - case (DB_LAST): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_last(c, remaining_flags, c_get_wrapper_callback, &context); - break; - case (DB_NEXT): - case (DB_NEXT_NODUP): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_next(c, remaining_flags, c_get_wrapper_callback, &context); - break; - case (DB_PREV): - case (DB_PREV_NODUP): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_prev(c, remaining_flags, c_get_wrapper_callback, &context); - break; -#ifdef DB_PREV_DUP - case (DB_PREV_DUP): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_prev_dup(c, remaining_flags, c_get_wrapper_callback, &context); - break; -#endif - case (DB_CURRENT): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_current(c, remaining_flags, c_get_wrapper_callback, &context); - break; - case (DB_CURRENT_BINDING): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_current_binding(c, remaining_flags, c_get_wrapper_callback, &context); - break; - - case (DB_SET): - query_context_wrapped_init(&context, c, NULL, val); - r = toku_c_getf_set(c, remaining_flags, key, c_get_wrapper_callback, &context); - break; - case (DB_SET_RANGE): - query_context_wrapped_init(&context, c, key, val); - r = toku_c_getf_set_range(c, remaining_flags, key, c_get_wrapper_callback, &context); - break; - case (DB_SET_RANGE_REVERSE): - query_context_wrapped_init(&context, c, key, val); - r = 
toku_c_getf_set_range_reverse(c, remaining_flags, key, c_get_wrapper_callback, &context); - break; - default: - r = EINVAL; - break; - } - return r; -} - -static int -locked_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); - int r = toku_c_getf_first(c, flag, f, extra); - toku_ydb_unlock(); - return r; -} - -static int -locked_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_last(c, flag, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_next(c, flag, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_prev(c, flag, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_current(c, flag, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_current_binding(c, flag, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_set(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_set(c, flag, key, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_set_range(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_set_range(c, flag, key, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT * key, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = toku_c_getf_set_range_reverse(c, flag, key, f, extra); 
toku_ydb_unlock(); return r; -} - -// Get a range lock. -// Return when the range lock is acquired or the default lock tree timeout has expired. -// The ydb mutex must be held when called and may be released when waiting in the lock tree. -static int -get_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type) { - int r; - DB_TXN *txn_anc = toku_txn_ancestor(txn); - r = toku_txn_add_lt(txn_anc, db->i->lt); - if (r == 0) { - TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn); - toku_lock_request lock_request; - toku_lock_request_init(&lock_request, db, txn_anc_id, left_key, right_key, lock_type); - r = toku_lt_acquire_lock_request_with_default_timeout_locked(db->i->lt, &lock_request); - toku_lock_request_destroy(&lock_request); - } - return r; -} - -// Setup and start an asynchronous lock request. -static int -start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type, toku_lock_request *lock_request) { - int r; - DB_TXN *txn_anc = toku_txn_ancestor(txn); - r = toku_txn_add_lt(txn_anc, db->i->lt); - if (r == 0) { - TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn); - toku_lock_request_set(lock_request, db, txn_anc_id, left_key, right_key, lock_type); - r = toku_lock_request_start_locked(lock_request, db->i->lt, true); - } - return r; -} - -static int -get_point_write_lock(DB *db, DB_TXN *txn, const DBT *key) { - int r = get_range_lock(db, txn, key, key, LOCK_REQUEST_WRITE); - return r; -} - -//This is the user level callback function given to ydb layer functions like -//toku_c_getf_first - -typedef struct query_context_base_t { - BRT_CURSOR c; - DB_TXN *txn; - DB *db; - YDB_CALLBACK_FUNCTION f; - void *f_extra; - int r_user_callback; - BOOL do_locking; - BOOL is_write_op; - toku_lock_request lock_request; -} *QUERY_CONTEXT_BASE, QUERY_CONTEXT_BASE_S; - -typedef struct query_context_t { - QUERY_CONTEXT_BASE_S base; -} 
*QUERY_CONTEXT, QUERY_CONTEXT_S; - -typedef struct query_context_with_input_t { - QUERY_CONTEXT_BASE_S base; - DBT *input_key; - DBT *input_val; -} *QUERY_CONTEXT_WITH_INPUT, QUERY_CONTEXT_WITH_INPUT_S; - -static void -query_context_base_init(QUERY_CONTEXT_BASE context, DBC *c, u_int32_t flag, BOOL is_write_op, YDB_CALLBACK_FUNCTION f, void *extra) { - context->c = dbc_struct_i(c)->c; - context->txn = dbc_struct_i(c)->txn; - context->db = c->dbp; - context->f = f; - context->f_extra = extra; - context->is_write_op = is_write_op; - u_int32_t lock_flags = get_cursor_prelocked_flags(flag, c); - if (context->is_write_op) - lock_flags &= DB_PRELOCKED_WRITE; // Only care about whether already locked for write - context->do_locking = (BOOL)(context->db->i->lt!=NULL && !(lock_flags & (DB_PRELOCKED|DB_PRELOCKED_WRITE))); - context->r_user_callback = 0; - toku_lock_request_default_init(&context->lock_request); -} - -static void -query_context_base_destroy(QUERY_CONTEXT_BASE context) { - toku_lock_request_destroy(&context->lock_request); -} - -static void -query_context_init_read(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - BOOL is_write = FALSE; - query_context_base_init(&context->base, c, flag, is_write, f, extra); -} - -static void -query_context_init_write(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - BOOL is_write = TRUE; - query_context_base_init(&context->base, c, flag, is_write, f, extra); -} - -static void -query_context_with_input_init(QUERY_CONTEXT_WITH_INPUT context, DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) { - // grab write locks if the DB_RMW flag is set or the cursor was created with the DB_RMW flag - BOOL is_write = ((flag & DB_RMW) != 0) || dbc_struct_i(c)->rmw; - query_context_base_init(&context->base, c, flag, is_write, f, extra); - context->input_key = key; - context->input_val = val; -} - -static int c_del_callback(DBT const 
*key, DBT const *val, void *extra); - -//Delete whatever the cursor is pointing at. -static int -toku_c_del(DBC * c, u_int32_t flags) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - u_int32_t unchecked_flags = flags; - //DB_DELETE_ANY means delete regardless of whether it exists in the db. - u_int32_t flag_for_brt = flags&DB_DELETE_ANY; - unchecked_flags &= ~flag_for_brt; - u_int32_t lock_flags = get_cursor_prelocked_flags(flags, c); - unchecked_flags &= ~lock_flags; - BOOL do_locking = (BOOL)(c->dbp->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); - - int r = 0; - if (unchecked_flags!=0) - r = EINVAL; - else { - if (do_locking) { - QUERY_CONTEXT_S context; - query_context_init_write(&context, c, lock_flags, NULL, NULL); - while (r == 0) { - //We do not need a read lock, we must already have it. - r = toku_c_getf_current_binding(c, DB_PRELOCKED, c_del_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else - break; - } - query_context_base_destroy(&context.base); - } - if (r==0) { - //Do the actual delete. - TOKUTXN txn = dbc_struct_i(c)->txn ? db_txn_struct_i(dbc_struct_i(c)->txn)->tokutxn : 0; - r = toku_brt_cursor_delete(dbc_struct_i(c)->c, flag_for_brt, txn); - } - } - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) 
-static int -c_del_callback(DBT const *key, DBT const *val, void *extra) { - QUERY_CONTEXT_WITH_INPUT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - - assert(context->do_locking); - assert(context->is_write_op); - assert(key!=NULL); - assert(val!=NULL); - - //Lock: - // left(key,val)==right(key,val) == (key, val); - r = start_range_lock(context->db, context->txn, key, key, LOCK_REQUEST_WRITE, &context->lock_request); - - //Give brt-layer an error (if any) to return from toku_c_getf_current_binding - return r; -} - -static int c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static void -c_query_context_init(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - BOOL is_write_op = FALSE; - // grab write locks if the DB_RMW flag is set or the cursor was created with the DB_RMW flag - if ((flag & DB_RMW) || dbc_struct_i(c)->rmw) - is_write_op = TRUE; - if (is_write_op) - query_context_init_write(context, c, flag, f, extra); - else - query_context_init_read(context, c, flag, f, extra); -} - -static void -c_query_context_destroy(QUERY_CONTEXT context) { - query_context_base_destroy(&context->base); -} - -static int -toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - STATUS_VALUE(YDB_LAYER_NUM_POINT_QUERIES)++; - int r = 0; - QUERY_CONTEXT_S context; //Describes the context of this query. 
- c_query_context_init(&context, c, flag, f, extra); - while (r == 0) { - //toku_brt_cursor_first will call c_getf_first_callback(..., context) (if query is successful) - r = toku_brt_cursor_first(dbc_struct_i(c)->c, c_getf_first_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - c_query_context_destroy(&context); - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) -static int -c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - DBT found_key = { .data = (void *) key, .size = keylen }; - - if (context->do_locking) { - const DBT *left_key = toku_lt_neg_infinity; - const DBT *right_key = key != NULL ? &found_key : toku_lt_infinity; - r = start_range_lock(context->db, context->txn, left_key, right_key, - context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. 
- if (r==0 && key!=NULL && !lock_only) { - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_first - return r; -} - -static int c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - STATUS_VALUE(YDB_LAYER_NUM_POINT_QUERIES)++; - int r = 0; - QUERY_CONTEXT_S context; //Describes the context of this query. - c_query_context_init(&context, c, flag, f, extra); - while (r == 0) { - //toku_brt_cursor_last will call c_getf_last_callback(..., context) (if query is successful) - r = toku_brt_cursor_last(dbc_struct_i(c)->c, c_getf_last_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - c_query_context_destroy(&context); - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) -static int -c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - DBT found_key = { .data = (void *) key, .size = keylen }; - - if (context->do_locking) { - const DBT *left_key = key != NULL ? &found_key : toku_lt_neg_infinity; - const DBT *right_key = toku_lt_infinity; - r = start_range_lock(context->db, context->txn, left_key, right_key, - context->is_write_op ? 
LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. - if (r==0 && key!=NULL && !lock_only) { - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_last - return r; -} - -static int c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - int r; - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - if (toku_c_uninitialized(c)) - r = toku_c_getf_first(c, flag, f, extra); - else { - r = 0; - QUERY_CONTEXT_S context; //Describes the context of this query. - c_query_context_init(&context, c, flag, f, extra); - while (r == 0) { - //toku_brt_cursor_next will call c_getf_next_callback(..., context) (if query is successful) - r = toku_brt_cursor_next(dbc_struct_i(c)->c, c_getf_next_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - c_query_context_destroy(&context); - } - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) 
-static int -c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - - DBT found_key = { .data = (void *) key, .size = keylen }; - - if (context->do_locking) { - const DBT *prevkey, *prevval; - toku_brt_cursor_peek(context->c, &prevkey, &prevval); - const DBT *left_key = prevkey; - const DBT *right_key = key != NULL ? &found_key : toku_lt_infinity; - r = start_range_lock(context->db, context->txn, left_key, right_key, - context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. - if (r==0 && key!=NULL && !lock_only) { - STATUS_VALUE(YDB_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_next - return r; -} - -static int c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - int r; - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - if (toku_c_uninitialized(c)) - r = toku_c_getf_last(c, flag, f, extra); - else { - r = 0; - QUERY_CONTEXT_S context; //Describes the context of this query. 
- c_query_context_init(&context, c, flag, f, extra); - while (r == 0) { - //toku_brt_cursor_prev will call c_getf_prev_callback(..., context) (if query is successful) - r = toku_brt_cursor_prev(dbc_struct_i(c)->c, c_getf_prev_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - c_query_context_destroy(&context); - } - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) -static int -c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - DBT found_key = { .data = (void *) key, .size = keylen }; - - if (context->do_locking) { - const DBT *prevkey, *prevval; - toku_brt_cursor_peek(context->c, &prevkey, &prevval); - const DBT *left_key = key != NULL ? &found_key : toku_lt_neg_infinity; - const DBT *right_key = prevkey; - r = start_range_lock(context->db, context->txn, left_key, right_key, - context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. 
- if (r==0 && key!=NULL && !lock_only) { - STATUS_VALUE(YDB_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_prev - return r; -} - -static int c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - QUERY_CONTEXT_S context; //Describes the context of this query. - STATUS_VALUE(YDB_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability - c_query_context_init(&context, c, flag, f, extra); - //toku_brt_cursor_current will call c_getf_current_callback(..., context) (if query is successful) - int r = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT, c_getf_current_callback, &context); - if (r == TOKUDB_USER_CALLBACK_ERROR) r = context.base.r_user_callback; - c_query_context_destroy(&context); - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) -static int -c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - - //Call application-layer callback if found. 
- if (key!=NULL && !lock_only) { - DBT found_key = { .data = (void *) key, .size = keylen }; - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } else - r = 0; - - //Give brt-layer an error (if any) to return from toku_brt_cursor_current - return r; -} - -static int -toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - QUERY_CONTEXT_S context; //Describes the context of this query. - STATUS_VALUE(YDB_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability - c_query_context_init(&context, c, flag, f, extra); - //toku_brt_cursor_current will call c_getf_current_callback(..., context) (if query is successful) - int r = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT_BINDING, c_getf_current_callback, &context); - if (r == TOKUDB_USER_CALLBACK_ERROR) r = context.base.r_user_callback; - c_query_context_destroy(&context); - return r; -} - -static int c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - int r = 0; - QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query. 
- STATUS_VALUE(YDB_LAYER_NUM_POINT_QUERIES)++; - query_context_with_input_init(&context, c, flag, key, NULL, f, extra); - while (r == 0) { - //toku_brt_cursor_set will call c_getf_set_callback(..., context) (if query is successful) - r = toku_brt_cursor_set(dbc_struct_i(c)->c, key, c_getf_set_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - query_context_base_destroy(&context.base); - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) -static int -c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT_WITH_INPUT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - - //Lock: - // left(key,val) = (input_key, -infinity) - // right(key,val) = (input_key, found ? found_val : infinity) - if (context->do_locking) { - r = start_range_lock(context->db, context->txn, super_context->input_key, super_context->input_key, - context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. 
- if (r==0 && key!=NULL && !lock_only) { - DBT found_key = { .data = (void *) key, .size = keylen }; - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_set - return r; -} - -static int c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - int r = 0; - QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query. - STATUS_VALUE(YDB_LAYER_NUM_POINT_QUERIES)++; - query_context_with_input_init(&context, c, flag, key, NULL, f, extra); - while (r == 0) { - //toku_brt_cursor_set_range will call c_getf_set_range_callback(..., context) (if query is successful) - r = toku_brt_cursor_set_range(dbc_struct_i(c)->c, key, c_getf_set_range_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - query_context_base_destroy(&context.base); - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) -static int -c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT_WITH_INPUT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - DBT found_key = { .data = (void *) key, .size = keylen }; - - //Lock: - // left(key,val) = (input_key, -infinity) - // right(key) = found ? found_key : infinity - // right(val) = found ? 
found_val : infinity - if (context->do_locking) { - const DBT *left_key = super_context->input_key; - const DBT *right_key = key != NULL ? &found_key : toku_lt_infinity; - r = start_range_lock(context->db, context->txn, left_key, right_key, - context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. - if (r==0 && key!=NULL && !lock_only) { - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_set_range - return r; -} - -static int c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); - -static int -toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - - int r = 0; - QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query. - STATUS_VALUE(YDB_LAYER_NUM_POINT_QUERIES)++; - query_context_with_input_init(&context, c, flag, key, NULL, f, extra); - while (r == 0) { - //toku_brt_cursor_set_range_reverse will call c_getf_set_range_reverse_callback(..., context) (if query is successful) - r = toku_brt_cursor_set_range_reverse(dbc_struct_i(c)->c, key, c_getf_set_range_reverse_callback, &context); - if (r == DB_LOCK_NOTGRANTED) - r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); - else { - if (r == TOKUDB_USER_CALLBACK_ERROR) - r = context.base.r_user_callback; - break; - } - } - query_context_base_destroy(&context.base); - return r; -} - -//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) 
-static int -c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { - QUERY_CONTEXT_WITH_INPUT super_context = extra; - QUERY_CONTEXT_BASE context = &super_context->base; - - int r; - DBT found_key = { .data = (void *) key, .size = keylen }; - - //Lock: - // left(key) = found ? found_key : -infinity - // left(val) = found ? found_val : -infinity - // right(key,val) = (input_key, infinity) - if (context->do_locking) { - const DBT *left_key = key != NULL ? &found_key : toku_lt_neg_infinity; - const DBT *right_key = super_context->input_key; - r = start_range_lock(context->db, context->txn, left_key, right_key, - context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); - } else - r = 0; - - //Call application-layer callback if found and locks were successfully obtained. - if (r==0 && key!=NULL && !lock_only) { - DBT found_val = { .data = (void *) val, .size = vallen }; - context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); - r = context->r_user_callback; - } - - //Give brt-layer an error (if any) to return from toku_brt_cursor_set_range_reverse - return r; -} - -// Close a cursor. -// Does not require the ydb lock held when called. -static int -toku_c_close(DBC * c) { - HANDLE_PANICKED_DB(c->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); - int r = toku_brt_cursor_close(dbc_struct_i(c)->c); - toku_sdbt_cleanup(&dbc_struct_i(c)->skey_s); - toku_sdbt_cleanup(&dbc_struct_i(c)->sval_s); -#if !TOKUDB_NATIVE_H - toku_free(dbc_struct_i(c)); -#endif - toku_free(c); - return r; -} - -// Return the number of entries whose key matches the key currently -// pointed to by the brt cursor. 
-static int -toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) { - HANDLE_PANICKED_DB(cursor->dbp); - HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(cursor); - int r; - DBC *count_cursor = 0; - DBT currentkey; - - init_dbt_realloc(¤tkey); - u_int32_t lock_flags = get_cursor_prelocked_flags(flags, cursor); - flags &= ~lock_flags; - if (flags != 0) { - r = EINVAL; goto finish; - } - - r = toku_c_get_current_unconditional(cursor, lock_flags, ¤tkey, NULL); - if (r != 0) goto finish; - - //TODO: Optimization - //if (do_locking) { - // do a lock from currentkey,-infinity to currentkey,infinity - // lock_flags |= DB_PRELOCKED - //} - - r = toku_db_cursor_internal(cursor->dbp, dbc_struct_i(cursor)->txn, &count_cursor, DBC_DISABLE_PREFETCHING, 0); - if (r != 0) goto finish; - - r = toku_c_getf_set(count_cursor, lock_flags, ¤tkey, ydb_getf_do_nothing, NULL); - if (r==0) { - *count = 1; // there is a key, so the count is one (since we don't have DUP dbs anymore, the only answers are 0 or 1. - } else { - *count = 0; - } - r = 0; -finish: - if (currentkey.data) toku_free(currentkey.data); - if (count_cursor) { - int rr = toku_c_close(count_cursor); assert(rr == 0); - } - return r; -} - - -/////////// -//db_getf_XXX is equivalent to c_getf_XXX, without a persistent cursor - -static int -db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - DBC *c; - uint32_t create_flags = flags & (DB_ISOLATION_FLAGS | DB_RMW); - flags &= ~DB_ISOLATION_FLAGS; - int r = toku_db_cursor_internal(db, txn, &c, create_flags | DBC_DISABLE_PREFETCHING, 1); - if (r==0) { - r = toku_c_getf_set(c, flags, key, f, extra); - int r2 = toku_c_close(c); - if (r==0) r = r2; - } - return r; -} - -static int -toku_db_del(DB *db, DB_TXN *txn, DBT *key, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - - u_int32_t unchecked_flags = flags; - 
//DB_DELETE_ANY means delete regardless of whether it exists in the db. - BOOL error_if_missing = (BOOL)(!(flags&DB_DELETE_ANY)); - unchecked_flags &= ~DB_DELETE_ANY; - u_int32_t lock_flags = get_prelocked_flags(flags); - unchecked_flags &= ~lock_flags; - BOOL do_locking = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); - - int r = 0; - if (unchecked_flags!=0) { - r = EINVAL; - } - - if (r == 0 && error_if_missing) { - //Check if the key exists in the db. - r = db_getf_set(db, txn, lock_flags|DB_SERIALIZABLE|DB_RMW, key, ydb_getf_do_nothing, NULL); - } - if (r == 0 && do_locking) { - //Do locking if necessary. - r = get_point_write_lock(db, txn, key); - } - if (r == 0) { - //Do the actual deleting. - r = toku_brt_delete(db->i->brt, key, txn ? db_txn_struct_i(txn)->tokutxn : 0); - } - - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_DELETES)++; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_DELETES_FAIL)++; // accountability - - return r; -} - -static int -log_del_single(DB_TXN *txn, BRT brt, const DBT *key) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - int r = toku_brt_log_del(ttxn, brt, key); - return r; -} - -static uint32_t -sum_size(uint32_t num_keys, DBT keys[], uint32_t overhead) { - uint32_t sum = 0; - for (uint32_t i = 0; i < num_keys; i++) - sum += keys[i].size + overhead; - return sum; -} - -static int -log_del_multiple(DB_TXN *txn, DB *src_db, const DBT *key, const DBT *val, uint32_t num_dbs, BRT brts[], DBT keys[]) { - int r = 0; - if (num_dbs > 0) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - BRT src_brt = src_db ? 
src_db->i->brt : NULL; - uint32_t del_multiple_size = key->size + val->size + num_dbs*sizeof (uint32_t) + toku_log_enq_delete_multiple_overhead; - uint32_t del_single_sizes = sum_size(num_dbs, keys, toku_log_enq_delete_any_overhead); - if (del_single_sizes < del_multiple_size) { - for (uint32_t i = 0; r == 0 && i < num_dbs; i++) - r = log_del_single(txn, brts[i], &keys[i]); - } else { - r = toku_brt_log_del_multiple(ttxn, src_brt, brts, num_dbs, key, val); - } - } - return r; -} - -static uint32_t -lookup_src_db(uint32_t num_dbs, DB *db_array[], DB *src_db) { - uint32_t which_db; - for (which_db = 0; which_db < num_dbs; which_db++) - if (db_array[which_db] == src_db) - break; - return which_db; -} - -static int -do_del_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT keys[], DB *src_db, const DBT *src_key) { - src_db = src_db; src_key = src_key; - int r = 0; - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - for (uint32_t which_db = 0; r == 0 && which_db < num_dbs; which_db++) { - DB *db = db_array[which_db]; - - // if db is being indexed by an indexer, then insert a delete message into the db if the src key is to the left or equal to the - // indexers cursor. we have to get the src_db from the indexer and find it in the db_array. 
- int do_delete = TRUE; - DB_INDEXER *indexer = toku_db_get_indexer(db); - if (indexer) { // if this db is the index under construction - DB *indexer_src_db = toku_indexer_get_src_db(indexer); - invariant(indexer_src_db != NULL); - const DBT *indexer_src_key; - if (src_db == indexer_src_db) - indexer_src_key = src_key; - else { - uint32_t which_src_db = lookup_src_db(num_dbs, db_array, indexer_src_db); - invariant(which_src_db < num_dbs); - indexer_src_key = &keys[which_src_db]; - } - do_delete = !toku_indexer_is_key_right_of_le_cursor(indexer, indexer_src_db, indexer_src_key); - } - if (r == 0 && do_delete) { - r = toku_brt_maybe_delete(db->i->brt, &keys[which_db], ttxn, FALSE, ZERO_LSN, FALSE); - } - } - return r; -} - -static int -env_del_multiple( - DB_ENV *env, - DB *src_db, - DB_TXN *txn, - const DBT *src_key, - const DBT *src_val, - uint32_t num_dbs, - DB **db_array, - DBT *keys, - uint32_t *flags_array) -{ - int r; - DBT del_keys[num_dbs]; - - HANDLE_PANICKED_ENV(env); - - if (!txn) { - r = EINVAL; - goto cleanup; - } - if (!env->i->generate_row_for_del) { - r = EINVAL; - goto cleanup; - } - - HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn); - - { - uint32_t lock_flags[num_dbs]; - uint32_t remaining_flags[num_dbs]; - BRT brts[num_dbs]; - - for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { - DB *db = db_array[which_db]; - lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]); - remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db]; - - if (db == src_db) { - del_keys[which_db] = *src_key; - } - else { - //Generate the key - r = env->i->generate_row_for_del(db, src_db, &keys[which_db], src_key, src_val); - if (r != 0) goto cleanup; - del_keys[which_db] = keys[which_db]; - } - - if (remaining_flags[which_db] & ~DB_DELETE_ANY) { - r = EINVAL; - goto cleanup; - } - BOOL error_if_missing = (BOOL)(!(remaining_flags[which_db]&DB_DELETE_ANY)); - if (error_if_missing) { - //Check if the key exists in the db. 
- r = db_getf_set(db, txn, lock_flags[which_db]|DB_SERIALIZABLE|DB_RMW, &del_keys[which_db], ydb_getf_do_nothing, NULL); - if (r != 0) goto cleanup; - } - - //Do locking if necessary. - if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { - //Needs locking - r = get_point_write_lock(db, txn, &del_keys[which_db]); - if (r != 0) goto cleanup; - } - brts[which_db] = db->i->brt; - } - - if (num_dbs == 1) - r = log_del_single(txn, brts[0], &del_keys[0]); - else - r = log_del_multiple(txn, src_db, src_key, src_val, num_dbs, brts, del_keys); - - if (r == 0) - r = do_del_multiple(txn, num_dbs, db_array, del_keys, src_db, src_key); - } - -cleanup: - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_MULTI_DELETES) += num_dbs; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_MULTI_DELETES_FAIL) += num_dbs; // accountability - return r; -} - - -static int -locked_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag) { - toku_ydb_lock(); int r = toku_c_get(c, key, data, flag); toku_ydb_unlock(); - return r; -} - -static int -locked_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) { - toku_ydb_lock(); int r = toku_c_count(cursor, count, flags); toku_ydb_unlock(); return r; -} - -static int -locked_c_del(DBC * c, u_int32_t flags) { - toku_ydb_lock(); int r = toku_c_del(c, flags); toku_ydb_unlock(); return r; -} - -static int locked_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right); - -static int -toku_db_cursor_internal(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int is_temporary_cursor) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - DB_ENV* env = db->dbenv; - - if (flags & ~(DB_SERIALIZABLE | DB_INHERIT_ISOLATION | DB_RMW | DBC_DISABLE_PREFETCHING)) { - return toku_ydb_do_error( - env, - EINVAL, - "Invalid flags set for toku_db_cursor\n" - ); - } - - int r = 0; - - struct __toku_dbc_external *XMALLOC(eresult); // so the internal stuff is stuck on the end - memset(eresult, 0, sizeof(*eresult)); - 
DBC *result = &eresult->external_part; - - // methods that grab the ydb lock -#define SCRS(name) result->name = locked_ ## name - SCRS(c_get); - SCRS(c_del); - SCRS(c_count); - SCRS(c_getf_first); - SCRS(c_getf_last); - SCRS(c_getf_next); - SCRS(c_getf_prev); - SCRS(c_getf_current); - SCRS(c_getf_current_binding); - SCRS(c_getf_set); - SCRS(c_getf_set_range); - SCRS(c_getf_set_range_reverse); - SCRS(c_pre_acquire_range_lock); -#undef SCRS - // unlocked methods - result->c_close = toku_c_close; - -#if !TOKUDB_NATIVE_H - MALLOC(result->i); // otherwise it is allocated as part of result->ii - assert(result->i); -#endif - result->dbp = db; - - dbc_struct_i(result)->txn = txn; - dbc_struct_i(result)->skey_s = (struct simple_dbt){0,0}; - dbc_struct_i(result)->sval_s = (struct simple_dbt){0,0}; - if (is_temporary_cursor) { - dbc_struct_i(result)->skey = &db->i->skey; - dbc_struct_i(result)->sval = &db->i->sval; - } else { - dbc_struct_i(result)->skey = &dbc_struct_i(result)->skey_s; - dbc_struct_i(result)->sval = &dbc_struct_i(result)->sval_s; - } - if (flags & DB_SERIALIZABLE) { - dbc_struct_i(result)->iso = TOKU_ISO_SERIALIZABLE; - } else { - dbc_struct_i(result)->iso = txn ? db_txn_struct_i(txn)->iso : TOKU_ISO_SERIALIZABLE; - } - dbc_struct_i(result)->rmw = (flags & DB_RMW) != 0; - BOOL is_snapshot_read = FALSE; - if (txn) { - is_snapshot_read = (dbc_struct_i(result)->iso == TOKU_ISO_READ_COMMITTED || - dbc_struct_i(result)->iso == TOKU_ISO_SNAPSHOT); - } - r = toku_brt_cursor( - db->i->brt, - &dbc_struct_i(result)->c, - txn ? 
db_txn_struct_i(txn)->tokutxn : NULL, - is_snapshot_read, - ((flags & DBC_DISABLE_PREFETCHING) != 0) - ); - assert(r == 0 || r == TOKUDB_MVCC_DICTIONARY_TOO_NEW); - if (r == 0) { - *c = result; - } - else { -#if !TOKUDB_NATIVE_H - toku_free(result->i); // otherwise it is allocated as part of result->ii -#endif - toku_free(result); - } - return r; -} - -static inline int -db_thread_need_flags(DBT *dbt) { - return (dbt->flags & (DB_DBT_MALLOC+DB_DBT_REALLOC+DB_DBT_USERMEM)) == 0; -} - -static int -toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - int r; - u_int32_t iso_flags = flags & DB_ISOLATION_FLAGS; - - if ((db->i->open_flags & DB_THREAD) && db_thread_need_flags(data)) - return EINVAL; - - u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); - flags &= ~lock_flags; - flags &= ~DB_ISOLATION_FLAGS; - // And DB_GET_BOTH is no longer supported. #2862. - if (flags != 0) return EINVAL; - - - DBC *dbc; - r = toku_db_cursor_internal(db, txn, &dbc, iso_flags | DBC_DISABLE_PREFETCHING, 1); - if (r!=0) return r; - u_int32_t c_get_flags = DB_SET; - r = toku_c_get(dbc, key, data, c_get_flags | lock_flags); - int r2 = toku_c_close(dbc); - return r ? 
r : r2; -} - -#if 0 -static int -toku_db_key_range(DB * db, DB_TXN * txn, DBT * dbt, DB_KEY_RANGE * kr, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - txn=txn; dbt=dbt; kr=kr; flags=flags; - toku_ydb_barf(); - abort(); -} -#endif - -static int -toku_db_lt_panic(DB* db, int r) { - assert(r!=0); - assert(db && db->i && db->dbenv && db->dbenv->i); - DB_ENV* env = db->dbenv; - char * panic_string; - - if (r < 0) panic_string = toku_lt_strerror((TOKU_LT_ERROR)r); - else panic_string = "Error in locktree.\n"; - - env_panic(env, r, panic_string); - - return toku_ydb_do_error(env, r, "%s", panic_string); -} - -static int -toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt) { - int r = ENOSYS; - assert(txn && lt); - toku_lth* lth = db_txn_struct_i(txn)->lth; - // we used to initialize the transaction's lth during begin. - // Now we initialize the lth only if the transaction needs the lth, here - if (!lth) { - r = toku_lth_create(&db_txn_struct_i(txn)->lth); - assert_zero(r); - lth = db_txn_struct_i(txn)->lth; - } - - toku_lock_tree* find = toku_lth_find(lth, lt); - if (find) { - assert(find == lt); - r = 0; - goto cleanup; - } - r = toku_lth_insert(lth, lt); - if (r != 0) { goto cleanup; } - - toku_lt_add_ref(lt); - r = 0; -cleanup: - return r; -} - -static toku_dbt_cmp -toku_db_get_compare_fun(DB* db) { - return db->i->brt->compare_fun; -} - - -static int -db_open_subdb(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { - int r; - if (!fname || !dbname) r = EINVAL; - else { - char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)]; - int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname); - assert(bytes==(int)sizeof(subdb_full_name)-1); - const char *null_subdbname = NULL; - r = toku_db_open(db, txn, subdb_full_name, null_subdbname, dbtype, flags, mode); - } - return r; -} - -static void -create_iname_hint(const char *dname, char 
*hint) { - //Requires: size of hint array must be > strlen(dname) - //Copy alphanumeric characters only. - //Replace strings of non-alphanumeric characters with a single underscore. - BOOL underscored = FALSE; - while (*dname) { - if (isalnum(*dname)) { - char c = *dname++; - *hint++ = c; - underscored = FALSE; - } - else { - if (!underscored) - *hint++ = '_'; - dname++; - underscored = TRUE; - } - } - *hint = '\0'; -} - - -// n < 0 means to ignore mark and ignore n -// n >= 0 means to include mark ("_B_" or "_P_") with hex value of n in iname -// (intended for use by loader, which will create many inames using one txnid). -static char * -create_iname(DB_ENV *env, u_int64_t id, char *hint, char *mark, int n) { - int bytes; - char inamebase[strlen(hint) + - 8 + // hex file format version - 16 + // hex id (normally the txnid) - 8 + // hex value of n if non-neg - sizeof("_B___.tokudb")]; // extra pieces - if (n < 0) - bytes = snprintf(inamebase, sizeof(inamebase), - "%s_%"PRIx64"_%"PRIx32 ".tokudb", - hint, id, BRT_LAYOUT_VERSION); - else { - invariant(strlen(mark) == 1); - bytes = snprintf(inamebase, sizeof(inamebase), - "%s_%"PRIx64"_%"PRIx32"_%s_%"PRIx32".tokudb", - hint, id, BRT_LAYOUT_VERSION, mark, n); - } - assert(bytes>0); - assert(bytes<=(int)sizeof(inamebase)-1); - char *rval; - if (env->i->data_dir) - rval = toku_construct_full_name(2, env->i->data_dir, inamebase); - else - rval = toku_construct_full_name(1, inamebase); - assert(rval); - return rval; -} - - -static int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode); - - -// inames are created here. 
-// algorithm: -// begin txn -// convert dname to iname (possibly creating new iname) -// open file (toku_brt_open() will handle logging) -// close txn -// if created a new iname, take full range lock -static int -toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - if (dbname!=NULL) - return db_open_subdb(db, txn, fname, dbname, dbtype, flags, mode); - - // at this point fname is the dname - //This code ONLY supports single-db files. - assert(dbname==NULL); - const char * dname = fname; // db_open_subdb() converts (fname, dbname) to dname - - ////////////////////////////// do some level of parameter checking. - u_int32_t unused_flags = flags; - int using_txns = db->dbenv->i->open_flags & DB_INIT_TXN; - int r; - if (dbtype!=DB_BTREE && dbtype!=DB_UNKNOWN) return EINVAL; - int is_db_excl = flags & DB_EXCL; unused_flags&=~DB_EXCL; - int is_db_create = flags & DB_CREATE; unused_flags&=~DB_CREATE; - int is_db_hot_index = flags & DB_IS_HOT_INDEX; unused_flags&=~DB_IS_HOT_INDEX; - - //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided. - unused_flags&=~DB_READ_UNCOMMITTED; - unused_flags&=~DB_READ_COMMITTED; - unused_flags&=~DB_SERIALIZABLE; - if (unused_flags & ~DB_THREAD) return EINVAL; // unknown flags - - if (is_db_excl && !is_db_create) return EINVAL; - if (dbtype==DB_UNKNOWN && is_db_excl) return EINVAL; - - /* tokudb supports no duplicates and sorted duplicates only */ - unsigned int tflags; - r = toku_brt_get_flags(db->i->brt, &tflags); - if (r != 0) - return r; - - if (db_opened(db)) - return EINVAL; /* It was already open. 
*/ - ////////////////////////////// - - DB_TXN *child = NULL; - // begin child (unless transactionless) - if (using_txns) { - r = toku_txn_begin_internal(db->dbenv, txn, &child, DB_TXN_NOSYNC, 1, true); - assert(r==0); - } - - // convert dname to iname - // - look up dname, get iname - // - if dname does not exist, create iname and make entry in directory - DBT dname_dbt; // holds dname - DBT iname_dbt; // holds iname_in_env - toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1); - init_dbt_realloc(&iname_dbt); // sets iname_dbt.data = NULL - r = toku_db_get(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, DB_SERIALIZABLE); // allocates memory for iname - char *iname = iname_dbt.data; - if (r==DB_NOTFOUND && !is_db_create) - r = ENOENT; - else if (r==0 && is_db_excl) { - r = EEXIST; - } - else if (r==DB_NOTFOUND) { - char hint[strlen(dname) + 1]; - - // create iname and make entry in directory - u_int64_t id = 0; - - if (using_txns) { - id = toku_txn_get_txnid(db_txn_struct_i(child)->tokutxn); - } - create_iname_hint(dname, hint); - iname = create_iname(db->dbenv, id, hint, NULL, -1); // allocated memory for iname - toku_fill_dbt(&iname_dbt, iname, strlen(iname) + 1); - // - // 0 for performance only, avoid unnecessary query - // if we are creating a hot index, per #3166, we do not want the write lock in directory grabbed. - // directory read lock is grabbed in toku_db_get above - // - u_int32_t put_flags = 0 | ((is_db_hot_index) ? 
DB_PRELOCKED_WRITE : 0); - r = toku_db_put(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, put_flags); - } - - // we now have an iname - if (r == 0) { - r = db_open_iname(db, child, iname, flags, mode); - if (r==0) { - db->i->dname = toku_xstrdup(dname); - env_note_db_opened(db->dbenv, db); // tell env that a new db handle is open (using dname) - } - } - - // free string holding iname - if (iname) toku_free(iname); - - if (using_txns) { - // close txn - if (r == 0) { // commit - r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false); - invariant(r==0); // TODO panic - } - else { // abort - int r2 = toku_txn_abort(child, NULL, NULL, false); - invariant(r2==0); // TODO panic - } - } - - return r; -} - -static int -db_open_iname(DB * db, DB_TXN * txn, const char *iname_in_env, u_int32_t flags, int mode) { - int r; - - //Set comparison functions if not yet set. - if (!db->i->key_compare_was_set && db->dbenv->i->bt_compare) { - r = toku_brt_set_bt_compare(db->i->brt, db->dbenv->i->bt_compare); - assert(r==0); - db->i->key_compare_was_set = TRUE; - } - if (db->dbenv->i->update_function) { - r = toku_brt_set_update(db->i->brt,db->dbenv->i->update_function); - assert(r==0); - } - BOOL need_locktree = (BOOL)((db->dbenv->i->open_flags & DB_INIT_LOCK) && - (db->dbenv->i->open_flags & DB_INIT_TXN)); - - int is_db_excl = flags & DB_EXCL; flags&=~DB_EXCL; - int is_db_create = flags & DB_CREATE; flags&=~DB_CREATE; - //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided. - flags&=~DB_READ_UNCOMMITTED; - flags&=~DB_READ_COMMITTED; - flags&=~DB_SERIALIZABLE; - flags&=~DB_IS_HOT_INDEX; - if (flags & ~DB_THREAD) return EINVAL; // unknown flags - - if (is_db_excl && !is_db_create) return EINVAL; - - /* tokudb supports no duplicates and sorted duplicates only */ - unsigned int tflags; - r = toku_brt_get_flags(db->i->brt, &tflags); - if (r != 0) - return r; - - if (db_opened(db)) - return EINVAL; /* It was already open. 
*/ - - db->i->open_flags = flags; - db->i->open_mode = mode; - - r = toku_brt_open(db->i->brt, iname_in_env, - is_db_create, is_db_excl, - db->dbenv->i->cachetable, - txn ? db_txn_struct_i(txn)->tokutxn : NULL_TXN, - db); - if (r != 0) - goto error_cleanup; - - db->i->opened = 1; - if (need_locktree) { - db->i->dict_id = toku_brt_get_dictionary_id(db->i->brt); - r = toku_ltm_get_lt(db->dbenv->i->ltm, &db->i->lt, db->i->dict_id, db); - if (r!=0) { goto error_cleanup; } - } - //Add to transaction's list of 'must close' if necessary. - if (txn) { - //Do last so we don't have to undo. - toku_list_push(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort, - &db->i->dbs_that_must_close_before_abort); - } - - return 0; - -error_cleanup: - db->i->dict_id = DICTIONARY_ID_NONE; - db->i->opened = 0; - if (db->i->lt) { - toku_lt_remove_db_ref(db->i->lt, db); - db->i->lt = NULL; - } - return r; -} - -// Return 0 if proposed pair do not violate size constraints of DB -// (insertion is legal) -// Return non zero otherwise. 
-static int -db_put_check_size_constraints(DB *db, const DBT *key, const DBT *val) { - int r = 0; - unsigned int klimit, vlimit; - - toku_brt_get_maximum_advised_key_value_lengths(&klimit, &vlimit); - if (key->size > klimit) { - r = toku_ydb_do_error(db->dbenv, EINVAL, - "The largest key allowed is %u bytes", klimit); - } else if (val->size > vlimit) { - r = toku_ydb_do_error(db->dbenv, EINVAL, - "The largest value allowed is %u bytes", vlimit); - } - return r; -} - -// Return the maximum key and val size in -// *key_size and *val_size respectively -static void -db_get_max_row_size(DB * UU(db), uint32_t * max_key_size, uint32_t * max_val_size) { - *max_key_size = 0; - *max_val_size = 0; - toku_brt_get_maximum_advised_key_value_lengths(max_key_size, max_val_size); -} - -static void -locked_db_get_max_row_size(DB *db, uint32_t *max_key_size, uint32_t *max_val_size) { - toku_ydb_lock(); - db_get_max_row_size(db, max_key_size, max_val_size); - toku_ydb_unlock(); -} - -//Return 0 if insert is legal -static int -db_put_check_overwrite_constraint(DB *db, DB_TXN *txn, DBT *key, - u_int32_t lock_flags, u_int32_t overwrite_flag) { - int r; - - if (overwrite_flag == 0) { // 0 (yesoverwrite) does not impose constraints. - r = 0; - } else if (overwrite_flag == DB_NOOVERWRITE) { - // Check if (key,anything) exists in dictionary. - // If exists, fail. Otherwise, do insert. - // The DB_RMW flag causes the cursor to grab a write lock instead of a read lock on the key if it exists. - r = db_getf_set(db, txn, lock_flags|DB_SERIALIZABLE|DB_RMW, key, ydb_getf_do_nothing, NULL); - if (r == DB_NOTFOUND) - r = 0; - else if (r == 0) - r = DB_KEYEXIST; - //Any other error is passed through. - } else if (overwrite_flag == DB_NOOVERWRITE_NO_ERROR) { - r = 0; - } else { - //Other flags are not (yet) supported. 
- r = EINVAL; - } - return r; -} - -static int -toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - int r = 0; - - u_int32_t lock_flags = get_prelocked_flags(flags); - flags &= ~lock_flags; - - r = db_put_check_size_constraints(db, key, val); - if (r == 0) { - //Do any checking required by the flags. - r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags); - } - BOOL do_locking = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); - if (r == 0 && do_locking) { - //Do locking if necessary. - r = get_point_write_lock(db, txn, key); - } - if (r == 0) { - //Insert into the brt. - TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; - enum brt_msg_type type = BRT_INSERT; - if (flags==DB_NOOVERWRITE_NO_ERROR) - type = BRT_INSERT_NO_OVERWRITE; - r = toku_brt_maybe_insert(db->i->brt, key, val, ttxn, FALSE, ZERO_LSN, TRUE, type); - } - - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_INSERTS)++; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_INSERTS_FAIL)++; // accountability - - return r; -} - -static int toku_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn) { - // bad hack because some environment dictionaries do not have a dname - char *dname = db->i->dname; - if (!dname) - return 0; - - DBT key_in_directory = { .data = dname, .size = strlen(dname)+1 }; - //Left end of range == right end of range (point lock) - int r = get_range_lock(db->dbenv->i->directory, txn, &key_in_directory, &key_in_directory, LOCK_REQUEST_WRITE); - if (r == 0) - STATUS_VALUE(YDB_LAYER_DIRECTORY_WRITE_LOCKS)++; // accountability - else - STATUS_VALUE(YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL)++; // accountability - return r; -} - -static int -toku_db_update(DB *db, DB_TXN *txn, - const DBT *key, - const DBT *update_function_extra, - u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - int r = 0; - - u_int32_t lock_flags = 
get_prelocked_flags(flags); - flags &= ~lock_flags; - - r = db_put_check_size_constraints(db, key, update_function_extra); - if (r != 0) { goto cleanup; } - - BOOL do_locking = (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)); - if (do_locking) { - r = get_point_write_lock(db, txn, key); - if (r != 0) { goto cleanup; } - } - - TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; - r = toku_brt_maybe_update(db->i->brt, key, update_function_extra, ttxn, - FALSE, ZERO_LSN, TRUE); - -cleanup: - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_UPDATES)++; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_UPDATES_FAIL)++; // accountability - return r; -} - - -// DB_IS_RESETTING_OP is true if the dictionary should be considered as if created by this transaction. -// For example, it will be true if toku_db_update_broadcast() is used to implement a schema change (such -// as adding a column), and will be false if used simply to update all the rows of a table (such as -// incrementing a field). 
-static int -toku_db_update_broadcast(DB *db, DB_TXN *txn, - const DBT *update_function_extra, - u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - int r = 0; - - u_int32_t lock_flags = get_prelocked_flags(flags); - flags &= ~lock_flags; - u_int32_t is_resetting_op_flag = flags & DB_IS_RESETTING_OP; - flags &= is_resetting_op_flag; - BOOL is_resetting_op = (is_resetting_op_flag != 0); - - - if (is_resetting_op) { - if (txn->parent != NULL) { - r = EINVAL; // cannot have a parent if you are a resetting op - goto cleanup; - } - r = toku_db_pre_acquire_fileops_lock(db, txn); - if (r != 0) { goto cleanup; } - } - { - DBT null_key; - toku_init_dbt(&null_key); - r = db_put_check_size_constraints(db, &null_key, update_function_extra); - if (r != 0) { goto cleanup; } - } - - BOOL do_locking = (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)); - if (do_locking) { - r = toku_db_pre_acquire_table_lock(db, txn, TRUE); - if (r != 0) { goto cleanup; } - } - - TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; - r = toku_brt_maybe_update_broadcast(db->i->brt, update_function_extra, ttxn, - FALSE, ZERO_LSN, TRUE, is_resetting_op); - -cleanup: - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_UPDATES_BROADCAST)++; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_UPDATES_BROADCAST_FAIL)++; // accountability - return r; -} - -static int -log_put_single(DB_TXN *txn, BRT brt, const DBT *key, const DBT *val) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - int r = toku_brt_log_put(ttxn, brt, key, val); - return r; -} - -static int -log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, BRT brts[]) { - int r = 0; - if (num_dbs > 0) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - BRT src_brt = src_db ? 
src_db->i->brt : NULL; - r = toku_brt_log_put_multiple(ttxn, src_brt, brts, num_dbs, src_key, src_val); - } - return r; -} - -static int -do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT keys[], DBT vals[], DB *src_db, const DBT *src_key) { - int r = 0; - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - for (uint32_t which_db = 0; r == 0 && which_db < num_dbs; which_db++) { - DB *db = db_array[which_db]; - - // if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the - // indexers cursor. we have to get the src_db from the indexer and find it in the db_array. - int do_put = TRUE; - DB_INDEXER *indexer = toku_db_get_indexer(db); - if (indexer) { // if this db is the index under construction - DB *indexer_src_db = toku_indexer_get_src_db(indexer); - invariant(indexer_src_db != NULL); - const DBT *indexer_src_key; - if (src_db == indexer_src_db) - indexer_src_key = src_key; - else { - uint32_t which_src_db = lookup_src_db(num_dbs, db_array, indexer_src_db); - invariant(which_src_db < num_dbs); - indexer_src_key = &keys[which_src_db]; - } - do_put = !toku_indexer_is_key_right_of_le_cursor(indexer, indexer_src_db, indexer_src_key); - } - if (r == 0 && do_put) { - r = toku_brt_maybe_insert(db->i->brt, &keys[which_db], &vals[which_db], ttxn, FALSE, ZERO_LSN, FALSE, BRT_INSERT); - } - } - return r; -} - -static int -env_put_multiple( - DB_ENV *env, - DB *src_db, - DB_TXN *txn, - const DBT *src_key, - const DBT *src_val, - uint32_t num_dbs, - DB **db_array, - DBT *keys, - DBT *vals, - uint32_t *flags_array) -{ - int r; - DBT put_keys[num_dbs]; - DBT put_vals[num_dbs]; - - HANDLE_PANICKED_ENV(env); - - { - uint32_t lock_flags[num_dbs]; - uint32_t remaining_flags[num_dbs]; - BRT brts[num_dbs]; - - if (!txn || !num_dbs) { - r = EINVAL; - goto cleanup; - } - if (!env->i->generate_row_for_put) { - r = EINVAL; - goto cleanup; - } - - HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn); - - for (uint32_t which_db = 0; 
which_db < num_dbs; which_db++) { - DB *db = db_array[which_db]; - - lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]); - remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db]; - - //Generate the row - if (db == src_db) { - put_keys[which_db] = *src_key; - put_vals[which_db] = *src_val; - } - else { - r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], src_key, src_val); - if (r != 0) goto cleanup; - put_keys[which_db] = keys[which_db]; - put_vals[which_db] = vals[which_db]; - } - - // check size constraints - r = db_put_check_size_constraints(db, &put_keys[which_db], &put_vals[which_db]); - if (r != 0) goto cleanup; - - //Check overwrite constraints - r = db_put_check_overwrite_constraint(db, txn, - &put_keys[which_db], - lock_flags[which_db], remaining_flags[which_db]); - if (r != 0) goto cleanup; - if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { - //put_multiple does not support delaying the no error, since we would - //have to log the flag in the put_multiple. - r = EINVAL; goto cleanup; - } - - //Do locking if necessary. 
- if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { - //Needs locking - r = get_point_write_lock(db, txn, &put_keys[which_db]); - if (r != 0) goto cleanup; - } - brts[which_db] = db->i->brt; - } - - if (num_dbs == 1) - r = log_put_single(txn, brts[0], &put_keys[0], &put_vals[0]); - else - r = log_put_multiple(txn, src_db, src_key, src_val, num_dbs, brts); - - if (r == 0) - r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key); - - } - -cleanup: - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_MULTI_INSERTS) += num_dbs; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_MULTI_INSERTS_FAIL) += num_dbs; // accountability - return r; -} - -static int -env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, - DBT *old_src_key, DBT *old_src_data, - DBT *new_src_key, DBT *new_src_data, - uint32_t num_dbs, DB **db_array, uint32_t* flags_array, - uint32_t num_keys, DBT keys[], - uint32_t num_vals, DBT vals[]) { - int r = 0; - - HANDLE_PANICKED_ENV(env); - - if (!txn) { - r = EINVAL; - goto cleanup; - } - if (!env->i->generate_row_for_put) { - r = EINVAL; - goto cleanup; - } - - HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn); - - { - uint32_t n_del_dbs = 0; - DB *del_dbs[num_dbs]; - BRT del_brts[num_dbs]; - DBT del_keys[num_dbs]; - - uint32_t n_put_dbs = 0; - DB *put_dbs[num_dbs]; - BRT put_brts[num_dbs]; - DBT put_keys[num_dbs]; - DBT put_vals[num_dbs]; - - uint32_t lock_flags[num_dbs]; - uint32_t remaining_flags[num_dbs]; - - for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { - DB *db = db_array[which_db]; - DBT curr_old_key, curr_new_key, curr_new_val; - - lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]); - remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db]; - - // keys[0..num_dbs-1] are the new keys - // keys[num_dbs..2*num_dbs-1] are the old keys - // vals[0..num_dbs-1] are the new vals - - // Generate the old key and val - if (which_db + num_dbs >= num_keys) { - r = ENOMEM; goto 
cleanup; - } - if (db == src_db) { - curr_old_key = *old_src_key; - } - else { - r = env->i->generate_row_for_put(db, src_db, &keys[which_db + num_dbs], NULL, old_src_key, old_src_data); - if (r != 0) goto cleanup; - curr_old_key = keys[which_db + num_dbs]; - } - // Generate the new key and val - if (which_db >= num_keys || which_db >= num_vals) { - r = ENOMEM; goto cleanup; - } - if (db == src_db) { - curr_new_key = *new_src_key; - curr_new_val = *new_src_data; - } - else { - r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], new_src_key, new_src_data); - if (r != 0) goto cleanup; - curr_new_key = keys[which_db]; - curr_new_val = vals[which_db]; - } - toku_dbt_cmp cmpfun = toku_db_get_compare_fun(db); - BOOL key_eq = cmpfun(db, &curr_old_key, &curr_new_key) == 0; - if (!key_eq) { - //Check overwrite constraints only in the case where - // the keys are not equal. - // If the keys are equal, then we do not care of the flag is DB_NOOVERWRITE or 0 - r = db_put_check_overwrite_constraint(db, txn, - &curr_new_key, - lock_flags[which_db], remaining_flags[which_db]); - if (r != 0) goto cleanup; - if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { - //update_multiple does not support delaying the no error, since we would - //have to log the flag in the put_multiple. 
- r = EINVAL; goto cleanup; - } - - // lock old key - if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { - r = get_point_write_lock(db, txn, &curr_old_key); - if (r != 0) goto cleanup; - } - del_dbs[n_del_dbs] = db; - del_brts[n_del_dbs] = db->i->brt; - del_keys[n_del_dbs] = curr_old_key; - n_del_dbs++; - - } - - // we take a shortcut and avoid generating the old val - // we assume that any new vals with size > 0 are different than the old val - // if (!key_eq || !(dbt_cmp(&vals[which_db], &vals[which_db + num_dbs]) == 0)) { - if (!key_eq || curr_new_val.size > 0) { - r = db_put_check_size_constraints(db, &curr_new_key, &curr_new_val); - if (r != 0) goto cleanup; - - // lock new key - if (db->i->lt) { - r = get_point_write_lock(db, txn, &curr_new_key); - if (r != 0) goto cleanup; - } - put_dbs[n_put_dbs] = db; - put_brts[n_put_dbs] = db->i->brt; - put_keys[n_put_dbs] = curr_new_key; - put_vals[n_put_dbs] = curr_new_val; - n_put_dbs++; - } - } - - if (r == 0 && n_del_dbs > 0) { - if (n_del_dbs == 1) - r = log_del_single(txn, del_brts[0], &del_keys[0]); - else - r = log_del_multiple(txn, src_db, old_src_key, old_src_data, n_del_dbs, del_brts, del_keys); - if (r == 0) - r = do_del_multiple(txn, n_del_dbs, del_dbs, del_keys, src_db, old_src_key); - } - - if (r == 0 && n_put_dbs > 0) { - if (n_put_dbs == 1) - r = log_put_single(txn, put_brts[0], &put_keys[0], &put_vals[0]); - else - r = log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_brts); - if (r == 0) - r = do_put_multiple(txn, n_put_dbs, put_dbs, put_keys, put_vals, src_db, new_src_key); - } - } - -cleanup: - if (r == 0) - STATUS_VALUE(YDB_LAYER_NUM_MULTI_UPDATES) += num_dbs; // accountability - else - STATUS_VALUE(YDB_LAYER_NUM_MULTI_UPDATES_FAIL) += num_dbs; // accountability - return r; -} - -static int toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags); - //We do not (yet?) 
support deleting subdbs by deleting the enclosing 'fname' static int env_dbremove_subdb(DB_ENV * env, DB_TXN * txn, const char *fname, const char *dbname, int32_t flags) { @@ -5352,7 +3072,7 @@ finalize_file_removal(DICTIONARY_ID dict_id, void * extra) { //static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn); -static int +int toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, u_int32_t flags) { int r; HANDLE_PANICKED_ENV(env); @@ -5379,7 +3099,7 @@ toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbna // begin child (unless transactionless) if (using_txns) { r = toku_txn_begin_internal(env, txn, &child, DB_TXN_NOSYNC, 1, true); - assert(r==0); + assert_zero(r); } // get iname @@ -5389,11 +3109,11 @@ toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbna r = ENOENT; else if (r==0) { // remove (dname,iname) from directory - r = toku_db_del(env->i->directory, child, &dname_dbt, DB_DELETE_ANY); + r = toku_db_del(env->i->directory, child, &dname_dbt, DB_DELETE_ANY, TRUE); if (r == 0) { if (using_txns) { r = toku_brt_remove_on_commit(db_txn_struct_i(child)->tokutxn, &iname_dbt); - assert(r==0); + assert_zero(r); //Now that we have a writelock on dname, verify that there are still no handles open. 
(to prevent race conditions) if (r==0 && env_is_db_with_dname_open(env, dname)) r = toku_ydb_do_error(env, EINVAL, "Cannot remove dictionary with an open handle.\n"); @@ -5407,7 +3127,7 @@ toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbna } else { r = toku_brt_remove_now(env->i->cachetable, &iname_dbt); - assert(r==0); + assert_zero(r); } } } @@ -5430,16 +3150,6 @@ toku_env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbna } -static int -toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - DB_TXN *null_txn = NULL; - int r = toku_env_dbremove(db->dbenv, null_txn, fname, dbname, flags); - int r2 = toku_db_close(db, 0); - if (r==0) r = r2; - return r; -} - static int env_dbrename_subdb(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) { int r; @@ -5462,7 +3172,7 @@ env_dbrename_subdb(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbna } -static int +int toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, const char *newname, u_int32_t flags) { int r; HANDLE_PANICKED_ENV(env); @@ -5493,7 +3203,7 @@ toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbnam // begin child (unless transactionless) if (using_txns) { r = toku_txn_begin_internal(env, txn, &child, DB_TXN_NOSYNC, 1, true); - assert(r==0); + assert_zero(r); } r = toku_db_get(env->i->directory, child, &old_dname_dbt, &iname_dbt, DB_SERIALIZABLE); // allocates memory for iname @@ -5507,9 +3217,9 @@ toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbnam r = EEXIST; else if (r == DB_NOTFOUND) { // remove old (dname,iname) and insert (newname,iname) in directory - r = toku_db_del(env->i->directory, child, &old_dname_dbt, DB_DELETE_ANY); + r = toku_db_del(env->i->directory, child, &old_dname_dbt, DB_DELETE_ANY, TRUE); if (r == 0) - r = toku_db_put(env->i->directory, 
child, &new_dname_dbt, &iname_dbt, 0); + r = toku_db_put(env->i->directory, child, &new_dname_dbt, &iname_dbt, 0, TRUE); //Now that we have writelocks on both dnames, verify that there are still no handles open. (to prevent race conditions) if (r==0 && env_is_db_with_dname_open(env, dname)) r = toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary with an open handle.\n"); @@ -5550,789 +3260,6 @@ toku_env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbnam } -static int -toku_db_rename(DB * db, const char *fname, const char *dbname, const char *newname, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - DB_TXN *null_txn = NULL; - int r = toku_env_dbrename(db->dbenv, null_txn, fname, dbname, newname, flags); - int r2 = toku_db_close(db, 0); - if (r==0) r = r2; - return r; -} - -// -// This function is the only way to set a descriptor of a DB. -// -static int -toku_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - int r; - TOKUTXN ttxn = txn ? 
db_txn_struct_i(txn)->tokutxn : NULL; - DBT old_descriptor; - BOOL is_db_hot_index = ((flags & DB_IS_HOT_INDEX) != 0); - - toku_init_dbt(&old_descriptor); - if (!db_opened(db) || !txn || !descriptor || (descriptor->size>0 && !descriptor->data)){ - r = EINVAL; - goto cleanup; - } - if (txn->parent != NULL) { - r = EINVAL; // cannot have a parent if you are a resetting op - goto cleanup; - } - if (!is_db_hot_index) { - r = toku_db_pre_acquire_fileops_lock(db, txn); - if (r != 0) { goto cleanup; } - } - - old_descriptor.size = db->descriptor->dbt.size; - old_descriptor.data = toku_memdup(db->descriptor->dbt.data, db->descriptor->dbt.size); - r = toku_brt_change_descriptor(db->i->brt, &old_descriptor, descriptor, TRUE, ttxn); -cleanup: - if (old_descriptor.data) toku_free(old_descriptor.data); - return r; -} - -static int -toku_db_set_flags(DB *db, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - - /* the following matches BDB */ - if (db_opened(db) && flags != 0) return EINVAL; - - return 0; -} - -static int -toku_db_get_flags(DB *db, u_int32_t *pflags) { - HANDLE_PANICKED_DB(db); - if (!pflags) return EINVAL; - *pflags = 0; - return 0; -} - -static int -toku_db_set_pagesize(DB *db, u_int32_t pagesize) { - HANDLE_PANICKED_DB(db); - int r = toku_brt_set_nodesize(db->i->brt, pagesize); - return r; -} - -static int -toku_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) { - HANDLE_PANICKED_DB(db); - int r = toku_brt_get_nodesize(db->i->brt, pagesize_ptr); - return r; -} - -static int -toku_db_set_readpagesize(DB *db, u_int32_t readpagesize) { - HANDLE_PANICKED_DB(db); - int r = toku_brt_set_basementnodesize(db->i->brt, readpagesize); - return r; -} - -static int -toku_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) { - HANDLE_PANICKED_DB(db); - int r = toku_brt_get_basementnodesize(db->i->brt, readpagesize_ptr); - return r; -} - -static int -toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) { - HANDLE_PANICKED_DB(db); - 
HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - struct brtstat64_s brtstat; - TOKUTXN tokutxn = NULL; - if (txn != NULL) { - tokutxn = db_txn_struct_i(txn)->tokutxn; - } - int r = toku_brt_stat64(db->i->brt, tokutxn, &brtstat); - if (r==0) { - s->bt_nkeys = brtstat.nkeys; - s->bt_ndata = brtstat.ndata; - s->bt_dsize = brtstat.dsize; - s->bt_fsize = brtstat.fsize; - // 4018 - s->bt_create_time_sec = brtstat.create_time_sec; - s->bt_modify_time_sec = brtstat.modify_time_sec; - s->bt_verify_time_sec = brtstat.verify_time_sec; - } - return r; -} - -static int -locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) { - toku_ydb_lock(); - int r = toku_db_stat64(db, txn, s); - toku_ydb_unlock(); - return r; -} - - -static int -toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - - // note that toku_brt_keyrange does not have a txn param - // this will be fixed later - // temporarily, because the caller, locked_db_keyrange, - // has the ydb lock, we are ok - int r = toku_brt_keyrange(db->i->brt, key, less, equal, greater); - if (r != 0) { goto cleanup; } - // temporarily set is_exact to 0 because brt_keyrange does not have this parameter - *is_exact = 0; -cleanup: - return r; -} - -static int -toku_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right) { - DB *db = dbc->dbp; - DB_TXN *txn = dbc_struct_i(dbc)->txn; - HANDLE_PANICKED_DB(db); - toku_brt_cursor_set_range_lock(dbc_struct_i(dbc)->c, key_left, key_right, - (key_left == toku_lt_neg_infinity), - (key_right == toku_lt_infinity)); - if (!db->i->lt || !txn) - return 0; - //READ_UNCOMMITTED and READ_COMMITTED transactions do not need read locks. - if (!dbc_struct_i(dbc)->rmw && dbc_struct_i(dbc)->iso != TOKU_ISO_SERIALIZABLE) - return 0; - - toku_lock_type lock_type = dbc_struct_i(dbc)->rmw ? 
LOCK_REQUEST_WRITE : LOCK_REQUEST_READ; - int r = get_range_lock(db, txn, key_left, key_right, lock_type); - return r; -} - -//static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) { -// needed by loader.c -int -toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn, BOOL just_lock) { - HANDLE_PANICKED_DB(db); - if (!db->i->lt || !txn) return 0; - - int r; - - r = get_range_lock(db, txn, toku_lt_neg_infinity, toku_lt_infinity, LOCK_REQUEST_WRITE); - - if (r==0 && !just_lock && - !toku_brt_is_recovery_logging_suppressed(db->i->brt) && - toku_brt_is_empty_fast(db->i->brt) - ) { - //Try to suppress both rollback and recovery logs - DB_LOADER *loader; - DB *dbs[1] = {db}; - uint32_t db_flags[1] = {DB_NOOVERWRITE}; - uint32_t dbt_flags[1] = {0}; - uint32_t loader_flags = DB_PRELOCKED_WRITE; //Don't recursively prelock - DB_ENV *env = db->dbenv; - DB_TXN *child = NULL; - - { - // begin child - int rt = toku_txn_begin_internal(env, txn, &child, DB_TXN_NOSYNC, 1, true); - assert(rt==0); - } - - toku_ydb_unlock(); //Cannot hold ydb lock when creating loader - - int r_loader = env->create_loader(env, child, &loader, NULL, 1, dbs, db_flags, dbt_flags, loader_flags); - if (r_loader==0) { - r_loader = loader->set_error_callback(loader, NULL, NULL); - assert(r_loader==0); - r_loader = loader->set_poll_function(loader, NULL, NULL); - assert(r_loader==0); - // close the loader - r_loader = loader->close(loader); - if (r_loader==0) { - toku_brt_suppress_recovery_logs(db->i->brt, db_txn_struct_i(child)->tokutxn); - } - } - else if (r_loader != DB_LOCK_NOTGRANTED) { - //Lock not granted is not an error. - //It just means we cannot use the loader optimization. 
- assert(r==0); - r = r_loader; - } - if (r_loader == 0) { // commit - r = locked_txn_commit(child, 0); - assert(r==0); - STATUS_VALUE(YDB_LAYER_LOGSUPPRESS)++; // accountability - } - else { // abort - r = locked_txn_abort(child); - assert(r==0); - STATUS_VALUE(YDB_LAYER_LOGSUPPRESS_FAIL)++; // accountability - } - toku_ydb_lock(); //Reaquire ydb lock. - } - - return r; -} - -//TODO: DB_AUTO_COMMIT. -//TODO: Nowait only conditionally? -//TODO: NOSYNC change to SYNC if DB_ENV has something in set_flags -static inline int -toku_db_construct_autotxn(DB* db, DB_TXN **txn, BOOL* changed, BOOL force_auto_commit) { - assert(db && txn && changed); - DB_ENV* env = db->dbenv; - if (*txn || !(env->i->open_flags & DB_INIT_TXN)) { - *changed = FALSE; - return 0; - } - BOOL nosync = (BOOL)(!force_auto_commit && !(env->i->open_flags & DB_AUTO_COMMIT)); - u_int32_t txn_flags = DB_TXN_NOWAIT | (nosync ? DB_TXN_NOSYNC : 0); - int r = toku_txn_begin_internal(env, NULL, txn, txn_flags, 1, true); - if (r!=0) return r; - *changed = TRUE; - return 0; -} - -static inline int -toku_db_destruct_autotxn(DB_TXN *txn, int r, BOOL changed) { - if (!changed) return r; - if (r==0) return toku_txn_commit(txn, 0, NULL, NULL, false); - toku_txn_abort(txn, NULL, NULL, false); - return r; -} - -static int -locked_db_close(DB * db, u_int32_t flags) { - toku_ydb_lock(); - int r = toku_db_close(db, flags); - toku_ydb_unlock(); - return r; -} - -static inline int -autotxn_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) { - if (!txn && (db->dbenv->i->open_flags & DB_INIT_TXN)) { - return toku_ydb_do_error(db->dbenv, EINVAL, - "Cursors in a transaction environment must have transactions.\n"); - } - return toku_db_cursor_internal(db, txn, c, flags, 0); -} - -// Create a cursor on a db. -// Called without holding the ydb lock. 
-static int -toku_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) { - int r = autotxn_db_cursor(db, txn, c, flags); - return r; -} - -static inline int -autotxn_db_del(DB* db, DB_TXN* txn, DBT* key, u_int32_t flags) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r!=0) return r; - r = toku_db_del(db, txn, key, flags); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_del(DB * db, DB_TXN * txn, DBT * key, u_int32_t flags) { - toku_ydb_lock(); int r = autotxn_db_del(db, txn, key, flags); toku_ydb_unlock(); return r; -} - -static inline int -autotxn_db_get(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r!=0) return r; - r = toku_db_get(db, txn, key, data, flags); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) { - toku_ydb_lock(); int r = autotxn_db_get(db, txn, key, data, flags); toku_ydb_unlock(); return r; -} - -static inline int -autotxn_db_getf_set (DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r!=0) return r; - r = db_getf_set(db, txn, flags, key, f, extra); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_getf_set (DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { - toku_ydb_lock(); int r = autotxn_db_getf_set(db, txn, flags, key, f, extra); toku_ydb_unlock(); return r; -} - -static int -locked_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right) { - toku_ydb_lock(); - int r = toku_c_pre_acquire_range_lock(dbc, key_left, key_right); - toku_ydb_unlock(); - return r; -} - -static int -locked_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) { - toku_ydb_lock(); - 
int r = toku_db_pre_acquire_table_lock(db, txn, FALSE); - toku_ydb_unlock(); - return r; -} - -static int locked_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn) { - toku_ydb_lock(); - int r = toku_db_pre_acquire_fileops_lock(db, txn); - toku_ydb_unlock(); - return r; -} - -// truncate a database -// effect: remove all of the rows from a database -static int -toku_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) { - HANDLE_PANICKED_DB(db); - HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); - int r; - - u_int32_t unhandled_flags = flags; - int ignore_cursors = 0; - if (flags & DB_TRUNCATE_WITHCURSORS) { - ignore_cursors = 1; - unhandled_flags &= ~DB_TRUNCATE_WITHCURSORS; - } - - // dont support flags (yet) - if (unhandled_flags) - return EINVAL; - // dont support cursors unless explicitly told to - if (!ignore_cursors && toku_brt_get_cursor_count(db->i->brt) > 0) - return EINVAL; - - // acquire a table lock - if (txn) { - r = toku_db_pre_acquire_fileops_lock(db, txn); - if (r != 0) { - return r; - } - r = toku_db_pre_acquire_table_lock(db, txn, TRUE); - if (r != 0) { - return r; - } - } - - *row_count = 0; - - r = toku_brt_truncate(db->i->brt); - - return r; -} - -static inline int -autotxn_db_open(DB* db, DB_TXN* txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, (BOOL)((flags & DB_AUTO_COMMIT) != 0)); - if (r!=0) return r; - r = toku_db_open(db, txn, fname, dbname, dbtype, flags & ~DB_AUTO_COMMIT, mode); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { - toku_multi_operation_client_lock(); //Cannot begin checkpoint - toku_ydb_lock(); int r = autotxn_db_open(db, txn, fname, dbname, dbtype, flags, mode); toku_ydb_unlock(); - toku_multi_operation_client_unlock(); //Can now begin checkpoint - return 
r; -} - -static inline int -autotxn_db_put(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags) { - //{ unsigned i; printf("put %p keylen=%d key={", db, key->size); for(i=0; isize; i++) printf("%d,", ((char*)key->data)[i]); printf("} datalen=%d data={", data->size); for(i=0; isize; i++) printf("%d,", ((char*)data->data)[i]); printf("}\n"); } - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r!=0) return r; - r = toku_db_put(db, txn, key, data, flags); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_put(DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) { - int r = env_check_avail_fs_space(db->dbenv); - if (r == 0) { - toku_ydb_lock(); - r = autotxn_db_put(db, txn, key, data, flags); - toku_ydb_unlock(); - } - return r; -} - -static inline int -autotxn_db_update(DB *db, DB_TXN *txn, - const DBT *key, - const DBT *update_function_extra, - u_int32_t flags) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r != 0) { return r; } - r = toku_db_update(db, txn, key, update_function_extra, flags); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_update(DB *db, DB_TXN *txn, - const DBT *key, - const DBT *update_function_extra, - u_int32_t flags) { - int r = env_check_avail_fs_space(db->dbenv); - if (r != 0) { goto cleanup; } - toku_ydb_lock(); - r = autotxn_db_update(db, txn, key, update_function_extra, flags); - toku_ydb_unlock(); -cleanup: - return r; -} - -static inline int -autotxn_db_update_broadcast(DB *db, DB_TXN *txn, - const DBT *update_function_extra, - u_int32_t flags) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r != 0) { return r; } - r = toku_db_update_broadcast(db, txn, update_function_extra, flags); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_update_broadcast(DB *db, DB_TXN *txn, - const DBT *update_function_extra, - 
u_int32_t flags) { - int r = env_check_avail_fs_space(db->dbenv); - if (r != 0) { goto cleanup; } - toku_ydb_lock(); - r = autotxn_db_update_broadcast(db, txn, update_function_extra, flags); - toku_ydb_unlock(); -cleanup: - return r; -} - -static int -locked_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) { - toku_multi_operation_client_lock(); //Cannot begin checkpoint - toku_ydb_lock(); - int r = toku_db_remove(db, fname, dbname, flags); - toku_ydb_unlock(); - toku_multi_operation_client_unlock(); //Can now begin checkpoint - return r; -} - -static int -locked_db_rename(DB * db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) { - toku_multi_operation_client_lock(); //Cannot begin checkpoint - toku_ydb_lock(); - int r = toku_db_rename(db, namea, nameb, namec, flags); - toku_ydb_unlock(); - toku_multi_operation_client_unlock(); //Can now begin checkpoint - return r; -} - -static int -locked_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) { - toku_ydb_lock(); - int r = toku_db_change_descriptor(db, txn, descriptor, flags); - toku_ydb_unlock(); - return r; -} - -static void -locked_db_set_errfile (DB *db, FILE *errfile) { - db->dbenv->set_errfile(db->dbenv, errfile); -} - -static int -locked_db_set_flags(DB *db, u_int32_t flags) { - toku_ydb_lock(); int r = toku_db_set_flags(db, flags); toku_ydb_unlock(); return r; -} - -static int -locked_db_get_flags(DB *db, u_int32_t *flags) { - toku_ydb_lock(); int r = toku_db_get_flags(db, flags); toku_ydb_unlock(); return r; -} - -static int -locked_db_set_pagesize(DB *db, u_int32_t pagesize) { - toku_ydb_lock(); int r = toku_db_set_pagesize(db, pagesize); toku_ydb_unlock(); return r; -} - -static int -locked_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) { - toku_ydb_lock(); int r = toku_db_get_pagesize(db, pagesize_ptr); toku_ydb_unlock(); return r; -} - -static int -locked_db_set_readpagesize(DB *db, u_int32_t readpagesize) { - 
toku_ydb_lock(); int r = toku_db_set_readpagesize(db, readpagesize); toku_ydb_unlock(); return r; -} - -static int -locked_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) { - toku_ydb_lock(); int r = toku_db_get_readpagesize(db, readpagesize_ptr); toku_ydb_unlock(); return r; -} - -// TODO 2216 delete this -static int -locked_db_fd(DB * UU(db), int * UU(fdp)) { - // toku_ydb_lock(); - // int r = toku_db_fd(db, fdp); - // toku_ydb_unlock(); - // return r; - return 0; -} - - -static int locked_db_key_range64(DB* db, DB_TXN* txn, DBT* dbt, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) { - toku_ydb_lock(); int r = toku_db_key_range64(db, txn, dbt, less, equal, greater, is_exact); toku_ydb_unlock(); return r; -} - -static const DBT* toku_db_dbt_pos_infty(void) __attribute__((pure)); -static const DBT* -toku_db_dbt_pos_infty(void) { - return toku_lt_infinity; -} - -static const DBT* toku_db_dbt_neg_infty(void) __attribute__((pure)); -static const DBT* -toku_db_dbt_neg_infty(void) { - return toku_lt_neg_infinity; -} - -static int -locked_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) { - toku_checkpoint_safe_client_lock(); - toku_ydb_lock(); - int r = toku_db_truncate(db, txn, row_count, flags); - toku_ydb_unlock(); - toku_checkpoint_safe_client_unlock(); - return r; -} - -static int -toku_db_optimize(DB *db) { - HANDLE_PANICKED_DB(db); - int r = toku_brt_optimize(db->i->brt); - return r; -} - -static int -toku_db_hot_optimize(DB *db, - int (*progress_callback)(void *extra, float progress), - void *progress_extra) -{ - HANDLE_PANICKED_DB(db); - int r = 0; - - - // If we areunable to get a directory read lock, do nothing. - r = toku_brt_hot_optimize(db->i->brt, - progress_callback, - progress_extra); - - return r; -} - -static int -toku_db_flatten(DB *db, DB_TXN *txn) { - HANDLE_PANICKED_DB(db); - TOKUTXN ttxn = txn ? 
db_txn_struct_i(txn)->tokutxn : NULL; - int r = toku_brt_flatten(db->i->brt, ttxn); - return r; -} - -static inline int -autotxn_db_flatten(DB* db, DB_TXN* txn) { - BOOL changed; int r; - r = toku_db_construct_autotxn(db, &txn, &changed, FALSE); - if (r!=0) return r; - r = toku_db_flatten(db, txn); - return toku_db_destruct_autotxn(txn, r, changed); -} - -static int -locked_db_flatten(DB *db, DB_TXN *txn) { - toku_ydb_lock(); int r = autotxn_db_flatten(db, txn); toku_ydb_unlock(); return r; -} - -static int -locked_db_optimize(DB *db) { - toku_ydb_lock(); - int r = toku_db_optimize(db); - toku_ydb_unlock(); - return r; -} - -static int -locked_db_hot_optimize(DB *db, - int (*progress_callback)(void *extra, float progress), - void *progress_extra) -{ - int r = toku_db_hot_optimize(db, progress_callback, progress_extra); - return r; -} - -static int -db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) { - HANDLE_PANICKED_DB(db); - int r; - if (!db_opened(db)) - r = toku_ydb_do_error(db->dbenv, EINVAL, "Fragmentation report available only on open DBs.\n"); - else - r = toku_brt_get_fragmentation(db->i->brt, report); - return r; -} - -static int -locked_db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) { - toku_ydb_lock(); - int r = db_get_fragmentation(db, report); - toku_ydb_unlock(); - return r; -} - -int -toku_db_set_indexer(DB *db, DB_INDEXER * indexer) { - int r = 0; - if ( db->i->indexer != NULL && indexer != NULL ) { - // you are trying to overwrite a valid indexer - r = EINVAL; - } - else { - db->i->indexer = indexer; - } - return r; -} - -static int -locked_db_set_indexer(DB *db, DB_INDEXER *indexer) { - toku_ydb_lock(); int r = toku_db_set_indexer(db, indexer); toku_ydb_unlock(); return r; -} - -DB_INDEXER * -toku_db_get_indexer(DB *db) { - return db->i->indexer; -} - -static void -locked_db_get_indexer(DB *db, DB_INDEXER **indexer_ptr) { - toku_ydb_lock(); *indexer_ptr = toku_db_get_indexer(db); toku_ydb_unlock(); -} - -struct 
ydb_verify_context { - int (*progress_callback)(void *extra, float progress); - void *progress_extra; -}; - -static int -ydb_verify_progress_callback(void *extra, float progress) { - struct ydb_verify_context *context = (struct ydb_verify_context *) extra; - int r = 0; - if (context->progress_callback) { - r = context->progress_callback(context->progress_extra, progress); - } - return r; -} - -static int -locked_db_verify_with_progress(DB *db, int (*progress_callback)(void *extra, float progress), void *progress_extra, int verbose, int keep_going) { - struct ydb_verify_context context = { progress_callback, progress_extra }; - int r = toku_verify_brt_with_progress(db->i->brt, ydb_verify_progress_callback, &context, verbose, keep_going); - return r; -} - -static int -toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) { - int r; - - if (flags || env == NULL) - return EINVAL; - - if (!env_opened(env)) - return EINVAL; - - DB *MALLOC(result); - if (result == 0) { - return ENOMEM; - } - memset(result, 0, sizeof *result); - result->dbenv = env; - // methods that grab the ydb lock -#define SDB(name) result->name = locked_db_ ## name - SDB(key_range64); - SDB(close); - SDB(del); - SDB(get); - // SDB(key_range); - SDB(open); - SDB(put); - SDB(update); - SDB(update_broadcast); - SDB(remove); - SDB(rename); - SDB(change_descriptor); - SDB(set_errfile); - SDB(set_pagesize); - SDB(get_pagesize); - SDB(set_readpagesize); - SDB(get_readpagesize); - SDB(set_flags); - SDB(get_flags); - SDB(stat64); - SDB(fd); - SDB(pre_acquire_table_lock); - SDB(pre_acquire_fileops_lock); - SDB(truncate); - SDB(get_max_row_size); - SDB(getf_set); - SDB(flatten); - SDB(optimize); - SDB(hot_optimize); - SDB(get_fragmentation); - SDB(set_indexer); - SDB(get_indexer); - SDB(verify_with_progress); -#undef SDB - // unlocked methods - result->cursor = toku_db_cursor; - - result->dbt_pos_infty = toku_db_dbt_pos_infty; - result->dbt_neg_infty = toku_db_dbt_neg_infty; - MALLOC(result->i); - if 
(result->i == 0) { - toku_free(result); - return ENOMEM; - } - memset(result->i, 0, sizeof *result->i); - result->i->dict_id = DICTIONARY_ID_NONE; - result->i->opened = 0; - result->i->open_flags = 0; - result->i->open_mode = 0; - result->i->brt = 0; - result->i->indexer = NULL; - result->i->refs = 1; - toku_list_init(&result->i->dbs_that_must_close_before_abort); - r = toku_brt_create(&result->i->brt); - if (r != 0) { - toku_free(result->i); - toku_free(result); - return r; - } - *db = result; - return 0; -} - int DB_CREATE_FUN (DB ** db, DB_ENV * env, u_int32_t flags) { toku_ydb_lock(); @@ -6392,139 +3319,6 @@ db_version(int *major, int *minor, int *patch) { #endif } -int -db_env_set_func_fsync (int (*fsync_function)(int)) { - return toku_set_func_fsync(fsync_function); -} - -int -db_env_set_func_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) { - return toku_set_func_pwrite(pwrite_function); -} - -int -db_env_set_func_full_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) { - return toku_set_func_full_pwrite(pwrite_function); -} - -int -db_env_set_func_write (ssize_t (*write_function)(int, const void *, size_t)) { - return toku_set_func_write(write_function); -} - -int -db_env_set_func_full_write (ssize_t (*write_function)(int, const void *, size_t)) { - return toku_set_func_full_write(write_function); -} - -int -db_env_set_func_fdopen (FILE * (*fdopen_function)(int, const char *)) { - return toku_set_func_fdopen(fdopen_function); -} - -int -db_env_set_func_fopen (FILE * (*fopen_function)(const char *, const char *)) { - return toku_set_func_fopen(fopen_function); -} - -int -db_env_set_func_open (int (*open_function)(const char *, int, int)) { - return toku_set_func_open(open_function); -} - -int -db_env_set_func_fclose (int (*fclose_function)(FILE*)) { - return toku_set_func_fclose(fclose_function); -} - -int -db_env_set_func_pread (ssize_t (*fun)(int, void *, size_t, off_t)) { - return 
toku_set_func_pread(fun); -} - -void -db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) { - brtloader_set_os_fwrite(fwrite_fun); -} - -int -db_env_set_func_malloc (void *(*f)(size_t)) { - toku_set_func_malloc(f); - return 0; -} - -int -db_env_set_func_realloc (void *(*f)(void*, size_t)) { - toku_set_func_realloc(f); - return 0; -} - -int -db_env_set_func_free (void (*f)(void*)) { - toku_set_func_free(f); - return 0; -} - - -// Got to call dlmalloc, or else it won't get included. -void -setup_dlmalloc (void) { - db_env_set_func_malloc(dlmalloc); - db_env_set_func_realloc(dlrealloc); - db_env_set_func_free(dlfree); -} - -// For test purposes only. -// With this interface, all checkpoint users get the same callbacks and the same extras. -void -db_env_set_checkpoint_callback (void (*callback_f)(void*), void* extra) { - toku_checkpoint_safe_client_lock(); - checkpoint_callback_f = callback_f; - checkpoint_callback_extra = extra; - toku_checkpoint_safe_client_unlock(); - //printf("set callback = %p, extra = %p\n", callback_f, extra); -} - -void -db_env_set_checkpoint_callback2 (void (*callback_f)(void*), void* extra) { - toku_checkpoint_safe_client_lock(); - checkpoint_callback2_f = callback_f; - checkpoint_callback2_extra = extra; - toku_checkpoint_safe_client_unlock(); - //printf("set callback2 = %p, extra2 = %p\n", callback2_f, extra2); -} - -void -db_env_set_recover_callback (void (*callback_f)(void*), void* extra) { - toku_recover_set_callback(callback_f, extra); -} - -void -db_env_set_recover_callback2 (void (*callback_f)(void*), void* extra) { - toku_recover_set_callback2(callback_f, extra); -} - -void -db_env_set_flusher_thread_callback(void (*callback_f)(int, void*), void* extra) { - toku_flusher_thread_set_callback(callback_f, extra); -} - -void -db_env_set_loader_size_factor (uint32_t factor) { - toku_brtloader_set_size_factor(factor); -} - -void -db_env_set_mvcc_garbage_collection_verification(u_int32_t verification_mode) { 
- garbage_collection_debug = (verification_mode != 0); -} - -// Purpose: allow test programs that expect to fail to suppress engine status output on failed assert. -void -db_env_enable_engine_status(uint32_t enable) { - engine_status_enable = enable; -} - // HACK: To ensure toku_pthread_yield gets included in the .so // non-static would require a prototype in a header // static (since unused) would give a warning @@ -6537,115 +3331,12 @@ include_toku_pthread_yield (void) { // For test purposes only, translate dname to iname +// YDB lock is NOT held when this function is called, +// as it is called by user static int env_get_iname(DB_ENV* env, DBT* dname_dbt, DBT* iname_dbt) { - toku_ydb_lock(); DB *directory = env->i->directory; int r = autotxn_db_get(directory, NULL, dname_dbt, iname_dbt, DB_SERIALIZABLE|DB_PRELOCKED); // allocates memory for iname - toku_ydb_unlock(); - return r; -} - -/* Following functions (ydb_load_xxx()) are used by loader: - */ - - -// When the loader is created, it makes this call. -// For each dictionary to be loaded, replace old iname in directory -// with a newly generated iname. This will also take a write lock -// on the directory entries. The write lock will be released when -// the transaction of the loader is completed. -// If the transaction commits, the new inames are in place. -// If the transaction aborts, the old inames will be restored. -// The new inames are returned to the caller. -// It is the caller's responsibility to free them. -// If "mark_as_loader" is true, then include a mark in the iname -// to indicate that the file is created by the brt loader. -// Return 0 on success (could fail if write lock not available). 
-int -ydb_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[N], char * new_inames_in_env[N], LSN *load_lsn, BOOL mark_as_loader) { - int rval; - int i; - - int using_txns = env->i->open_flags & DB_INIT_TXN; - DB_TXN * child = NULL; - TXNID xid = 0; - DBT dname_dbt; // holds dname - DBT iname_dbt; // holds new iname - - char * mark; - - if (mark_as_loader) - mark = "B"; - else - mark = "P"; - - for (i=0; itokutxn); - } - for (i = 0; i < N; i++) { - char * dname = dbs[i]->i->dname; - toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1); - // now create new iname - char hint[strlen(dname) + 1]; - create_iname_hint(dname, hint); - char * new_iname = create_iname(env, xid, hint, mark, i); // allocates memory for iname_in_env - new_inames_in_env[i] = new_iname; - toku_fill_dbt(&iname_dbt, new_iname, strlen(new_iname) + 1); // iname_in_env goes in directory - rval = toku_db_put(env->i->directory, child, &dname_dbt, &iname_dbt, 0); - if (rval) break; - } - - // Generate load log entries. - if (!rval && using_txns) { - TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; - int do_fsync = 0; - LSN *get_lsn = NULL; - for (i = 0; i < N; i++) { - BRT brt = dbs[i]->i->brt; - //Fsync is necessary for the last one only. - if (i==N-1) { - do_fsync = 1; //We only need a single fsync of logs. - get_lsn = load_lsn; //Set pointer to capture the last lsn. 
- } - rval = toku_brt_load(brt, ttxn, new_inames_in_env[i], do_fsync, get_lsn); - if (rval) break; - } - } - - if (using_txns) { - // close txn - if (rval == 0) { // all well so far, commit child - rval = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false); - assert(rval==0); - } - else { // abort child - int r2 = toku_txn_abort(child, NULL, NULL, false); - assert(r2==0); - for (i=0; idbenv->i->directory, dbtxn, &dname_dbt, &iname_dbt, DB_SERIALIZABLE); // allocates memory for iname - assert(r==0); + assert_zero(r); new_iname_in_env = iname_dbt.data; r = toku_dictionary_redirect(new_iname_in_env, brt, tokutxn); @@ -6689,20 +3380,5 @@ toku_test_get_checkpointing_user_data_status (void) { return toku_cachetable_get_checkpointing_user_data_status(); } -// acquire a point write lock on the key for a given txn. -// this does not block the calling thread. -int -toku_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn) { - DB_TXN *txn = toku_txn_get_container_db_txn(tokutxn); - DB_TXN *txn_anc = toku_txn_ancestor(txn); - int r = toku_txn_add_lt(txn_anc, db->i->lt); - if (r == 0) { - TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn); - r = toku_lt_acquire_write_lock(db->i->lt, db, txn_anc_id, key); - } - return r; -} - - #undef STATUS_VALUE #undef PERSISTENT_UPGRADE_STATUS_VALUE diff --git a/src/ydb.h b/src/ydb.h index 7b29d020a6a..5c626e6b053 100644 --- a/src/ydb.h +++ b/src/ydb.h @@ -14,9 +14,6 @@ int toku_ydb_init(void); // Called when the ydb library is unloaded. int toku_ydb_destroy(void); -// Called to use dlmalloc functions. 
-void setup_dlmalloc(void) __attribute__((__visibility__("default"))); - // db_env_create for the trace library int db_env_create_toku10(DB_ENV **, u_int32_t) __attribute__((__visibility__("default"))); diff --git a/src/ydb_cursor.c b/src/ydb_cursor.c new file mode 100755 index 00000000000..0ca1ccaa698 --- /dev/null +++ b/src/ydb_cursor.c @@ -0,0 +1,916 @@ +/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +#ident "Copyright (c) 2007-2009 Tokutek Inc. All rights reserved." + +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." + +#include +#include +#include +#include +#include "toku_assert.h" +#include "ydb-internal.h" +#include "ydb_cursor.h" +#include "ydb_row_lock.h" + +static YDB_C_LAYER_STATUS_S ydb_c_layer_status; +#ifdef STATUS_VALUE +#undef STATUS_VALUE +#endif +#define STATUS_VALUE(x) ydb_c_layer_status.status[x].value.num + +#define STATUS_INIT(k,t,l) { \ + ydb_c_layer_status.status[k].keyname = #k; \ + ydb_c_layer_status.status[k].type = t; \ + ydb_c_layer_status.status[k].legend = l; \ + } + +static void +ydb_c_layer_status_init (void) { + // Note, this function initializes the keyname, type, and legend fields. + // Value fields are initialized to zero by compiler. + + STATUS_INIT(YDB_C_LAYER_NUM_POINT_QUERIES, UINT64, "dictionary point queries"); + STATUS_INIT(YDB_C_LAYER_NUM_SEQUENTIAL_QUERIES, UINT64, "dictionary sequential queries"); + ydb_c_layer_status.initialized = true; +} +#undef STATUS_INIT + +void +ydb_c_layer_get_status(YDB_C_LAYER_STATUS statp) { + if (!ydb_c_layer_status.initialized) + ydb_c_layer_status_init(); + *statp = ydb_c_layer_status; +} + + +/* lightweight cursor methods. 
*/ +static int toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra); + +//Get the main portion of a cursor flag (excluding the bitwise or'd components). +static int +get_main_cursor_flag(u_int32_t flags) { + return flags & DB_OPFLAGS_MASK; +} + +static int +get_nonmain_cursor_flags(u_int32_t flags) { + return flags & ~(DB_OPFLAGS_MASK); +} + +static inline BOOL +toku_c_uninitialized(DBC* c) { + return toku_brt_cursor_uninitialized(dbc_struct_i(c)->c); +} + +typedef struct query_context_wrapped_t { + DBT *key; + DBT *val; + struct simple_dbt *skey; + struct simple_dbt *sval; +} *QUERY_CONTEXT_WRAPPED, QUERY_CONTEXT_WRAPPED_S; + +static inline void +query_context_wrapped_init(QUERY_CONTEXT_WRAPPED context, DBC *c, DBT *key, DBT *val) { + context->key = key; + context->val = val; + context->skey = dbc_struct_i(c)->skey; + context->sval = dbc_struct_i(c)->sval; +} + +static int +c_get_wrapper_callback(DBT const *key, DBT const *val, void *extra) { + QUERY_CONTEXT_WRAPPED context = extra; + int r; + r = toku_dbt_set(key->size, key->data, context->key, context->skey); + if (r==0) r = toku_dbt_set(val->size, val->data, context->val, context->sval); + return r; +} + +static int +toku_c_get_current_unconditional(DBC* c, u_int32_t flags, DBT* key, DBT* val) { + int r; + QUERY_CONTEXT_WRAPPED_S context; + query_context_wrapped_init(&context, c, key, val); + r = toku_c_getf_current_binding(c, flags, c_get_wrapper_callback, &context); + return r; +} + +static inline u_int32_t +get_cursor_prelocked_flags(u_int32_t flags, DBC* dbc) { + u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); + + //DB_READ_UNCOMMITTED and DB_READ_COMMITTED transactions 'own' all read locks for user-data dictionaries. 
+ if (dbc_struct_i(dbc)->iso != TOKU_ISO_SERIALIZABLE) { + lock_flags |= DB_PRELOCKED; + } + return lock_flags; +} + +//This is the user level callback function given to ydb layer functions like +//toku_c_getf_first + +typedef struct query_context_base_t { + BRT_CURSOR c; + DB_TXN *txn; + DB *db; + YDB_CALLBACK_FUNCTION f; + void *f_extra; + int r_user_callback; + BOOL do_locking; + BOOL is_write_op; + toku_lock_request lock_request; +} *QUERY_CONTEXT_BASE, QUERY_CONTEXT_BASE_S; + +typedef struct query_context_t { + QUERY_CONTEXT_BASE_S base; +} *QUERY_CONTEXT, QUERY_CONTEXT_S; + +typedef struct query_context_with_input_t { + QUERY_CONTEXT_BASE_S base; + DBT *input_key; + DBT *input_val; +} *QUERY_CONTEXT_WITH_INPUT, QUERY_CONTEXT_WITH_INPUT_S; + +static void +query_context_base_init(QUERY_CONTEXT_BASE context, DBC *c, u_int32_t flag, BOOL is_write_op, YDB_CALLBACK_FUNCTION f, void *extra) { + context->c = dbc_struct_i(c)->c; + context->txn = dbc_struct_i(c)->txn; + context->db = c->dbp; + context->f = f; + context->f_extra = extra; + context->is_write_op = is_write_op; + u_int32_t lock_flags = get_cursor_prelocked_flags(flag, c); + if (context->is_write_op) + lock_flags &= DB_PRELOCKED_WRITE; // Only care about whether already locked for write + context->do_locking = (BOOL)(context->db->i->lt!=NULL && !(lock_flags & (DB_PRELOCKED|DB_PRELOCKED_WRITE))); + context->r_user_callback = 0; + toku_lock_request_default_init(&context->lock_request); +} + +static void +query_context_base_destroy(QUERY_CONTEXT_BASE context) { + toku_lock_request_destroy(&context->lock_request); +} + +static void +query_context_init_read(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + BOOL is_write = FALSE; + query_context_base_init(&context->base, c, flag, is_write, f, extra); +} + +static void +query_context_init_write(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + BOOL is_write = TRUE; + 
query_context_base_init(&context->base, c, flag, is_write, f, extra); +} + +static void +query_context_with_input_init(QUERY_CONTEXT_WITH_INPUT context, DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) { + // grab write locks if the DB_RMW flag is set or the cursor was created with the DB_RMW flag + BOOL is_write = ((flag & DB_RMW) != 0) || dbc_struct_i(c)->rmw; + query_context_base_init(&context->base, c, flag, is_write, f, extra); + context->input_key = key; + context->input_val = val; +} + +static int c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static void +c_query_context_init(QUERY_CONTEXT context, DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + BOOL is_write_op = FALSE; + // grab write locks if the DB_RMW flag is set or the cursor was created with the DB_RMW flag + if ((flag & DB_RMW) || dbc_struct_i(c)->rmw) + is_write_op = TRUE; + if (is_write_op) + query_context_init_write(context, c, flag, f, extra); + else + query_context_init_read(context, c, flag, f, extra); +} + +static void +c_query_context_destroy(QUERY_CONTEXT context) { + query_context_base_destroy(&context->base); +} + +static int +toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + STATUS_VALUE(YDB_C_LAYER_NUM_POINT_QUERIES)++; + int r = 0; + QUERY_CONTEXT_S context; //Describes the context of this query. 
+ c_query_context_init(&context, c, flag, f, extra); + while (r == 0) { + //toku_brt_cursor_first will call c_getf_first_callback(..., context) (if query is successful) + r = toku_brt_cursor_first(dbc_struct_i(c)->c, c_getf_first_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + c_query_context_destroy(&context); + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) +static int +c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + DBT found_key = { .data = (void *) key, .size = keylen }; + + if (context->do_locking) { + const DBT *left_key = toku_lt_neg_infinity; + const DBT *right_key = key != NULL ? &found_key : toku_lt_infinity; + r = start_range_lock(context->db, context->txn, left_key, right_key, + context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. 
+ if (r==0 && key!=NULL && !lock_only) { + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_first + return r; +} + +static int c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static int +toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + STATUS_VALUE(YDB_C_LAYER_NUM_POINT_QUERIES)++; + int r = 0; + QUERY_CONTEXT_S context; //Describes the context of this query. + c_query_context_init(&context, c, flag, f, extra); + while (r == 0) { + //toku_brt_cursor_last will call c_getf_last_callback(..., context) (if query is successful) + r = toku_brt_cursor_last(dbc_struct_i(c)->c, c_getf_last_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + c_query_context_destroy(&context); + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) +static int +c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + DBT found_key = { .data = (void *) key, .size = keylen }; + + if (context->do_locking) { + const DBT *left_key = key != NULL ? &found_key : toku_lt_neg_infinity; + const DBT *right_key = toku_lt_infinity; + r = start_range_lock(context->db, context->txn, left_key, right_key, + context->is_write_op ? 
LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. + if (r==0 && key!=NULL && !lock_only) { + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_last + return r; +} + +static int c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static int +toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + int r; + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + if (toku_c_uninitialized(c)) + r = toku_c_getf_first(c, flag, f, extra); + else { + r = 0; + QUERY_CONTEXT_S context; //Describes the context of this query. + c_query_context_init(&context, c, flag, f, extra); + while (r == 0) { + //toku_brt_cursor_next will call c_getf_next_callback(..., context) (if query is successful) + r = toku_brt_cursor_next(dbc_struct_i(c)->c, c_getf_next_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + c_query_context_destroy(&context); + } + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) 
+static int +c_getf_next_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + + DBT found_key = { .data = (void *) key, .size = keylen }; + + if (context->do_locking) { + const DBT *prevkey, *prevval; + toku_brt_cursor_peek(context->c, &prevkey, &prevval); + const DBT *left_key = prevkey; + const DBT *right_key = key != NULL ? &found_key : toku_lt_infinity; + r = start_range_lock(context->db, context->txn, left_key, right_key, + context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. + if (r==0 && key!=NULL && !lock_only) { + STATUS_VALUE(YDB_C_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_next + return r; +} + +static int c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static int +toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + int r; + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + if (toku_c_uninitialized(c)) + r = toku_c_getf_last(c, flag, f, extra); + else { + r = 0; + QUERY_CONTEXT_S context; //Describes the context of this query. 
+ c_query_context_init(&context, c, flag, f, extra); + while (r == 0) { + //toku_brt_cursor_prev will call c_getf_prev_callback(..., context) (if query is successful) + r = toku_brt_cursor_prev(dbc_struct_i(c)->c, c_getf_prev_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + c_query_context_destroy(&context); + } + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) +static int +c_getf_prev_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + DBT found_key = { .data = (void *) key, .size = keylen }; + + if (context->do_locking) { + const DBT *prevkey, *prevval; + toku_brt_cursor_peek(context->c, &prevkey, &prevval); + const DBT *left_key = key != NULL ? &found_key : toku_lt_neg_infinity; + const DBT *right_key = prevkey; + r = start_range_lock(context->db, context->txn, left_key, right_key, + context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. 
+ if (r==0 && key!=NULL && !lock_only) { + STATUS_VALUE(YDB_C_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_prev + return r; +} + +static int c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static int +toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + + QUERY_CONTEXT_S context; //Describes the context of this query. + STATUS_VALUE(YDB_C_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability + c_query_context_init(&context, c, flag, f, extra); + //toku_brt_cursor_current will call c_getf_current_callback(..., context) (if query is successful) + int r = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT, c_getf_current_callback, &context); + if (r == TOKUDB_USER_CALLBACK_ERROR) r = context.base.r_user_callback; + c_query_context_destroy(&context); + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) +static int +c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + + //Call application-layer callback if found. 
+ if (key!=NULL && !lock_only) { + DBT found_key = { .data = (void *) key, .size = keylen }; + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } else + r = 0; + + //Give brt-layer an error (if any) to return from toku_brt_cursor_current + return r; +} + +static int +toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + + QUERY_CONTEXT_S context; //Describes the context of this query. + STATUS_VALUE(YDB_C_LAYER_NUM_SEQUENTIAL_QUERIES)++; // accountability + c_query_context_init(&context, c, flag, f, extra); + //toku_brt_cursor_current will call c_getf_current_callback(..., context) (if query is successful) + int r = toku_brt_cursor_current(dbc_struct_i(c)->c, DB_CURRENT_BINDING, c_getf_current_callback, &context); + if (r == TOKUDB_USER_CALLBACK_ERROR) r = context.base.r_user_callback; + c_query_context_destroy(&context); + return r; +} + +static int c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +int +toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + + int r = 0; + QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query. 
+ STATUS_VALUE(YDB_C_LAYER_NUM_POINT_QUERIES)++; + query_context_with_input_init(&context, c, flag, key, NULL, f, extra); + while (r == 0) { + //toku_brt_cursor_set will call c_getf_set_callback(..., context) (if query is successful) + r = toku_brt_cursor_set(dbc_struct_i(c)->c, key, c_getf_set_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + query_context_base_destroy(&context.base); + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) +static int +c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT_WITH_INPUT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + + //Lock: + // left(key,val) = (input_key, -infinity) + // right(key,val) = (input_key, found ? found_val : infinity) + if (context->do_locking) { + r = start_range_lock(context->db, context->txn, super_context->input_key, super_context->input_key, + context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. 
+ if (r==0 && key!=NULL && !lock_only) { + DBT found_key = { .data = (void *) key, .size = keylen }; + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_set + return r; +} + +static int c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static int +toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + + int r = 0; + QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query. + STATUS_VALUE(YDB_C_LAYER_NUM_POINT_QUERIES)++; + query_context_with_input_init(&context, c, flag, key, NULL, f, extra); + while (r == 0) { + //toku_brt_cursor_set_range will call c_getf_set_range_callback(..., context) (if query is successful) + r = toku_brt_cursor_set_range(dbc_struct_i(c)->c, key, c_getf_set_range_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + query_context_base_destroy(&context.base); + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) +static int +c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT_WITH_INPUT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + DBT found_key = { .data = (void *) key, .size = keylen }; + + //Lock: + // left(key,val) = (input_key, -infinity) + // right(key) = found ? found_key : infinity + // right(val) = found ? 
found_val : infinity + if (context->do_locking) { + const DBT *left_key = super_context->input_key; + const DBT *right_key = key != NULL ? &found_key : toku_lt_infinity; + r = start_range_lock(context->db, context->txn, left_key, right_key, + context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. + if (r==0 && key!=NULL && !lock_only) { + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_set_range + return r; +} + +static int c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool); + +static int +toku_c_getf_set_range_reverse(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + + int r = 0; + QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query. + STATUS_VALUE(YDB_C_LAYER_NUM_POINT_QUERIES)++; + query_context_with_input_init(&context, c, flag, key, NULL, f, extra); + while (r == 0) { + //toku_brt_cursor_set_range_reverse will call c_getf_set_range_reverse_callback(..., context) (if query is successful) + r = toku_brt_cursor_set_range_reverse(dbc_struct_i(c)->c, key, c_getf_set_range_reverse_callback, &context); + if (r == DB_LOCK_NOTGRANTED) + r = toku_lock_request_wait_with_default_timeout(&context.base.lock_request, c->dbp->i->lt); + else { + if (r == TOKUDB_USER_CALLBACK_ERROR) + r = context.base.r_user_callback; + break; + } + } + query_context_base_destroy(&context.base); + return r; +} + +//result is the result of the query (i.e. 0 means found, DB_NOTFOUND, etc..) 
+static int +c_getf_set_range_reverse_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val, void *extra, bool lock_only) { + QUERY_CONTEXT_WITH_INPUT super_context = extra; + QUERY_CONTEXT_BASE context = &super_context->base; + + int r; + DBT found_key = { .data = (void *) key, .size = keylen }; + + //Lock: + // left(key) = found ? found_key : -infinity + // left(val) = found ? found_val : -infinity + // right(key,val) = (input_key, infinity) + if (context->do_locking) { + const DBT *left_key = key != NULL ? &found_key : toku_lt_neg_infinity; + const DBT *right_key = super_context->input_key; + r = start_range_lock(context->db, context->txn, left_key, right_key, + context->is_write_op ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ, &context->lock_request); + } else + r = 0; + + //Call application-layer callback if found and locks were successfully obtained. + if (r==0 && key!=NULL && !lock_only) { + DBT found_val = { .data = (void *) val, .size = vallen }; + context->r_user_callback = context->f(&found_key, &found_val, context->f_extra); + r = context->r_user_callback; + } + + //Give brt-layer an error (if any) to return from toku_brt_cursor_set_range_reverse + return r; +} + +// Close a cursor. +// Does not require the ydb lock held when called. +int +toku_c_close(DBC * c) { + HANDLE_PANICKED_DB(c->dbp); + HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c); + int r = toku_brt_cursor_close(dbc_struct_i(c)->c); + toku_sdbt_cleanup(&dbc_struct_i(c)->skey_s); + toku_sdbt_cleanup(&dbc_struct_i(c)->sval_s); +#if !TOKUDB_NATIVE_H + toku_free(dbc_struct_i(c)); +#endif + toku_free(c); + return r; +} + +// these next two static functions are defined +// both here and ydb.c. We should find a good +// place for them. 
+// No-op row callback: ignores its arguments and returns 0.
+// toku_c_count uses it when it only needs to know whether a key exists.
+static int
+ydb_getf_do_nothing(DBT const* UU(key), DBT const* UU(val), void* UU(extra)) {
+    return 0;
+}
+
+// Zero *dbt, set its flags to DB_DBT_REALLOC and return dbt.
+static inline DBT*
+init_dbt_realloc(DBT *dbt) {
+    memset(dbt, 0, sizeof(*dbt));
+    dbt->flags = DB_DBT_REALLOC;
+    return dbt;
+}
+
+// Return the number of entries whose key matches the key currently
+// pointed to by the brt cursor.
+//   flags: only the bits reported by get_cursor_prelocked_flags are accepted; anything else gives EINVAL.
+//   count: written only when 0 is returned; set to 1 if the probe lookup succeeds, 0 if it fails
+//          for any reason (the probe's own error code is not propagated).
+// Returns 0 on success, EINVAL for bad flags, or the error from reading the current key
+// or from creating the probe cursor.
+static int
+toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) {
+    HANDLE_PANICKED_DB(cursor->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(cursor);
+    int r;
+    DBC *count_cursor = 0;
+    DBT currentkey;
+
+    init_dbt_realloc(&currentkey);
+    u_int32_t lock_flags = get_cursor_prelocked_flags(flags, cursor);
+    flags &= ~lock_flags;
+    if (flags != 0) {
+        r = EINVAL; goto finish;
+    }
+
+    // Fetch the key under the cursor into currentkey.
+    r = toku_c_get_current_unconditional(cursor, lock_flags, &currentkey, NULL);
+    if (r != 0) goto finish;
+
+    //TODO: Optimization
+    //if (do_locking) {
+    //   do a lock from currentkey,-infinity to currentkey,infinity
+    //   lock_flags |= DB_PRELOCKED
+    //}
+
+    // Probe with a second cursor so the caller's cursor is not used for the lookup.
+    r = toku_db_cursor_internal(cursor->dbp, dbc_struct_i(cursor)->txn, &count_cursor, DBC_DISABLE_PREFETCHING, 0);
+    if (r != 0) goto finish;
+
+    r = toku_c_getf_set(count_cursor, lock_flags, &currentkey, ydb_getf_do_nothing, NULL);
+    if (r==0) {
+        *count = 1; // there is a key, so the count is one (since we don't have DUP dbs anymore, the only answers are 0 or 1).
+    } else {
+        *count = 0;
+    }
+    r = 0;
+finish:
+    if (currentkey.data) toku_free(currentkey.data);
+    if (count_cursor) {
+        int rr = toku_c_close(count_cursor); assert(rr == 0);
+    }
+    return r;
+}
+
+// Record the range [key_left, key_right] on the brt cursor and, when one is needed, take a range lock on it.
+// No lock is requested when the db has no lock tree, the cursor has no txn, or the cursor is
+// neither rmw nor serializable. rmw cursors take a write lock, all others a read lock.
+// Returns 0 or the result of get_range_lock.
+static int
+toku_c_pre_acquire_range_lock(DBC *dbc, const DBT *key_left, const DBT *key_right) {
+    DB *db = dbc->dbp;
+    DB_TXN *txn = dbc_struct_i(dbc)->txn;
+    HANDLE_PANICKED_DB(db);
+    toku_brt_cursor_set_range_lock(dbc_struct_i(dbc)->c, key_left, key_right,
+                                   (key_left == toku_lt_neg_infinity),
+                                   (key_right == toku_lt_infinity));
+    if (!db->i->lt || !txn)
+        return 0;
+    //READ_UNCOMMITTED and READ_COMMITTED transactions do not need read locks.
+    if (!dbc_struct_i(dbc)->rmw && dbc_struct_i(dbc)->iso != TOKU_ISO_SERIALIZABLE)
+        return 0;
+
+    toku_lock_type lock_type = dbc_struct_i(dbc)->rmw ? LOCK_REQUEST_WRITE : LOCK_REQUEST_READ;
+    int r = get_range_lock(db, txn, key_left, key_right, lock_type);
+    return r;
+}
+
+// DBC->c_get: splits flag into a main flag and the remaining flags, then dispatches on the
+// main flag to the matching toku_c_getf_* call with c_get_wrapper_callback as the row callback.
+// Returns that call's result, or EINVAL when the main flag is not one of the cases below.
+int
+toku_c_get(DBC* c, DBT* key, DBT* val, u_int32_t flag) {
+    //This function exists for legacy (test compatibility) purposes/parity with bdb.
+    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
+
+    u_int32_t main_flag = get_main_cursor_flag(flag);
+    u_int32_t remaining_flags = get_nonmain_cursor_flags(flag);
+    int r;
+    QUERY_CONTEXT_WRAPPED_S context;
+    //Passing in NULL for a key or val means that it is NOT an output.
+    // Both key and val are output:
+    //   query_context_wrapped_init(&context, c, key, val);
+    // Val is output, key is not:
+    //   query_context_wrapped_init(&context, c, NULL, val);
+    // Neither key nor val are output:
+    //   query_context_wrapped_init(&context, c, NULL, NULL); // Used for DB_GET_BOTH
+    switch (main_flag) {
+        case (DB_FIRST):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_first(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+        case (DB_LAST):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_last(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+        case (DB_NEXT):
+        case (DB_NEXT_NODUP):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_next(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+        case (DB_PREV):
+        case (DB_PREV_NODUP):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_prev(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+#ifdef DB_PREV_DUP
+        case (DB_PREV_DUP):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_prev_dup(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+#endif
+        case (DB_CURRENT):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_current(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+        case (DB_CURRENT_BINDING):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_current_binding(c, remaining_flags, c_get_wrapper_callback, &context);
+            break;
+
+        case (DB_SET):
+            // key is an input only here, so it is not registered as an output.
+            query_context_wrapped_init(&context, c, NULL, val);
+            r = toku_c_getf_set(c, remaining_flags, key, c_get_wrapper_callback, &context);
+            break;
+        case (DB_SET_RANGE):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_set_range(c, remaining_flags, key, c_get_wrapper_callback, &context);
+            break;
+        case (DB_SET_RANGE_REVERSE):
+            query_context_wrapped_init(&context, c, key, val);
+            r = toku_c_getf_set_range_reverse(c, remaining_flags, key, c_get_wrapper_callback, &context);
+            break;
+        default:
+            r = EINVAL;
+            break;
+    }
+    return r;
+}
+
+// Allocate and initialise a cursor on db.
+//   flags: may contain only DB_SERIALIZABLE, DB_INHERIT_ISOLATION, DB_RMW and
+//          DBC_DISABLE_PREFETCHING; anything else is reported as EINVAL through toku_ydb_do_error.
+//   is_temporary_cursor: non-zero makes the cursor use the db's skey/sval buffers instead of its own.
+// Isolation is serializable when DB_SERIALIZABLE is set or there is no txn, otherwise the txn's level.
+// On success stores the cursor in *c and returns 0. If toku_brt_cursor fails, the cursor is freed,
+// *c is left untouched and the error is returned.
+int
+toku_db_cursor_internal(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int is_temporary_cursor) {
+    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
+    DB_ENV* env = db->dbenv;
+
+    if (flags & ~(DB_SERIALIZABLE | DB_INHERIT_ISOLATION | DB_RMW | DBC_DISABLE_PREFETCHING)) {
+        return toku_ydb_do_error(
+            env,
+            EINVAL,
+            "Invalid flags set for toku_db_cursor\n"
+            );
+    }
+
+    int r = 0;
+
+    struct __toku_dbc_external *XMALLOC(eresult); // so the internal stuff is stuck on the end
+    memset(eresult, 0, sizeof(*eresult));
+    DBC *result = &eresult->external_part;
+
+    // these methods DO NOT grab the ydb lock
+#define SCRS(name) result->name = toku_ ## name
+    SCRS(c_get);
+    SCRS(c_count);
+    SCRS(c_getf_first);
+    SCRS(c_getf_last);
+    SCRS(c_getf_next);
+    SCRS(c_getf_prev);
+    SCRS(c_getf_current);
+    SCRS(c_getf_current_binding);
+    SCRS(c_getf_set);
+    SCRS(c_getf_set_range);
+    SCRS(c_getf_set_range_reverse);
+    SCRS(c_pre_acquire_range_lock);
+    SCRS(c_close);
+#undef SCRS
+
+#if !TOKUDB_NATIVE_H
+    MALLOC(result->i); // otherwise it is allocated as part of result->ii
+    assert(result->i);
+#endif
+    result->dbp = db;
+
+    dbc_struct_i(result)->txn = txn;
+    dbc_struct_i(result)->skey_s = (struct simple_dbt){0,0};
+    dbc_struct_i(result)->sval_s = (struct simple_dbt){0,0};
+    if (is_temporary_cursor) {
+        dbc_struct_i(result)->skey = &db->i->skey;
+        dbc_struct_i(result)->sval = &db->i->sval;
+    } else {
+        dbc_struct_i(result)->skey = &dbc_struct_i(result)->skey_s;
+        dbc_struct_i(result)->sval = &dbc_struct_i(result)->sval_s;
+    }
+    if (flags & DB_SERIALIZABLE) {
+        dbc_struct_i(result)->iso = TOKU_ISO_SERIALIZABLE;
+    } else {
+        dbc_struct_i(result)->iso = txn ? db_txn_struct_i(txn)->iso : TOKU_ISO_SERIALIZABLE;
+    }
+    dbc_struct_i(result)->rmw = (flags & DB_RMW) != 0;
+    // Snapshot reads apply only inside a txn whose level is READ_COMMITTED or SNAPSHOT.
+    BOOL is_snapshot_read = FALSE;
+    if (txn) {
+        is_snapshot_read = (dbc_struct_i(result)->iso == TOKU_ISO_READ_COMMITTED ||
+                            dbc_struct_i(result)->iso == TOKU_ISO_SNAPSHOT);
+    }
+    r = toku_brt_cursor(
+        db->i->brt,
+        &dbc_struct_i(result)->c,
+        txn ? db_txn_struct_i(txn)->tokutxn : NULL,
+        is_snapshot_read,
+        ((flags & DBC_DISABLE_PREFETCHING) != 0)
+        );
+    assert(r == 0 || r == TOKUDB_MVCC_DICTIONARY_TOO_NEW);
+    if (r == 0) {
+        *c = result;
+    }
+    else {
+#if !TOKUDB_NATIVE_H
+        toku_free(result->i); // otherwise it is allocated as part of result->ii
+#endif
+        toku_free(result);
+    }
+    return r;
+}
+
+// Create a non-temporary cursor, refusing (EINVAL) a NULL txn when the environment
+// was opened with DB_INIT_TXN.
+static inline int
+autotxn_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) {
+    if (!txn && (db->dbenv->i->open_flags & DB_INIT_TXN)) {
+        return toku_ydb_do_error(db->dbenv, EINVAL,
+              "Cursors in a transaction environment must have transactions.\n");
+    }
+    return toku_db_cursor_internal(db, txn, c, flags, 0);
+}
+
+// Create a cursor on a db.
+// Called without holding the ydb lock.
+int
+toku_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags) {
+    int r = autotxn_db_cursor(db, txn, c, flags);
+    return r;
+}
+
+#undef STATUS_VALUE
+
+#include
+void __attribute__((constructor)) toku_ydb_cursor_drd_ignore(void);
+void
+toku_ydb_cursor_drd_ignore(void) {
+    DRD_IGNORE_VAR(ydb_c_layer_status);
+}
diff --git a/src/ydb_cursor.h b/src/ydb_cursor.h
new file mode 100755
index 00000000000..f11e7d2de6d
--- /dev/null
+++ b/src/ydb_cursor.h
@@ -0,0 +1,34 @@
+// This file defines the public interface to the ydb library
+
+#if !defined(TOKU_YDB_CURSOR_H)
+#define TOKU_YDB_CURSOR_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef enum {
+    YDB_C_LAYER_NUM_POINT_QUERIES = 0,
+    YDB_C_LAYER_NUM_SEQUENTIAL_QUERIES,
+    YDB_C_LAYER_STATUS_NUM_ROWS /* number of rows in this status array */
+} ydb_c_lock_layer_status_entry;
+
+typedef struct {
+    BOOL initialized;
+    TOKU_ENGINE_STATUS_ROW_S status[YDB_C_LAYER_STATUS_NUM_ROWS];
+} YDB_C_LAYER_STATUS_S, *YDB_C_LAYER_STATUS;
+
+void ydb_c_layer_get_status(YDB_C_LAYER_STATUS statp);
+
+int toku_c_get(DBC * c, DBT * key, DBT * data, u_int32_t flag);
+int toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra);
+int toku_c_close(DBC * c);
+int toku_db_cursor_internal(DB *db, DB_TXN * txn, DBC **c, u_int32_t flags, int is_temporary_cursor);
+int toku_db_cursor(DB *db, DB_TXN *txn, DBC **c, u_int32_t flags);
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/ydb_db.c b/src/ydb_db.c
new file mode 100644
index 00000000000..33356517d8f
--- /dev/null
+++ b/src/ydb_db.c
@@ -0,0 +1,1179 @@
+/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#ident "Copyright (c) 2007-2009 Tokutek Inc. All rights reserved."
+
+#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "toku_assert.h"
+#include "ydb.h"
+#include "ydb-internal.h"
+#include "brt-internal.h"
+#include "brt-flusher.h"
+#include "cachetable.h"
+#include "log.h"
+#include "memory.h"
+#include "dlmalloc.h"
+#include "checkpoint.h"
+#include "key.h"
+#include "loader.h"
+#include "indexer.h"
+#include "ydb_load.h"
+#include "brtloader.h"
+#include "log_header.h"
+#include "ydb_cursor.h"
+#include "ydb_row_lock.h"
+#include "ydb_env_func.h"
+#include "ydb_db.h"
+#include "ydb_write.h"
+
+
+static YDB_DB_LAYER_STATUS_S ydb_db_layer_status;
+#ifdef STATUS_VALUE
+#undef STATUS_VALUE
+#endif
+#define STATUS_VALUE(x) ydb_db_layer_status.status[x].value.num
+
+#define STATUS_INIT(k,t,l) { \
+        ydb_db_layer_status.status[k].keyname = #k; \
+        ydb_db_layer_status.status[k].type = t; \
+        ydb_db_layer_status.status[k].legend = l; \
+    }
+
+static void
+ydb_db_layer_status_init (void) {
+    // Note, this function initializes the keyname, type, and legend fields.
+    // Value fields are initialized to zero by compiler.
+
+    STATUS_INIT(YDB_LAYER_DIRECTORY_WRITE_LOCKS, UINT64, "directory write locks");
+    STATUS_INIT(YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL, UINT64, "directory write locks fail");
+    STATUS_INIT(YDB_LAYER_LOGSUPPRESS, UINT64, "log suppress");
+    STATUS_INIT(YDB_LAYER_LOGSUPPRESS_FAIL, UINT64, "log suppress fail");
+    ydb_db_layer_status.initialized = true;
+}
+#undef STATUS_INIT
+
+void
+ydb_db_layer_get_status(YDB_DB_LAYER_STATUS statp) {
+    if (!ydb_db_layer_status.initialized)
+        ydb_db_layer_status_init();
+    *statp = ydb_db_layer_status;
+}
+
+static inline DBT*
+init_dbt_realloc(DBT *dbt) {
+    memset(dbt, 0, sizeof(*dbt));
+    dbt->flags = DB_DBT_REALLOC;
+    return dbt;
+}
+
+static void
+create_iname_hint(const char *dname, char *hint) {
+    //Requires: size of hint array must be > strlen(dname) (at most one output char per input char, plus the NUL).
+    //Copy alphanumeric characters only.
+    //Replace each run of non-alphanumeric characters with a single underscore.
+    BOOL underscored = FALSE;
+    while (*dname) {
+        if (isalnum((unsigned char) *dname)) { // <ctype.h> functions need an unsigned char value; a negative char is undefined
+            char c = *dname++;
+            *hint++ = c;
+            underscored = FALSE;
+        }
+        else {
+            if (!underscored)
+                *hint++ = '_';
+            dname++;
+            underscored = TRUE;
+        }
+    }
+    *hint = '\0';
+}
+
+
+// n < 0 means to ignore mark and ignore n
+// n >= 0 means to include mark ("_B_" or "_P_") with hex value of n in iname
+// (intended for use by loader, which will create many inames using one txnid).
+static char * +create_iname(DB_ENV *env, u_int64_t id, char *hint, char *mark, int n) { + int bytes; + char inamebase[strlen(hint) + + 8 + // hex file format version + 16 + // hex id (normally the txnid) + 8 + // hex value of n if non-neg + sizeof("_B___.tokudb")]; // extra pieces + if (n < 0) + bytes = snprintf(inamebase, sizeof(inamebase), + "%s_%"PRIx64"_%"PRIx32 ".tokudb", + hint, id, BRT_LAYOUT_VERSION); + else { + invariant(strlen(mark) == 1); + bytes = snprintf(inamebase, sizeof(inamebase), + "%s_%"PRIx64"_%"PRIx32"_%s_%"PRIx32".tokudb", + hint, id, BRT_LAYOUT_VERSION, mark, n); + } + assert(bytes>0); + assert(bytes<=(int)sizeof(inamebase)-1); + char *rval; + if (env->i->data_dir) + rval = toku_construct_full_name(2, env->i->data_dir, inamebase); + else + rval = toku_construct_full_name(1, inamebase); + assert(rval); + return rval; +} + + + +static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode); + +static int +db_close_before_brt(DB *db, u_int32_t UU(flags)) { + int r; + char *error_string = NULL; + + if (db_opened(db) && db->i->dname) { + // internal (non-user) dictionary has no dname + env_note_zombie_db_closed(db->dbenv, db); // tell env that this db is no longer a zombie (it is completely closed) + } + r = toku_close_brt(db->i->brt, &error_string); + if (r) { + if (!error_string) + error_string = "Closing file\n"; + // Panicking the whole environment may be overkill, but I'm not sure what else to do. 
+ env_panic(db->dbenv, r, error_string); + toku_ydb_do_error(db->dbenv, r, "%s", error_string); + } + else { + if (db->i->lt) { + toku_lt_remove_db_ref(db->i->lt, db); + } + // printf("%s:%d %d=__toku_db_close(%p)\n", __FILE__, __LINE__, r, db); + toku_sdbt_cleanup(&db->i->skey); + toku_sdbt_cleanup(&db->i->sval); + if (db->i->dname) toku_free(db->i->dname); + toku_free(db->i); + toku_free(db); + } + return r; +} + +void +toku_db_add_ref(DB *db) { + db->i->refs++; +} + +void +toku_db_release_ref(DB *db){ + db->i->refs--; +} + +//DB->close() +int +toku_db_close(DB * db, u_int32_t flags) { + int r = 0; + if (db->i->refs != 1) { + r = EBUSY; + } else { + db->i->refs = 0; + if (db_opened(db) && db->i->dname) { + // internal (non-user) dictionary has no dname + env_note_db_closed(db->dbenv, db); // tell env that this db is no longer in use by the user of this api (user-closed, may still be in use by fractal tree internals) + db->i->is_zombie = TRUE; + env_note_zombie_db(db->dbenv, db); // tell env that this db is a zombie + } + //Remove from transaction's list of 'must close' if necessary. 
+ if (!toku_list_empty(&db->i->dbs_that_must_close_before_abort)) + toku_list_remove(&db->i->dbs_that_must_close_before_abort); + + r = toku_brt_db_delay_closed(db->i->brt, db, db_close_before_brt, flags); + } + return r; +} + + +/////////// +//db_getf_XXX is equivalent to c_getf_XXX, without a persistent cursor + +int +db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + DBC *c; + uint32_t create_flags = flags & (DB_ISOLATION_FLAGS | DB_RMW); + flags &= ~DB_ISOLATION_FLAGS; + int r = toku_db_cursor_internal(db, txn, &c, create_flags | DBC_DISABLE_PREFETCHING, 1); + if (r==0) { + r = toku_c_getf_set(c, flags, key, f, extra); + int r2 = toku_c_close(c); + if (r==0) r = r2; + } + return r; +} + +static inline int +db_thread_need_flags(DBT *dbt) { + return (dbt->flags & (DB_DBT_MALLOC+DB_DBT_REALLOC+DB_DBT_USERMEM)) == 0; +} + +int +toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + int r; + u_int32_t iso_flags = flags & DB_ISOLATION_FLAGS; + + if ((db->i->open_flags & DB_THREAD) && db_thread_need_flags(data)) + return EINVAL; + + u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); + flags &= ~lock_flags; + flags &= ~DB_ISOLATION_FLAGS; + // And DB_GET_BOTH is no longer supported. #2862. + if (flags != 0) return EINVAL; + + + DBC *dbc; + r = toku_db_cursor_internal(db, txn, &dbc, iso_flags | DBC_DISABLE_PREFETCHING, 1); + if (r!=0) return r; + u_int32_t c_get_flags = DB_SET; + r = toku_c_get(dbc, key, data, c_get_flags | lock_flags); + int r2 = toku_c_close(dbc); + return r ? 
r : r2; +} + +#if 0 +static int +toku_db_key_range(DB * db, DB_TXN * txn, DBT * dbt, DB_KEY_RANGE * kr, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + txn=txn; dbt=dbt; kr=kr; flags=flags; + toku_ydb_barf(); + abort(); +} +#endif + +static int +db_open_subdb(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { + int r; + if (!fname || !dbname) r = EINVAL; + else { + char subdb_full_name[strlen(fname) + sizeof("/") + strlen(dbname)]; + int bytes = snprintf(subdb_full_name, sizeof(subdb_full_name), "%s/%s", fname, dbname); + assert(bytes==(int)sizeof(subdb_full_name)-1); + const char *null_subdbname = NULL; + r = toku_db_open(db, txn, subdb_full_name, null_subdbname, dbtype, flags, mode); + } + return r; +} + +// inames are created here. +// algorithm: +// begin txn +// convert dname to iname (possibly creating new iname) +// open file (toku_brt_open() will handle logging) +// close txn +// if created a new iname, take full range lock +static int +toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + if (dbname!=NULL) + return db_open_subdb(db, txn, fname, dbname, dbtype, flags, mode); + + // at this point fname is the dname + //This code ONLY supports single-db files. + assert(dbname==NULL); + const char * dname = fname; // db_open_subdb() converts (fname, dbname) to dname + + ////////////////////////////// do some level of parameter checking. 
+ u_int32_t unused_flags = flags; + int using_txns = db->dbenv->i->open_flags & DB_INIT_TXN; + int r; + if (dbtype!=DB_BTREE && dbtype!=DB_UNKNOWN) return EINVAL; + int is_db_excl = flags & DB_EXCL; unused_flags&=~DB_EXCL; + int is_db_create = flags & DB_CREATE; unused_flags&=~DB_CREATE; + int is_db_hot_index = flags & DB_IS_HOT_INDEX; unused_flags&=~DB_IS_HOT_INDEX; + + //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided. + unused_flags&=~DB_READ_UNCOMMITTED; + unused_flags&=~DB_READ_COMMITTED; + unused_flags&=~DB_SERIALIZABLE; + if (unused_flags & ~DB_THREAD) return EINVAL; // unknown flags + + if (is_db_excl && !is_db_create) return EINVAL; + if (dbtype==DB_UNKNOWN && is_db_excl) return EINVAL; + + /* tokudb supports no duplicates and sorted duplicates only */ + unsigned int tflags; + r = toku_brt_get_flags(db->i->brt, &tflags); + if (r != 0) + return r; + + if (db_opened(db)) + return EINVAL; /* It was already open. */ + ////////////////////////////// + + DB_TXN *child = NULL; + // begin child (unless transactionless) + if (using_txns) { + r = toku_txn_begin_internal(db->dbenv, txn, &child, DB_TXN_NOSYNC, 1, true); + assert(r==0); + } + + // convert dname to iname + // - look up dname, get iname + // - if dname does not exist, create iname and make entry in directory + DBT dname_dbt; // holds dname + DBT iname_dbt; // holds iname_in_env + toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1); + init_dbt_realloc(&iname_dbt); // sets iname_dbt.data = NULL + r = toku_db_get(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, DB_SERIALIZABLE); // allocates memory for iname + char *iname = iname_dbt.data; + if (r==DB_NOTFOUND && !is_db_create) + r = ENOENT; + else if (r==0 && is_db_excl) { + r = EEXIST; + } + else if (r==DB_NOTFOUND) { + char hint[strlen(dname) + 1]; + + // create iname and make entry in directory + u_int64_t id = 0; + + if (using_txns) { + id = toku_txn_get_txnid(db_txn_struct_i(child)->tokutxn); + } + 
create_iname_hint(dname, hint); + iname = create_iname(db->dbenv, id, hint, NULL, -1); // allocated memory for iname + toku_fill_dbt(&iname_dbt, iname, strlen(iname) + 1); + // + // 0 for performance only, avoid unnecessary query + // if we are creating a hot index, per #3166, we do not want the write lock in directory grabbed. + // directory read lock is grabbed in toku_db_get above + // + u_int32_t put_flags = 0 | ((is_db_hot_index) ? DB_PRELOCKED_WRITE : 0); + r = toku_db_put(db->dbenv->i->directory, child, &dname_dbt, &iname_dbt, put_flags, TRUE); + } + + // we now have an iname + if (r == 0) { + r = db_open_iname(db, child, iname, flags, mode); + if (r==0) { + db->i->dname = toku_xstrdup(dname); + env_note_db_opened(db->dbenv, db); // tell env that a new db handle is open (using dname) + } + } + + // free string holding iname + if (iname) toku_free(iname); + + if (using_txns) { + // close txn + if (r == 0) { // commit + r = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false); + invariant(r==0); // TODO panic + } + else { // abort + int r2 = toku_txn_abort(child, NULL, NULL, false); + invariant(r2==0); // TODO panic + } + } + + return r; +} + +int +db_open_iname(DB * db, DB_TXN * txn, const char *iname_in_env, u_int32_t flags, int mode) { + int r; + + //Set comparison functions if not yet set. + if (!db->i->key_compare_was_set && db->dbenv->i->bt_compare) { + r = toku_brt_set_bt_compare(db->i->brt, db->dbenv->i->bt_compare); + assert(r==0); + db->i->key_compare_was_set = TRUE; + } + if (db->dbenv->i->update_function) { + r = toku_brt_set_update(db->i->brt,db->dbenv->i->update_function); + assert(r==0); + } + BOOL need_locktree = (BOOL)((db->dbenv->i->open_flags & DB_INIT_LOCK) && + (db->dbenv->i->open_flags & DB_INIT_TXN)); + + int is_db_excl = flags & DB_EXCL; flags&=~DB_EXCL; + int is_db_create = flags & DB_CREATE; flags&=~DB_CREATE; + //We support READ_UNCOMMITTED and READ_COMMITTED whether or not the flag is provided. 
+ flags&=~DB_READ_UNCOMMITTED; + flags&=~DB_READ_COMMITTED; + flags&=~DB_SERIALIZABLE; + flags&=~DB_IS_HOT_INDEX; + if (flags & ~DB_THREAD) return EINVAL; // unknown flags + + if (is_db_excl && !is_db_create) return EINVAL; + + /* tokudb supports no duplicates and sorted duplicates only */ + unsigned int tflags; + r = toku_brt_get_flags(db->i->brt, &tflags); + if (r != 0) + return r; + + if (db_opened(db)) + return EINVAL; /* It was already open. */ + + db->i->open_flags = flags; + db->i->open_mode = mode; + + r = toku_brt_open(db->i->brt, iname_in_env, + is_db_create, is_db_excl, + db->dbenv->i->cachetable, + txn ? db_txn_struct_i(txn)->tokutxn : NULL_TXN, + db); + if (r != 0) + goto error_cleanup; + + db->i->opened = 1; + if (need_locktree) { + db->i->dict_id = toku_brt_get_dictionary_id(db->i->brt); + r = toku_ltm_get_lt(db->dbenv->i->ltm, &db->i->lt, db->i->dict_id, db, toku_brt_get_bt_compare(db->i->brt)); + if (r!=0) { goto error_cleanup; } + } + //Add to transaction's list of 'must close' if necessary. + if (txn) { + //Do last so we don't have to undo. 
+ toku_list_push(&db_txn_struct_i(txn)->dbs_that_must_close_before_abort, + &db->i->dbs_that_must_close_before_abort); + } + + return 0; + +error_cleanup: + db->i->dict_id = DICTIONARY_ID_NONE; + db->i->opened = 0; + if (db->i->lt) { + toku_lt_remove_db_ref(db->i->lt, db); + db->i->lt = NULL; + } + return r; +} + +// Return the maximum key and val size in +// *key_size and *val_size respectively +static void +db_get_max_row_size(DB * UU(db), uint32_t * max_key_size, uint32_t * max_val_size) { + *max_key_size = 0; + *max_val_size = 0; + toku_brt_get_maximum_advised_key_value_lengths(max_key_size, max_val_size); +} + +static void +locked_db_get_max_row_size(DB *db, uint32_t *max_key_size, uint32_t *max_val_size) { + toku_ydb_lock(); + db_get_max_row_size(db, max_key_size, max_val_size); + toku_ydb_unlock(); +} + +int toku_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn) { + // bad hack because some environment dictionaries do not have a dname + char *dname = db->i->dname; + if (!dname) + return 0; + + DBT key_in_directory = { .data = dname, .size = strlen(dname)+1 }; + //Left end of range == right end of range (point lock) + int r = get_range_lock(db->dbenv->i->directory, txn, &key_in_directory, &key_in_directory, LOCK_REQUEST_WRITE); + if (r == 0) + STATUS_VALUE(YDB_LAYER_DIRECTORY_WRITE_LOCKS)++; // accountability + else + STATUS_VALUE(YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL)++; // accountability + return r; +} + +static int +toku_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + DB_TXN *null_txn = NULL; + int r = toku_env_dbremove(db->dbenv, null_txn, fname, dbname, flags); + int r2 = toku_db_close(db, 0); + if (r==0) r = r2; + return r; +} + +static int +toku_db_rename(DB * db, const char *fname, const char *dbname, const char *newname, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + DB_TXN *null_txn = NULL; + int r = toku_env_dbrename(db->dbenv, null_txn, fname, dbname, newname, flags); + int r2 = 
toku_db_close(db, 0); + if (r==0) r = r2; + return r; +} + +// +// This function is the only way to set a descriptor of a DB. +// +static int +toku_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + int r; + TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; + DBT old_descriptor; + BOOL is_db_hot_index = ((flags & DB_IS_HOT_INDEX) != 0); + + toku_init_dbt(&old_descriptor); + if (!db_opened(db) || !txn || !descriptor || (descriptor->size>0 && !descriptor->data)){ + r = EINVAL; + goto cleanup; + } + if (txn->parent != NULL) { + r = EINVAL; // cannot have a parent if you are a resetting op + goto cleanup; + } + if (!is_db_hot_index) { + r = toku_db_pre_acquire_fileops_lock(db, txn); + if (r != 0) { goto cleanup; } + } + + old_descriptor.size = db->descriptor->dbt.size; + old_descriptor.data = toku_memdup(db->descriptor->dbt.data, db->descriptor->dbt.size); + r = toku_brt_change_descriptor(db->i->brt, &old_descriptor, descriptor, TRUE, ttxn); +cleanup: + if (old_descriptor.data) toku_free(old_descriptor.data); + return r; +} + +static int +toku_db_set_flags(DB *db, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + + /* the following matches BDB */ + if (db_opened(db) && flags != 0) return EINVAL; + + return 0; +} + +static int +toku_db_get_flags(DB *db, u_int32_t *pflags) { + HANDLE_PANICKED_DB(db); + if (!pflags) return EINVAL; + *pflags = 0; + return 0; +} + +static int +toku_db_set_pagesize(DB *db, u_int32_t pagesize) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_set_nodesize(db->i->brt, pagesize); + return r; +} + +static int +toku_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_get_nodesize(db->i->brt, pagesize_ptr); + return r; +} + +static int +toku_db_set_readpagesize(DB *db, u_int32_t readpagesize) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_set_basementnodesize(db->i->brt, readpagesize); + 
return r; +} + +static int +toku_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_get_basementnodesize(db->i->brt, readpagesize_ptr); + return r; +} + +static int +toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + struct brtstat64_s brtstat; + TOKUTXN tokutxn = NULL; + if (txn != NULL) { + tokutxn = db_txn_struct_i(txn)->tokutxn; + } + int r = toku_brt_stat64(db->i->brt, tokutxn, &brtstat); + if (r==0) { + s->bt_nkeys = brtstat.nkeys; + s->bt_ndata = brtstat.ndata; + s->bt_dsize = brtstat.dsize; + s->bt_fsize = brtstat.fsize; + // 4018 + s->bt_create_time_sec = brtstat.create_time_sec; + s->bt_modify_time_sec = brtstat.modify_time_sec; + s->bt_verify_time_sec = brtstat.verify_time_sec; + } + return r; +} + +static int +toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + + // note that toku_brt_keyrange does not have a txn param + // this will be fixed later + // temporarily, because the caller, locked_db_keyrange, + // has the ydb lock, we are ok + int r = toku_brt_keyrange(db->i->brt, key, less, equal, greater); + if (r != 0) { goto cleanup; } + // temporarily set is_exact to 0 because brt_keyrange does not have this parameter + *is_exact = 0; +cleanup: + return r; +} + +// needed by loader.c +int +toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn, BOOL UU(just_lock)) { + HANDLE_PANICKED_DB(db); + if (!db->i->lt || !txn) return 0; + int r; + r = get_range_lock(db, txn, toku_lt_neg_infinity, toku_lt_infinity, LOCK_REQUEST_WRITE); +// commented out code for log suppression and recovery suppression. +// TODO: figure out right thing to do with this code. 
+#if 0 + if (r==0 && !just_lock && + !toku_brt_is_recovery_logging_suppressed(db->i->brt) && + toku_brt_is_empty_fast(db->i->brt) + ) { + //Try to suppress both rollback and recovery logs + DB_LOADER *loader; + DB *dbs[1] = {db}; + uint32_t db_flags[1] = {DB_NOOVERWRITE}; + uint32_t dbt_flags[1] = {0}; + uint32_t loader_flags = DB_PRELOCKED_WRITE; //Don't recursively prelock + DB_ENV *env = db->dbenv; + DB_TXN *child = NULL; + + { + // begin child + int rt = toku_txn_begin_internal(env, txn, &child, DB_TXN_NOSYNC, 1, true); + assert(rt==0); + } + + toku_ydb_unlock(); //Cannot hold ydb lock when creating loader + + int r_loader = env->create_loader(env, child, &loader, NULL, 1, dbs, db_flags, dbt_flags, loader_flags); + if (r_loader==0) { + r_loader = loader->set_error_callback(loader, NULL, NULL); + assert(r_loader==0); + r_loader = loader->set_poll_function(loader, NULL, NULL); + assert(r_loader==0); + // close the loader + r_loader = loader->close(loader); + if (r_loader==0) { + toku_brt_suppress_recovery_logs(db->i->brt, db_txn_struct_i(child)->tokutxn); + } + } + else if (r_loader != DB_LOCK_NOTGRANTED) { + //Lock not granted is not an error. + //It just means we cannot use the loader optimization. + assert(r==0); + r = r_loader; + } + if (r_loader == 0) { // commit + r = locked_txn_commit(child, 0); + assert(r==0); + STATUS_VALUE(YDB_LAYER_LOGSUPPRESS)++; // accountability + } + else { // abort + r = locked_txn_abort(child); + assert(r==0); + STATUS_VALUE(YDB_LAYER_LOGSUPPRESS_FAIL)++; // accountability + } + toku_ydb_lock(); //Reaquire ydb lock. 
+ } +#endif + return r; +} + +static int +locked_db_close(DB * db, u_int32_t flags) { + toku_ydb_lock(); + int r = toku_db_close(db, flags); + toku_ydb_unlock(); + return r; +} + +int +autotxn_db_get(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags) { + BOOL changed; int r; + // ydb lock is NOT held here + r = toku_db_construct_autotxn(db, &txn, &changed, FALSE, FALSE); + if (r!=0) return r; + r = toku_db_get(db, txn, key, data, flags); + return toku_db_destruct_autotxn(txn, r, changed, FALSE); +} + +static inline int +autotxn_db_getf_set (DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) { + BOOL changed; int r; + // ydb lock is NOT held here + r = toku_db_construct_autotxn(db, &txn, &changed, FALSE, FALSE); + if (r!=0) return r; + r = db_getf_set(db, txn, flags, key, f, extra); + return toku_db_destruct_autotxn(txn, r, changed, FALSE); +} + +// truncate a database +// effect: remove all of the rows from a database +static int +toku_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + int r; + + u_int32_t unhandled_flags = flags; + int ignore_cursors = 0; + if (flags & DB_TRUNCATE_WITHCURSORS) { + ignore_cursors = 1; + unhandled_flags &= ~DB_TRUNCATE_WITHCURSORS; + } + + // dont support flags (yet) + if (unhandled_flags) + return EINVAL; + // dont support cursors unless explicitly told to + if (!ignore_cursors && toku_brt_get_cursor_count(db->i->brt) > 0) + return EINVAL; + + // acquire a table lock + if (txn) { + r = toku_db_pre_acquire_fileops_lock(db, txn); + if (r != 0) { + return r; + } + r = toku_db_pre_acquire_table_lock(db, txn, TRUE); + if (r != 0) { + return r; + } + } + + *row_count = 0; + + r = toku_brt_truncate(db->i->brt); + + return r; +} + +static inline int +autotxn_db_open(DB* db, DB_TXN* txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { + BOOL changed; int r; + // YDB 
lock is held when this function is called + r = toku_db_construct_autotxn(db, &txn, &changed, (BOOL)((flags & DB_AUTO_COMMIT) != 0), TRUE); + if (r!=0) return r; + r = toku_db_open(db, txn, fname, dbname, dbtype, flags & ~DB_AUTO_COMMIT, mode); + return toku_db_destruct_autotxn(txn, r, changed, TRUE); +} + +static int +locked_db_open(DB *db, DB_TXN *txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) { + toku_multi_operation_client_lock(); //Cannot begin checkpoint + toku_ydb_lock(); int r = autotxn_db_open(db, txn, fname, dbname, dbtype, flags, mode); toku_ydb_unlock(); + toku_multi_operation_client_unlock(); //Can now begin checkpoint + return r; +} + +static int +locked_db_remove(DB * db, const char *fname, const char *dbname, u_int32_t flags) { + toku_multi_operation_client_lock(); //Cannot begin checkpoint + toku_ydb_lock(); + int r = toku_db_remove(db, fname, dbname, flags); + toku_ydb_unlock(); + toku_multi_operation_client_unlock(); //Can now begin checkpoint + return r; +} + +static int +locked_db_rename(DB * db, const char *namea, const char *nameb, const char *namec, u_int32_t flags) { + toku_multi_operation_client_lock(); //Cannot begin checkpoint + toku_ydb_lock(); + int r = toku_db_rename(db, namea, nameb, namec, flags); + toku_ydb_unlock(); + toku_multi_operation_client_unlock(); //Can now begin checkpoint + return r; +} + +static int +locked_db_change_descriptor(DB *db, DB_TXN* txn, const DBT* descriptor, u_int32_t flags) { + toku_ydb_lock(); + int r = toku_db_change_descriptor(db, txn, descriptor, flags); + toku_ydb_unlock(); + return r; +} + +static void +locked_db_set_errfile (DB *db, FILE *errfile) { + db->dbenv->set_errfile(db->dbenv, errfile); +} + +static int +locked_db_set_flags(DB *db, u_int32_t flags) { + toku_ydb_lock(); int r = toku_db_set_flags(db, flags); toku_ydb_unlock(); return r; +} + +static int +locked_db_get_flags(DB *db, u_int32_t *flags) { + toku_ydb_lock(); int r = toku_db_get_flags(db, 
flags); toku_ydb_unlock(); return r; +} + +static int +locked_db_set_pagesize(DB *db, u_int32_t pagesize) { + toku_ydb_lock(); int r = toku_db_set_pagesize(db, pagesize); toku_ydb_unlock(); return r; +} + +static int +locked_db_get_pagesize(DB *db, u_int32_t *pagesize_ptr) { + toku_ydb_lock(); int r = toku_db_get_pagesize(db, pagesize_ptr); toku_ydb_unlock(); return r; +} + +static int +locked_db_set_readpagesize(DB *db, u_int32_t readpagesize) { + toku_ydb_lock(); int r = toku_db_set_readpagesize(db, readpagesize); toku_ydb_unlock(); return r; +} + +static int +locked_db_get_readpagesize(DB *db, u_int32_t *readpagesize_ptr) { + toku_ydb_lock(); int r = toku_db_get_readpagesize(db, readpagesize_ptr); toku_ydb_unlock(); return r; +} + +// TODO 2216 delete this +static int +locked_db_fd(DB * UU(db), int * UU(fdp)) { + // toku_ydb_lock(); + // int r = toku_db_fd(db, fdp); + // toku_ydb_unlock(); + // return r; + return 0; +} +static const DBT* toku_db_dbt_pos_infty(void) __attribute__((pure)); +static const DBT* +toku_db_dbt_pos_infty(void) { + return toku_lt_infinity; +} + +static const DBT* toku_db_dbt_neg_infty(void) __attribute__((pure)); +static const DBT* +toku_db_dbt_neg_infty(void) { + return toku_lt_neg_infinity; +} + +static int +locked_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) { + toku_checkpoint_safe_client_lock(); + toku_ydb_lock(); + int r = toku_db_truncate(db, txn, row_count, flags); + toku_ydb_unlock(); + toku_checkpoint_safe_client_unlock(); + return r; +} + +static int +toku_db_optimize(DB *db) { + HANDLE_PANICKED_DB(db); + int r = toku_brt_optimize(db->i->brt); + return r; +} + +static int +toku_db_hot_optimize(DB *db, + int (*progress_callback)(void *extra, float progress), + void *progress_extra) +{ + HANDLE_PANICKED_DB(db); + int r = 0; + // If we areunable to get a directory read lock, do nothing. 
+ r = toku_brt_hot_optimize(db->i->brt, + progress_callback, + progress_extra); + + return r; +} + +static int +locked_db_optimize(DB *db) { + toku_ydb_lock(); + int r = toku_db_optimize(db); + toku_ydb_unlock(); + return r; +} + +static int +db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) { + HANDLE_PANICKED_DB(db); + int r; + if (!db_opened(db)) + r = toku_ydb_do_error(db->dbenv, EINVAL, "Fragmentation report available only on open DBs.\n"); + else + r = toku_brt_get_fragmentation(db->i->brt, report); + return r; +} + +static int +locked_db_get_fragmentation(DB * db, TOKU_DB_FRAGMENTATION report) { + toku_ydb_lock(); + int r = db_get_fragmentation(db, report); + toku_ydb_unlock(); + return r; +} + +int +toku_db_set_indexer(DB *db, DB_INDEXER * indexer) { + int r = 0; + if ( db->i->indexer != NULL && indexer != NULL ) { + // you are trying to overwrite a valid indexer + r = EINVAL; + } + else { + db->i->indexer = indexer; + } + return r; +} + +static int +locked_db_set_indexer(DB *db, DB_INDEXER *indexer) { + toku_ydb_lock(); int r = toku_db_set_indexer(db, indexer); toku_ydb_unlock(); return r; +} + +DB_INDEXER * +toku_db_get_indexer(DB *db) { + return db->i->indexer; +} + +static void +locked_db_get_indexer(DB *db, DB_INDEXER **indexer_ptr) { + toku_ydb_lock(); *indexer_ptr = toku_db_get_indexer(db); toku_ydb_unlock(); +} + +struct ydb_verify_context { + int (*progress_callback)(void *extra, float progress); + void *progress_extra; +}; + +static int +ydb_verify_progress_callback(void *extra, float progress) { + struct ydb_verify_context *context = (struct ydb_verify_context *) extra; + int r = 0; + if (context->progress_callback) { + r = context->progress_callback(context->progress_extra, progress); + } + return r; +} + +static int +toku_db_verify_with_progress(DB *db, int (*progress_callback)(void *extra, float progress), void *progress_extra, int verbose, int keep_going) { + struct ydb_verify_context context = { progress_callback, progress_extra }; 
+ int r = toku_verify_brt_with_progress(db->i->brt, ydb_verify_progress_callback, &context, verbose, keep_going); + return r; +} + + +static int +db_pre_acquire_table_lock(DB *db, DB_TXN *txn) { + return toku_db_pre_acquire_table_lock(db, txn, FALSE); +} + +int +toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) { + int r; + + if (flags || env == NULL) + return EINVAL; + + if (!env_opened(env)) + return EINVAL; + + DB *MALLOC(result); + if (result == 0) { + return ENOMEM; + } + memset(result, 0, sizeof *result); + result->dbenv = env; + // methods that grab the ydb lock +#define SDB(name) result->name = locked_db_ ## name + SDB(close); + // SDB(key_range); + SDB(open); + SDB(remove); + SDB(rename); + SDB(change_descriptor); + SDB(set_errfile); + SDB(set_pagesize); + SDB(get_pagesize); + SDB(set_readpagesize); + SDB(get_readpagesize); + SDB(set_flags); + SDB(get_flags); + SDB(fd); + SDB(truncate); + SDB(get_max_row_size); + SDB(optimize); + SDB(get_fragmentation); + SDB(set_indexer); + SDB(get_indexer); +#undef SDB + // methods that take the ydb lock in some capacity, + // but not from beginning to end + result->del = autotxn_db_del; + result->put = autotxn_db_put; + result->update = autotxn_db_update; + result->update_broadcast = autotxn_db_update_broadcast; + + // unlocked methods + result->cursor = toku_db_cursor; + result->get = autotxn_db_get; + result->getf_set = autotxn_db_getf_set; + result->pre_acquire_table_lock = db_pre_acquire_table_lock; + result->pre_acquire_fileops_lock = toku_db_pre_acquire_fileops_lock; + result->key_range64 = toku_db_key_range64; + result->hot_optimize = toku_db_hot_optimize; + result->stat64 = toku_db_stat64; + result->verify_with_progress = toku_db_verify_with_progress; + + result->dbt_pos_infty = toku_db_dbt_pos_infty; + result->dbt_neg_infty = toku_db_dbt_neg_infty; + MALLOC(result->i); + if (result->i == 0) { + toku_free(result); + return ENOMEM; + } + memset(result->i, 0, sizeof *result->i); + result->i->dict_id = 
DICTIONARY_ID_NONE; + result->i->opened = 0; + result->i->open_flags = 0; + result->i->open_mode = 0; + result->i->brt = 0; + result->i->indexer = NULL; + result->i->refs = 1; + toku_list_init(&result->i->dbs_that_must_close_before_abort); + r = toku_brt_create(&result->i->brt); + if (r != 0) { + toku_free(result->i); + toku_free(result); + return r; + } + *db = result; + return 0; +} + + +/* Following functions (ydb_load_xxx()) are used by loader: + */ + +// When the loader is created, it makes this call. +// For each dictionary to be loaded, replace old iname in directory +// with a newly generated iname. This will also take a write lock +// on the directory entries. The write lock will be released when +// the transaction of the loader is completed. +// If the transaction commits, the new inames are in place. +// If the transaction aborts, the old inames will be restored. +// The new inames are returned to the caller. +// It is the caller's responsibility to free them. +// If "mark_as_loader" is true, then include a mark in the iname +// to indicate that the file is created by the brt loader. +// Return 0 on success (could fail if write lock not available). 
+int +ydb_load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[N], char * new_inames_in_env[N], LSN *load_lsn, BOOL mark_as_loader) { + int rval; + int i; + + int using_txns = env->i->open_flags & DB_INIT_TXN; + DB_TXN * child = NULL; + TXNID xid = 0; + DBT dname_dbt; // holds dname + DBT iname_dbt; // holds new iname + + char * mark; // one-letter tag passed to create_iname(): "B" when mark_as_loader is set, otherwise "P" + + if (mark_as_loader) + mark = "B"; + else + mark = "P"; + // NOTE(review): the statement below appears truncated by text extraction (whatever stood between "i" and "tokutxn" is gone); restore it from the upstream patch before relying on this hunk. + for (i=0; itokutxn); + } + for (i = 0; i < N; i++) { + char * dname = dbs[i]->i->dname; + toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1); + // now create new iname + char hint[strlen(dname) + 1]; + create_iname_hint(dname, hint); + char * new_iname = create_iname(env, xid, hint, mark, i); // allocates memory for iname_in_env + new_inames_in_env[i] = new_iname; + toku_fill_dbt(&iname_dbt, new_iname, strlen(new_iname) + 1); // iname_in_env goes in directory + rval = toku_db_put(env->i->directory, child, &dname_dbt, &iname_dbt, 0, TRUE); // the final TRUE is toku_db_put's holds_ydb_lock argument + if (rval) break; // stop at the first failed directory update + } + + // Generate load log entries. + if (!rval && using_txns) { + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + int do_fsync = 0; + LSN *get_lsn = NULL; + for (i = 0; i < N; i++) { + BRT brt = dbs[i]->i->brt; + //Fsync is necessary for the last one only. + if (i==N-1) { + do_fsync = 1; //We only need a single fsync of logs. + get_lsn = load_lsn; //Set pointer to capture the last lsn.
+ } + rval = toku_brt_load(brt, ttxn, new_inames_in_env[i], do_fsync, get_lsn); + if (rval) break; + } + } + + if (using_txns) { + // close txn + if (rval == 0) { // all well so far, commit child + rval = toku_txn_commit(child, DB_TXN_NOSYNC, NULL, NULL, false); + assert(rval==0); + } + else { // abort child + int r2 = toku_txn_abort(child, NULL, NULL, false); + assert(r2==0); // NOTE(review): the next line is an extraction splice - the remainder of this loop/function and the start of the definition that ends in "->opened != 0;" are missing; recover them from the upstream patch. + for (i=0; ii->opened != 0; +} +// Returns the comparison function stored on the db's brt. +static inline toku_dbt_cmp +toku_db_get_compare_fun(DB* db) { + return db->i->brt->compare_fun; +} + +int toku_db_pre_acquire_fileops_lock(DB *db, DB_TXN *txn); +int db_open_iname(DB * db, DB_TXN * txn, const char *iname, u_int32_t flags, int mode); +int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn, BOOL just_lock); +int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags); +int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags); +int toku_db_close(DB * db, u_int32_t flags); +int db_getf_set(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra); +int autotxn_db_get(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags); +// Begins an implicit transaction for one operation when the caller supplied none (*txn == NULL) and the environment was opened with DB_INIT_TXN. *changed is set to TRUE only when a transaction was begun here; otherwise it is FALSE and *txn is left alone. Returns 0, or the error from toku_txn_begin_internal. +//TODO: DB_AUTO_COMMIT. +//TODO: Nowait only conditionally? +//TODO: NOSYNC change to SYNC if DB_ENV has something in set_flags +static inline int +toku_db_construct_autotxn(DB* db, DB_TXN **txn, BOOL* changed, BOOL force_auto_commit, BOOL holds_ydb_lock) { + assert(db && txn && changed); + DB_ENV* env = db->dbenv; + if (*txn || !(env->i->open_flags & DB_INIT_TXN)) { + *changed = FALSE; + return 0; + } + BOOL nosync = (BOOL)(!force_auto_commit && !(env->i->open_flags & DB_AUTO_COMMIT)); // DB_TXN_NOSYNC is added below unless auto-commit is forced by the caller or set in the environment's open flags + u_int32_t txn_flags = DB_TXN_NOWAIT | (nosync ?
DB_TXN_NOSYNC : 0); + int r = toku_txn_begin_internal(env, NULL, txn, txn_flags, 1, holds_ydb_lock); + if (r!=0) return r; + *changed = TRUE; + return 0; +} +/* Counterpart of toku_db_construct_autotxn. When changed is FALSE this is a no-op that returns r. Otherwise it commits txn if r == 0 (returning the commit result) or aborts it and returns the original r. The ydb lock is taken around the commit/abort unless holds_ydb_lock says the caller already owns it. */ +static inline int +toku_db_destruct_autotxn(DB_TXN *txn, int r, BOOL changed, BOOL holds_ydb_lock) { + if (!changed) return r; + if (!holds_ydb_lock) toku_ydb_lock(); + if (r==0) { + r = toku_txn_commit(txn, 0, NULL, NULL, false); + } + else { + toku_txn_abort(txn, NULL, NULL, false); /* NOTE(review): the abort's return value is dropped; the caller only sees the original error r */ + } + if (!holds_ydb_lock) toku_ydb_unlock(); + return r; +} + + + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/ydb_env_func.c b/src/ydb_env_func.c new file mode 100644 index 00000000000..54f6fe170ac --- /dev/null +++ b/src/ydb_env_func.c @@ -0,0 +1,158 @@ +/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +#ident "Copyright (c) 2007-2009 Tokutek Inc. All rights reserved." + +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." + +#include /* NOTE(review): header name missing - presumably lost together with its angle brackets during extraction; restore from upstream */ +#include /* NOTE(review): same - header name missing */ +#include "brt-internal.h" +#include "brt-flusher.h" +#include "dlmalloc.h" +#include "checkpoint.h" +#include "brtloader.h" +#include "ydb_env_func.h" + +// For test purposes only. +// These callbacks are never used in production code, only as a way to test the system +// (for example, by causing crashes at predictable times).
+void (*checkpoint_callback_f)(void*) = NULL; +void * checkpoint_callback_extra = NULL; +void (*checkpoint_callback2_f)(void*) = NULL; +void * checkpoint_callback2_extra = NULL; + +uint32_t engine_status_enable = 1; // if zero, suppress engine status output on failed assert, for test programs only +// The db_env_set_func_* setters below each hand a caller-supplied replacement routine to the matching toku_set_func_* (or brtloader) hook. This one forwards the fsync replacement and returns toku_set_func_fsync's result. +int +db_env_set_func_fsync (int (*fsync_function)(int)) { + return toku_set_func_fsync(fsync_function); +} +// Forwards the pwrite replacement to toku_set_func_pwrite and returns its result. +int +db_env_set_func_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) { + return toku_set_func_pwrite(pwrite_function); +} +// Forwards the replacement to toku_set_func_full_pwrite and returns its result. +int +db_env_set_func_full_pwrite (ssize_t (*pwrite_function)(int, const void *, size_t, toku_off_t)) { + return toku_set_func_full_pwrite(pwrite_function); +} +// Forwards the write replacement to toku_set_func_write and returns its result. +int +db_env_set_func_write (ssize_t (*write_function)(int, const void *, size_t)) { + return toku_set_func_write(write_function); +} +// Forwards the replacement to toku_set_func_full_write and returns its result. +int +db_env_set_func_full_write (ssize_t (*write_function)(int, const void *, size_t)) { + return toku_set_func_full_write(write_function); +} +// Forwards the fdopen replacement to toku_set_func_fdopen and returns its result. +int +db_env_set_func_fdopen (FILE * (*fdopen_function)(int, const char *)) { + return toku_set_func_fdopen(fdopen_function); +} +// Forwards the fopen replacement to toku_set_func_fopen and returns its result. +int +db_env_set_func_fopen (FILE * (*fopen_function)(const char *, const char *)) { + return toku_set_func_fopen(fopen_function); +} +// Forwards the open replacement to toku_set_func_open and returns its result. +int +db_env_set_func_open (int (*open_function)(const char *, int, int)) { + return toku_set_func_open(open_function); +} +// Forwards the fclose replacement to toku_set_func_fclose and returns its result. +int +db_env_set_func_fclose (int (*fclose_function)(FILE*)) { + return toku_set_func_fclose(fclose_function); +} +// Forwards the pread replacement to toku_set_func_pread and returns its result. NOTE(review): the offset parameter is off_t here but toku_off_t in the pwrite setters - confirm the two types agree on every supported platform. +int +db_env_set_func_pread (ssize_t (*fun)(int, void *, size_t, off_t)) { + return toku_set_func_pread(fun); +} +// Hands the fwrite replacement to brtloader_set_os_fwrite; nothing is returned. +void +db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) { + brtloader_set_os_fwrite(fwrite_fun); +} +// Passes f to toku_set_func_malloc; always returns 0. +int +db_env_set_func_malloc (void *(*f)(size_t)) { + toku_set_func_malloc(f); + return 0; +} +// Passes f to toku_set_func_realloc; always returns 0. +int +db_env_set_func_realloc (void *(*f)(void*, size_t)) { + toku_set_func_realloc(f); + return 0; +} +// Passes f to toku_set_func_free; always returns 0. +int +db_env_set_func_free
(void (*f)(void*)) { + toku_set_func_free(f); + return 0; +} + + +// Got to call dlmalloc, or else it won't get included. +void +setup_dlmalloc (void) { + db_env_set_func_malloc(dlmalloc); + db_env_set_func_realloc(dlrealloc); + db_env_set_func_free(dlfree); +} + +// For test purposes only. +// With this interface, all checkpoint users get the same callbacks and the same extras. +void +db_env_set_checkpoint_callback (void (*callback_f)(void*), void* extra) { + toku_checkpoint_safe_client_lock(); + checkpoint_callback_f = callback_f; + checkpoint_callback_extra = extra; + toku_checkpoint_safe_client_unlock(); + //printf("set callback = %p, extra = %p\n", callback_f, extra); +} +// Same as db_env_set_checkpoint_callback, but stores into the second callback/extra pair (checkpoint_callback2_f / checkpoint_callback2_extra), also between the checkpoint-safe client lock and unlock. +void +db_env_set_checkpoint_callback2 (void (*callback_f)(void*), void* extra) { + toku_checkpoint_safe_client_lock(); + checkpoint_callback2_f = callback_f; + checkpoint_callback2_extra = extra; + toku_checkpoint_safe_client_unlock(); + //printf("set callback2 = %p, extra2 = %p\n", callback2_f, extra2); +} +// Forwards the callback and its extra to toku_recover_set_callback. +void +db_env_set_recover_callback (void (*callback_f)(void*), void* extra) { + toku_recover_set_callback(callback_f, extra); +} +// Forwards the callback and its extra to toku_recover_set_callback2. +void +db_env_set_recover_callback2 (void (*callback_f)(void*), void* extra) { + toku_recover_set_callback2(callback_f, extra); +} +// Forwards the callback and its extra to toku_flusher_thread_set_callback. +void +db_env_set_flusher_thread_callback(void (*callback_f)(int, void*), void* extra) { + toku_flusher_thread_set_callback(callback_f, extra); +} +// Forwards factor to toku_brtloader_set_size_factor. +void +db_env_set_loader_size_factor (uint32_t factor) { + toku_brtloader_set_size_factor(factor); +} +// Sets garbage_collection_debug to true for any nonzero verification_mode and to false for zero. +void +db_env_set_mvcc_garbage_collection_verification(u_int32_t verification_mode) { + garbage_collection_debug = (verification_mode != 0); +} + +// Purpose: allow test programs that expect to fail to suppress engine status output on failed assert.
+void +db_env_enable_engine_status(uint32_t enable) { + engine_status_enable = enable; +} + + diff --git a/src/ydb_env_func.h b/src/ydb_env_func.h new file mode 100644 index 00000000000..728ed1288ea --- /dev/null +++ b/src/ydb_env_func.h @@ -0,0 +1,23 @@ +// This file defines the public interface to the ydb library + +#if !defined(TOKU_YDB_ENV_FUNC_H) +#define TOKU_YDB_ENV_FUNC_H + +#if defined(__cplusplus) +extern "C" { +#endif + +extern void (*checkpoint_callback_f)(void*); +extern void * checkpoint_callback_extra; +extern void (*checkpoint_callback2_f)(void*); +extern void * checkpoint_callback2_extra; + +extern uint32_t engine_status_enable; +// Called to use dlmalloc functions. +void setup_dlmalloc(void) __attribute__((__visibility__("default"))); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/ydb_row_lock.c b/src/ydb_row_lock.c new file mode 100755 index 00000000000..a61fd9084b4 --- /dev/null +++ b/src/ydb_row_lock.c @@ -0,0 +1,106 @@ +/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +#ident "Copyright (c) 2007-2009 Tokutek Inc. All rights reserved." + +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." + +#include +#include "ydb-internal.h" +#include "ydb_row_lock.h" + +static int +toku_txn_add_lt(DB_TXN* txn, toku_lock_tree* lt) { + int r = ENOSYS; + assert(txn && lt); + toku_mutex_lock(<->mgr->mutex); + toku_lth* lth = db_txn_struct_i(txn)->lth; + // we used to initialize the transaction's lth during begin. 
+ // Now we initialize the lth only if the transaction needs the lth, here + if (!lth) { + r = toku_lth_create(&db_txn_struct_i(txn)->lth); + assert_zero(r); + lth = db_txn_struct_i(txn)->lth; + } + + toku_lock_tree* find = toku_lth_find(lth, lt); + if (find) { + assert(find == lt); + r = 0; + goto cleanup; + } + r = toku_lth_insert(lth, lt); + if (r != 0) { goto cleanup; } + + toku_lt_add_ref(lt); + r = 0; +cleanup: + toku_mutex_unlock(<->mgr->mutex); + return r; +} + +/* + Used for partial implementation of nested transactions. + Work is done by children as normal, but all locking is done by the + root of the nested txn tree. + This may hold extra locks, and will not work as expected when + a node has two non-completed txns at any time. +*/ +static inline DB_TXN* +toku_txn_ancestor(DB_TXN* txn) { + while (txn && txn->parent) txn = txn->parent; + + return txn; +} + +// Get a range lock. +// Return when the range lock is acquired or the default lock tree timeout has expired. +// The ydb mutex must be held when called and may be released when waiting in the lock tree. +int +get_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type) { + int r; + DB_TXN *txn_anc = toku_txn_ancestor(txn); + r = toku_txn_add_lt(txn_anc, db->i->lt); + if (r == 0) { + TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn); + toku_lock_request lock_request; + toku_lock_request_init(&lock_request, db, txn_anc_id, left_key, right_key, lock_type); + r = toku_lt_acquire_lock_request_with_default_timeout(db->i->lt, &lock_request); + toku_lock_request_destroy(&lock_request); + } + return r; +} + +// Setup and start an asynchronous lock request. 
+int +start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type, toku_lock_request *lock_request) { + int r; + DB_TXN *txn_anc = toku_txn_ancestor(txn); + r = toku_txn_add_lt(txn_anc, db->i->lt); + if (r == 0) { + TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn); + toku_lock_request_set(lock_request, db, txn_anc_id, left_key, right_key, lock_type); + r = toku_lock_request_start(lock_request, db->i->lt, true); + } + return r; +} + +int +get_point_write_lock(DB *db, DB_TXN *txn, const DBT *key) { + int r = get_range_lock(db, txn, key, key, LOCK_REQUEST_WRITE); + return r; +} + +// acquire a point write lock on the key for a given txn. +// this does not block the calling thread. +int +toku_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn) { + DB_TXN *txn = toku_txn_get_container_db_txn(tokutxn); + DB_TXN *txn_anc = toku_txn_ancestor(txn); + int r = toku_txn_add_lt(txn_anc, db->i->lt); + if (r == 0) { + TXNID txn_anc_id = toku_txn_get_txnid(db_txn_struct_i(txn_anc)->tokutxn); + r = toku_lt_acquire_write_lock(db->i->lt, db, txn_anc_id, key); + } + return r; +} + + diff --git a/src/ydb_row_lock.h b/src/ydb_row_lock.h new file mode 100755 index 00000000000..004983251ef --- /dev/null +++ b/src/ydb_row_lock.h @@ -0,0 +1,27 @@ +// This file defines the public interface to the ydb library + +#if !defined(TOKU_YDB_ROWLOCK_H) +#define TOKU_YDB_ROWLOCK_H + +#if defined(__cplusplus) +extern "C" { +#endif + +int +get_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type); + +int +start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT *right_key, toku_lock_type lock_type, toku_lock_request *lock_request); + +int +get_point_write_lock(DB *db, DB_TXN *txn, const DBT *key); + +int +toku_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn); + + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/ydb_write.c b/src/ydb_write.c new file mode 
100644 index 00000000000..f660014b182 --- /dev/null +++ b/src/ydb_write.c @@ -0,0 +1,865 @@ +/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +#ident "Copyright (c) 2007-2009 Tokutek Inc. All rights reserved." + +#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "toku_assert.h" +#include "ydb.h" +#include "ydb-internal.h" +#include "brt-internal.h" +#include "brt-flusher.h" +#include "cachetable.h" +#include "log.h" +#include "memory.h" +#include "dlmalloc.h" +#include "checkpoint.h" +#include "key.h" +#include "loader.h" +#include "indexer.h" +#include "ydb_load.h" +#include "brtloader.h" +#include "log_header.h" +#include "ydb_cursor.h" +#include "ydb_row_lock.h" +#include "ydb_env_func.h" +#include "ydb_write.h" +#include "ydb_db.h" + +static YDB_WRITE_LAYER_STATUS_S ydb_write_layer_status; +#ifdef STATUS_VALUE +#undef STATUS_VALUE +#endif +#define STATUS_VALUE(x) ydb_write_layer_status.status[x].value.num + +#define STATUS_INIT(k,t,l) { \ + ydb_write_layer_status.status[k].keyname = #k; \ + ydb_write_layer_status.status[k].type = t; \ + ydb_write_layer_status.status[k].legend = l; \ + } + +static void +ydb_write_layer_status_init (void) { + // Note, this function initializes the keyname, type, and legend fields. + // Value fields are initialized to zero by compiler. 
+ STATUS_INIT(YDB_LAYER_NUM_INSERTS, UINT64, "dictionary inserts"); + STATUS_INIT(YDB_LAYER_NUM_INSERTS_FAIL, UINT64, "dictionary inserts fail"); + STATUS_INIT(YDB_LAYER_NUM_DELETES, UINT64, "dictionary deletes"); + STATUS_INIT(YDB_LAYER_NUM_DELETES_FAIL, UINT64, "dictionary deletes fail"); + STATUS_INIT(YDB_LAYER_NUM_UPDATES, UINT64, "dictionary updates"); + STATUS_INIT(YDB_LAYER_NUM_UPDATES_FAIL, UINT64, "dictionary updates fail"); + STATUS_INIT(YDB_LAYER_NUM_UPDATES_BROADCAST, UINT64, "dictionary broadcast updates"); + STATUS_INIT(YDB_LAYER_NUM_UPDATES_BROADCAST_FAIL, UINT64, "dictionary broadcast updates fail"); + STATUS_INIT(YDB_LAYER_NUM_MULTI_INSERTS, UINT64, "dictionary multi inserts"); + STATUS_INIT(YDB_LAYER_NUM_MULTI_INSERTS_FAIL, UINT64, "dictionary multi inserts fail"); + STATUS_INIT(YDB_LAYER_NUM_MULTI_DELETES, UINT64, "dictionary multi deletes"); + STATUS_INIT(YDB_LAYER_NUM_MULTI_DELETES_FAIL, UINT64, "dictionary multi deletes fail"); + STATUS_INIT(YDB_LAYER_NUM_MULTI_UPDATES, UINT64, "dictionary updates multi"); + STATUS_INIT(YDB_LAYER_NUM_MULTI_UPDATES_FAIL, UINT64, "dictionary updates multi fail"); + ydb_write_layer_status.initialized = true; +} +#undef STATUS_INIT + +void +ydb_write_layer_get_status(YDB_WRITE_LAYER_STATUS statp) { + if (!ydb_write_layer_status.initialized) + ydb_write_layer_status_init(); + *statp = ydb_write_layer_status; +} + + +static inline u_int32_t +get_prelocked_flags(u_int32_t flags) { + u_int32_t lock_flags = flags & (DB_PRELOCKED | DB_PRELOCKED_WRITE); + return lock_flags; +} + +// these next two static functions are defined +// both here and ydb.c. We should find a good +// place for them. +static int +ydb_getf_do_nothing(DBT const* UU(key), DBT const* UU(val), void* UU(extra)) { + return 0; +} + +// Check if the available file system space is less than the reserve +// Returns ENOSPC if not enough space, othersize 0 +static inline int +env_check_avail_fs_space(DB_ENV *env) { + int r = env->i->fs_state == FS_RED ? 
ENOSPC : 0; + if (r) env->i->enospc_redzone_ctr++; + return r; +} + +// Return 0 if proposed pair do not violate size constraints of DB +// (insertion is legal) +// Return non zero otherwise. +static int +db_put_check_size_constraints(DB *db, const DBT *key, const DBT *val) { + int r = 0; + unsigned int klimit, vlimit; + + toku_brt_get_maximum_advised_key_value_lengths(&klimit, &vlimit); + if (key->size > klimit) { + r = toku_ydb_do_error(db->dbenv, EINVAL, + "The largest key allowed is %u bytes", klimit); + } else if (val->size > vlimit) { + r = toku_ydb_do_error(db->dbenv, EINVAL, + "The largest value allowed is %u bytes", vlimit); + } + return r; +} + +//Return 0 if insert is legal +static int +db_put_check_overwrite_constraint(DB *db, DB_TXN *txn, DBT *key, + u_int32_t lock_flags, u_int32_t overwrite_flag) { + int r; + + if (overwrite_flag == 0) { // 0 (yesoverwrite) does not impose constraints. + r = 0; + } else if (overwrite_flag == DB_NOOVERWRITE) { + // Check if (key,anything) exists in dictionary. + // If exists, fail. Otherwise, do insert. + // The DB_RMW flag causes the cursor to grab a write lock instead of a read lock on the key if it exists. + r = db_getf_set(db, txn, lock_flags|DB_SERIALIZABLE|DB_RMW, key, ydb_getf_do_nothing, NULL); + if (r == DB_NOTFOUND) + r = 0; + else if (r == 0) + r = DB_KEYEXIST; + //Any other error is passed through. + } else if (overwrite_flag == DB_NOOVERWRITE_NO_ERROR) { + r = 0; + } else { + //Other flags are not (yet) supported. + r = EINVAL; + } + return r; +} + + +int +toku_db_del(DB *db, DB_TXN *txn, DBT *key, u_int32_t flags, BOOL holds_ydb_lock) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + + u_int32_t unchecked_flags = flags; + //DB_DELETE_ANY means delete regardless of whether it exists in the db. 
+ BOOL error_if_missing = (BOOL)(!(flags&DB_DELETE_ANY)); + unchecked_flags &= ~DB_DELETE_ANY; + u_int32_t lock_flags = get_prelocked_flags(flags); + unchecked_flags &= ~lock_flags; + BOOL do_locking = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); + + int r = 0; + if (unchecked_flags!=0) { + r = EINVAL; + } + + if (r == 0 && error_if_missing) { + //Check if the key exists in the db. + r = db_getf_set(db, txn, lock_flags|DB_SERIALIZABLE|DB_RMW, key, ydb_getf_do_nothing, NULL); + } + if (r == 0 && do_locking) { + //Do locking if necessary. + r = get_point_write_lock(db, txn, key); + } + if (r == 0) { + //Do the actual deleting. + if (!holds_ydb_lock) toku_ydb_lock(); + r = toku_brt_delete(db->i->brt, key, txn ? db_txn_struct_i(txn)->tokutxn : 0); + if (!holds_ydb_lock) toku_ydb_unlock(); + } + + if (r == 0) { + STATUS_VALUE(YDB_LAYER_NUM_DELETES)++; // accountability + } + else { + STATUS_VALUE(YDB_LAYER_NUM_DELETES_FAIL)++; // accountability + } + return r; +} + + +int +toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags, BOOL holds_ydb_lock) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + int r = 0; + + u_int32_t lock_flags = get_prelocked_flags(flags); + flags &= ~lock_flags; + + r = db_put_check_size_constraints(db, key, val); + if (r == 0) { + //Do any checking required by the flags. + r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags); + } + BOOL do_locking = (BOOL)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); + if (r == 0 && do_locking) { + //Do locking if necessary. + r = get_point_write_lock(db, txn, key); + } + if (r == 0) { + //Insert into the brt. + TOKUTXN ttxn = txn ? 
db_txn_struct_i(txn)->tokutxn : NULL; + enum brt_msg_type type = BRT_INSERT; + if (flags==DB_NOOVERWRITE_NO_ERROR) { + type = BRT_INSERT_NO_OVERWRITE; + } + if (!holds_ydb_lock) toku_ydb_lock(); + r = toku_brt_maybe_insert(db->i->brt, key, val, ttxn, FALSE, ZERO_LSN, TRUE, type); + if (!holds_ydb_lock) toku_ydb_unlock(); + } + + if (r == 0) { + // helgrind flags a race on this status update. we increment it atomically to satisfy helgrind. + // STATUS_VALUE(YDB_LAYER_NUM_INSERTS)++; // accountability + (void) __sync_fetch_and_add(&STATUS_VALUE(YDB_LAYER_NUM_INSERTS), 1); + } else { + // STATUS_VALUE(YDB_LAYER_NUM_INSERTS_FAIL)++; // accountability + (void) __sync_fetch_and_add(&STATUS_VALUE(YDB_LAYER_NUM_INSERTS_FAIL), 1); + } + + return r; +} + +static int +toku_db_update(DB *db, DB_TXN *txn, + const DBT *key, + const DBT *update_function_extra, + u_int32_t flags) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + int r = 0; + + u_int32_t lock_flags = get_prelocked_flags(flags); + flags &= ~lock_flags; + + r = db_put_check_size_constraints(db, key, update_function_extra); + if (r != 0) { goto cleanup; } + + BOOL do_locking = (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)); + if (do_locking) { + r = get_point_write_lock(db, txn, key); + if (r != 0) { goto cleanup; } + } + + TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL; + toku_ydb_lock(); + r = toku_brt_maybe_update(db->i->brt, key, update_function_extra, ttxn, + FALSE, ZERO_LSN, TRUE); + toku_ydb_unlock(); + +cleanup: + if (r == 0) + STATUS_VALUE(YDB_LAYER_NUM_UPDATES)++; // accountability + else + STATUS_VALUE(YDB_LAYER_NUM_UPDATES_FAIL)++; // accountability + return r; +} + + +// DB_IS_RESETTING_OP is true if the dictionary should be considered as if created by this transaction. 
+// For example, it will be true if toku_db_update_broadcast() is used to implement a schema change (such +// as adding a column), and will be false if used simply to update all the rows of a table (such as +// incrementing a field). +static int +toku_db_update_broadcast(DB *db, DB_TXN *txn, + const DBT *update_function_extra, + u_int32_t flags) { + HANDLE_PANICKED_DB(db); + HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn); + int r = 0; + + u_int32_t lock_flags = get_prelocked_flags(flags); + flags &= ~lock_flags; + u_int32_t is_resetting_op_flag = flags & DB_IS_RESETTING_OP; + flags &= is_resetting_op_flag; + BOOL is_resetting_op = (is_resetting_op_flag != 0); + + + if (is_resetting_op) { + if (txn->parent != NULL) { + r = EINVAL; // cannot have a parent if you are a resetting op + goto cleanup; + } + r = toku_db_pre_acquire_fileops_lock(db, txn); + if (r != 0) { goto cleanup; } + } + { + DBT null_key; + toku_init_dbt(&null_key); + r = db_put_check_size_constraints(db, &null_key, update_function_extra); + if (r != 0) { goto cleanup; } + } + + BOOL do_locking = (db->i->lt && !(lock_flags & DB_PRELOCKED_WRITE)); + if (do_locking) { + r = toku_db_pre_acquire_table_lock(db, txn, TRUE); + if (r != 0) { goto cleanup; } + } + + TOKUTXN ttxn = txn ? 
db_txn_struct_i(txn)->tokutxn : NULL; + toku_ydb_lock(); + r = toku_brt_maybe_update_broadcast(db->i->brt, update_function_extra, ttxn, + FALSE, ZERO_LSN, TRUE, is_resetting_op); + toku_ydb_unlock(); + +cleanup: + if (r == 0) + STATUS_VALUE(YDB_LAYER_NUM_UPDATES_BROADCAST)++; // accountability + else + STATUS_VALUE(YDB_LAYER_NUM_UPDATES_BROADCAST_FAIL)++; // accountability + return r; +} + +static int +log_del_single(DB_TXN *txn, BRT brt, const DBT *key) { + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + int r = toku_brt_log_del(ttxn, brt, key); + return r; +} + +static uint32_t +sum_size(uint32_t num_keys, DBT keys[], uint32_t overhead) { + uint32_t sum = 0; + for (uint32_t i = 0; i < num_keys; i++) + sum += keys[i].size + overhead; + return sum; +} + +static int +log_del_multiple(DB_TXN *txn, DB *src_db, const DBT *key, const DBT *val, uint32_t num_dbs, BRT brts[], DBT keys[]) { + int r = 0; + if (num_dbs > 0) { + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + BRT src_brt = src_db ? 
src_db->i->brt : NULL; + uint32_t del_multiple_size = key->size + val->size + num_dbs*sizeof (uint32_t) + toku_log_enq_delete_multiple_overhead; + uint32_t del_single_sizes = sum_size(num_dbs, keys, toku_log_enq_delete_any_overhead); + if (del_single_sizes < del_multiple_size) { + for (uint32_t i = 0; r == 0 && i < num_dbs; i++) + r = log_del_single(txn, brts[i], &keys[i]); + } else { + r = toku_brt_log_del_multiple(ttxn, src_brt, brts, num_dbs, key, val); + } + } + return r; +} + +static uint32_t +lookup_src_db(uint32_t num_dbs, DB *db_array[], DB *src_db) { + uint32_t which_db; + for (which_db = 0; which_db < num_dbs; which_db++) + if (db_array[which_db] == src_db) + break; + return which_db; +} + +static int +do_del_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT keys[], DB *src_db, const DBT *src_key) { + src_db = src_db; src_key = src_key; + int r = 0; + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + for (uint32_t which_db = 0; r == 0 && which_db < num_dbs; which_db++) { + DB *db = db_array[which_db]; + + // if db is being indexed by an indexer, then insert a delete message into the db if the src key is to the left or equal to the + // indexers cursor. we have to get the src_db from the indexer and find it in the db_array. 
+ int do_delete = TRUE; + DB_INDEXER *indexer = toku_db_get_indexer(db); + if (indexer) { // if this db is the index under construction + DB *indexer_src_db = toku_indexer_get_src_db(indexer); + invariant(indexer_src_db != NULL); + const DBT *indexer_src_key; + if (src_db == indexer_src_db) + indexer_src_key = src_key; + else { + uint32_t which_src_db = lookup_src_db(num_dbs, db_array, indexer_src_db); + invariant(which_src_db < num_dbs); + indexer_src_key = &keys[which_src_db]; + } + do_delete = !toku_indexer_is_key_right_of_le_cursor(indexer, indexer_src_db, indexer_src_key); + } + if (r == 0 && do_delete) { + r = toku_brt_maybe_delete(db->i->brt, &keys[which_db], ttxn, FALSE, ZERO_LSN, FALSE); + } + } + return r; +} + +int +env_del_multiple( + DB_ENV *env, + DB *src_db, + DB_TXN *txn, + const DBT *src_key, + const DBT *src_val, + uint32_t num_dbs, + DB **db_array, + DBT *keys, + uint32_t *flags_array) +{ + int r; + DBT del_keys[num_dbs]; + + HANDLE_PANICKED_ENV(env); + + if (!txn) { + r = EINVAL; + goto cleanup; + } + if (!env->i->generate_row_for_del) { + r = EINVAL; + goto cleanup; + } + + HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn); + + { + uint32_t lock_flags[num_dbs]; + uint32_t remaining_flags[num_dbs]; + BRT brts[num_dbs]; + + for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { + DB *db = db_array[which_db]; + lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]); + remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db]; + + if (db == src_db) { + del_keys[which_db] = *src_key; + } + else { + //Generate the key + r = env->i->generate_row_for_del(db, src_db, &keys[which_db], src_key, src_val); + if (r != 0) goto cleanup; + del_keys[which_db] = keys[which_db]; + } + + if (remaining_flags[which_db] & ~DB_DELETE_ANY) { + r = EINVAL; + goto cleanup; + } + BOOL error_if_missing = (BOOL)(!(remaining_flags[which_db]&DB_DELETE_ANY)); + if (error_if_missing) { + //Check if the key exists in the db. 
+ r = db_getf_set(db, txn, lock_flags[which_db]|DB_SERIALIZABLE|DB_RMW, &del_keys[which_db], ydb_getf_do_nothing, NULL); + if (r != 0) goto cleanup; + } + + //Do locking if necessary. + if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { + //Needs locking + r = get_point_write_lock(db, txn, &del_keys[which_db]); + if (r != 0) goto cleanup; + } + brts[which_db] = db->i->brt; + } + + toku_ydb_lock(); + if (num_dbs == 1) { + r = log_del_single(txn, brts[0], &del_keys[0]); + } + else { + r = log_del_multiple(txn, src_db, src_key, src_val, num_dbs, brts, del_keys); + } + if (r == 0) + r = do_del_multiple(txn, num_dbs, db_array, del_keys, src_db, src_key); + } + toku_ydb_unlock(); + +cleanup: + if (r == 0) + STATUS_VALUE(YDB_LAYER_NUM_MULTI_DELETES) += num_dbs; // accountability + else + STATUS_VALUE(YDB_LAYER_NUM_MULTI_DELETES_FAIL) += num_dbs; // accountability + return r; +} + +static int +log_put_single(DB_TXN *txn, BRT brt, const DBT *key, const DBT *val) { + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + int r = toku_brt_log_put(ttxn, brt, key, val); + return r; +} + +static int +log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, BRT brts[]) { + int r = 0; + if (num_dbs > 0) { + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + BRT src_brt = src_db ? src_db->i->brt : NULL; + r = toku_brt_log_put_multiple(ttxn, src_brt, brts, num_dbs, src_key, src_val); + } + return r; +} + +static int +do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT keys[], DBT vals[], DB *src_db, const DBT *src_key) { + int r = 0; + TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; + for (uint32_t which_db = 0; r == 0 && which_db < num_dbs; which_db++) { + DB *db = db_array[which_db]; + + // if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the + // indexers cursor. we have to get the src_db from the indexer and find it in the db_array. 
+ int do_put = TRUE; + DB_INDEXER *indexer = toku_db_get_indexer(db); + if (indexer) { // if this db is the index under construction + DB *indexer_src_db = toku_indexer_get_src_db(indexer); + invariant(indexer_src_db != NULL); + const DBT *indexer_src_key; + if (src_db == indexer_src_db) + indexer_src_key = src_key; + else { + uint32_t which_src_db = lookup_src_db(num_dbs, db_array, indexer_src_db); + invariant(which_src_db < num_dbs); + indexer_src_key = &keys[which_src_db]; + } + do_put = !toku_indexer_is_key_right_of_le_cursor(indexer, indexer_src_db, indexer_src_key); + } + if (r == 0 && do_put) { + r = toku_brt_maybe_insert(db->i->brt, &keys[which_db], &vals[which_db], ttxn, FALSE, ZERO_LSN, FALSE, BRT_INSERT); + } + } + return r; +} + +static int +env_put_multiple_internal( + DB_ENV *env, + DB *src_db, + DB_TXN *txn, + const DBT *src_key, + const DBT *src_val, + uint32_t num_dbs, + DB **db_array, + DBT *keys, + DBT *vals, + uint32_t *flags_array) +{ + int r; + DBT put_keys[num_dbs]; + DBT put_vals[num_dbs]; + + HANDLE_PANICKED_ENV(env); + + uint32_t lock_flags[num_dbs]; + uint32_t remaining_flags[num_dbs]; + BRT brts[num_dbs]; + + if (!txn || !num_dbs) { + r = EINVAL; + goto cleanup; + } + if (!env->i->generate_row_for_put) { + r = EINVAL; + goto cleanup; + } + + HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn); + + for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { + DB *db = db_array[which_db]; + + lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]); + remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db]; + + //Generate the row + if (db == src_db) { + put_keys[which_db] = *src_key; + put_vals[which_db] = *src_val; + } + else { + r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], src_key, src_val); + if (r != 0) goto cleanup; + put_keys[which_db] = keys[which_db]; + put_vals[which_db] = vals[which_db]; + } + + // check size constraints + r = db_put_check_size_constraints(db, 
&put_keys[which_db], &put_vals[which_db]); + if (r != 0) goto cleanup; + + //Check overwrite constraints + r = db_put_check_overwrite_constraint(db, txn, + &put_keys[which_db], + lock_flags[which_db], remaining_flags[which_db]); + if (r != 0) goto cleanup; + if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { + //put_multiple does not support delaying the no error, since we would + //have to log the flag in the put_multiple. + r = EINVAL; goto cleanup; + } + + //Do locking if necessary. + if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { + //Needs locking + r = get_point_write_lock(db, txn, &put_keys[which_db]); + if (r != 0) goto cleanup; + } + brts[which_db] = db->i->brt; + } + + toku_ydb_lock(); + if (num_dbs == 1) { + r = log_put_single(txn, brts[0], &put_keys[0], &put_vals[0]); + } + else { + r = log_put_multiple(txn, src_db, src_key, src_val, num_dbs, brts); + } + if (r == 0) { + r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key); + } + toku_ydb_unlock(); + +cleanup: + if (r == 0) + STATUS_VALUE(YDB_LAYER_NUM_MULTI_INSERTS) += num_dbs; // accountability + else + STATUS_VALUE(YDB_LAYER_NUM_MULTI_INSERTS_FAIL) += num_dbs; // accountability + return r; +} + +int +env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, + DBT *old_src_key, DBT *old_src_data, + DBT *new_src_key, DBT *new_src_data, + uint32_t num_dbs, DB **db_array, uint32_t* flags_array, + uint32_t num_keys, DBT keys[], + uint32_t num_vals, DBT vals[]) { + int r = 0; + + HANDLE_PANICKED_ENV(env); + + if (!txn) { + r = EINVAL; + goto cleanup; + } + if (!env->i->generate_row_for_put) { + r = EINVAL; + goto cleanup; + } + + HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn); + + { + uint32_t n_del_dbs = 0; + DB *del_dbs[num_dbs]; + BRT del_brts[num_dbs]; + DBT del_keys[num_dbs]; + + uint32_t n_put_dbs = 0; + DB *put_dbs[num_dbs]; + BRT put_brts[num_dbs]; + DBT put_keys[num_dbs]; + DBT put_vals[num_dbs]; + + uint32_t lock_flags[num_dbs]; + uint32_t 
remaining_flags[num_dbs]; + + for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { + DB *db = db_array[which_db]; + DBT curr_old_key, curr_new_key, curr_new_val; + + lock_flags[which_db] = get_prelocked_flags(flags_array[which_db]); + remaining_flags[which_db] = flags_array[which_db] & ~lock_flags[which_db]; + + // keys[0..num_dbs-1] are the new keys + // keys[num_dbs..2*num_dbs-1] are the old keys + // vals[0..num_dbs-1] are the new vals + + // Generate the old key and val + if (which_db + num_dbs >= num_keys) { + r = ENOMEM; goto cleanup; + } + if (db == src_db) { + curr_old_key = *old_src_key; + } + else { + r = env->i->generate_row_for_put(db, src_db, &keys[which_db + num_dbs], NULL, old_src_key, old_src_data); + if (r != 0) goto cleanup; + curr_old_key = keys[which_db + num_dbs]; + } + // Generate the new key and val + if (which_db >= num_keys || which_db >= num_vals) { + r = ENOMEM; goto cleanup; + } + if (db == src_db) { + curr_new_key = *new_src_key; + curr_new_val = *new_src_data; + } + else { + r = env->i->generate_row_for_put(db, src_db, &keys[which_db], &vals[which_db], new_src_key, new_src_data); + if (r != 0) goto cleanup; + curr_new_key = keys[which_db]; + curr_new_val = vals[which_db]; + } + toku_dbt_cmp cmpfun = toku_db_get_compare_fun(db); + BOOL key_eq = cmpfun(db, &curr_old_key, &curr_new_key) == 0; + if (!key_eq) { + //Check overwrite constraints only in the case where + // the keys are not equal. + // If the keys are equal, then we do not care of the flag is DB_NOOVERWRITE or 0 + r = db_put_check_overwrite_constraint(db, txn, + &curr_new_key, + lock_flags[which_db], remaining_flags[which_db]); + if (r != 0) goto cleanup; + if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { + //update_multiple does not support delaying the no error, since we would + //have to log the flag in the put_multiple. 
+ r = EINVAL; goto cleanup; + } + + // lock old key + if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) { + r = get_point_write_lock(db, txn, &curr_old_key); + if (r != 0) goto cleanup; + } + del_dbs[n_del_dbs] = db; + del_brts[n_del_dbs] = db->i->brt; + del_keys[n_del_dbs] = curr_old_key; + n_del_dbs++; + + } + + // we take a shortcut and avoid generating the old val + // we assume that any new vals with size > 0 are different than the old val + // if (!key_eq || !(dbt_cmp(&vals[which_db], &vals[which_db + num_dbs]) == 0)) { + if (!key_eq || curr_new_val.size > 0) { + r = db_put_check_size_constraints(db, &curr_new_key, &curr_new_val); + if (r != 0) goto cleanup; + + // lock new key + if (db->i->lt) { + r = get_point_write_lock(db, txn, &curr_new_key); + if (r != 0) goto cleanup; + } + put_dbs[n_put_dbs] = db; + put_brts[n_put_dbs] = db->i->brt; + put_keys[n_put_dbs] = curr_new_key; + put_vals[n_put_dbs] = curr_new_val; + n_put_dbs++; + } + } + // grab the ydb lock for the actual work that + // depends on it + toku_ydb_lock(); + if (r == 0 && n_del_dbs > 0) { + if (n_del_dbs == 1) + r = log_del_single(txn, del_brts[0], &del_keys[0]); + else + r = log_del_multiple(txn, src_db, old_src_key, old_src_data, n_del_dbs, del_brts, del_keys); + if (r == 0) + r = do_del_multiple(txn, n_del_dbs, del_dbs, del_keys, src_db, old_src_key); + } + + if (r == 0 && n_put_dbs > 0) { + if (n_put_dbs == 1) + r = log_put_single(txn, put_brts[0], &put_keys[0], &put_vals[0]); + else + r = log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_brts); + if (r == 0) + r = do_put_multiple(txn, n_put_dbs, put_dbs, put_keys, put_vals, src_db, new_src_key); + } + toku_ydb_unlock(); + } + +cleanup: + if (r == 0) + STATUS_VALUE(YDB_LAYER_NUM_MULTI_UPDATES) += num_dbs; // accountability + else + STATUS_VALUE(YDB_LAYER_NUM_MULTI_UPDATES_FAIL) += num_dbs; // accountability + return r; +} + +int +autotxn_db_del(DB* db, DB_TXN* txn, DBT* key, u_int32_t flags) { + BOOL 
changed; int r; + r = toku_db_construct_autotxn(db, &txn, &changed, FALSE, FALSE); + if (r!=0) return r; + r = toku_db_del(db, txn, key, flags, FALSE); + return toku_db_destruct_autotxn(txn, r, changed, FALSE); +} + +int +autotxn_db_put(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags) { + //{ unsigned i; printf("put %p keylen=%d key={", db, key->size); for(i=0; isize; i++) printf("%d,", ((char*)key->data)[i]); printf("} datalen=%d data={", data->size); for(i=0; isize; i++) printf("%d,", ((char*)data->data)[i]); printf("}\n"); } + BOOL changed; int r; + r = env_check_avail_fs_space(db->dbenv); + if (r != 0) { goto cleanup; } + r = toku_db_construct_autotxn(db, &txn, &changed, FALSE, FALSE); + if (r!=0) { + goto cleanup; + } + r = toku_db_put(db, txn, key, data, flags, FALSE); + r = toku_db_destruct_autotxn(txn, r, changed, FALSE); +cleanup: + return r; +} + +int +autotxn_db_update(DB *db, DB_TXN *txn, + const DBT *key, + const DBT *update_function_extra, + u_int32_t flags) { + BOOL changed; int r; + r = env_check_avail_fs_space(db->dbenv); + if (r != 0) { goto cleanup; } + r = toku_db_construct_autotxn(db, &txn, &changed, FALSE, FALSE); + if (r != 0) { return r; } + r = toku_db_update(db, txn, key, update_function_extra, flags); + r = toku_db_destruct_autotxn(txn, r, changed, FALSE); +cleanup: + return r; +} + +int +autotxn_db_update_broadcast(DB *db, DB_TXN *txn, + const DBT *update_function_extra, + u_int32_t flags) { + BOOL changed; int r; + r = env_check_avail_fs_space(db->dbenv); + if (r != 0) { goto cleanup; } + r = toku_db_construct_autotxn(db, &txn, &changed, FALSE, FALSE); + if (r != 0) { return r; } + r = toku_db_update_broadcast(db, txn, update_function_extra, flags); + r = toku_db_destruct_autotxn(txn, r, changed, FALSE); +cleanup: + return r; +} + +int +env_put_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, const DBT *src_key, const DBT *src_val, uint32_t num_dbs, DB **db_array, DBT *keys, DBT *vals, uint32_t *flags_array) { + int r = 
env_check_avail_fs_space(env); + if (r == 0) { + r = env_put_multiple_internal(env, src_db, txn, src_key, src_val, num_dbs, db_array, keys, vals, flags_array); + } + return r; +} + +int +toku_ydb_check_avail_fs_space(DB_ENV *env) { + int rval = env_check_avail_fs_space(env); + return rval; +} +#undef STATUS_VALUE + +#include +void __attribute__((constructor)) toku_ydb_write_drd_ignore(void); +void +toku_ydb_write_drd_ignore(void) { + DRD_IGNORE_VAR(ydb_write_layer_status); +} diff --git a/src/ydb_write.h b/src/ydb_write.h new file mode 100644 index 00000000000..ca0d9862f26 --- /dev/null +++ b/src/ydb_write.h @@ -0,0 +1,82 @@ +// This file defines the public interface to the ydb library + +#if !defined(TOKU_YDB_WRITE_H) +#define TOKU_YDB_WRITE_H + +#if defined(__cplusplus) +extern "C" { +#endif + +typedef enum { + YDB_LAYER_NUM_INSERTS = 0, + YDB_LAYER_NUM_INSERTS_FAIL, + YDB_LAYER_NUM_DELETES, + YDB_LAYER_NUM_DELETES_FAIL, + YDB_LAYER_NUM_UPDATES, + YDB_LAYER_NUM_UPDATES_FAIL, + YDB_LAYER_NUM_UPDATES_BROADCAST, + YDB_LAYER_NUM_UPDATES_BROADCAST_FAIL, + YDB_LAYER_NUM_MULTI_INSERTS, + YDB_LAYER_NUM_MULTI_INSERTS_FAIL, + YDB_LAYER_NUM_MULTI_DELETES, + YDB_LAYER_NUM_MULTI_DELETES_FAIL, + YDB_LAYER_NUM_MULTI_UPDATES, + YDB_LAYER_NUM_MULTI_UPDATES_FAIL, + YDB_WRITE_LAYER_STATUS_NUM_ROWS /* number of rows in this status array */ +} ydb_write_lock_layer_status_entry; + +typedef struct { + BOOL initialized; + TOKU_ENGINE_STATUS_ROW_S status[YDB_WRITE_LAYER_STATUS_NUM_ROWS]; +} YDB_WRITE_LAYER_STATUS_S, *YDB_WRITE_LAYER_STATUS; + +void ydb_write_layer_get_status(YDB_WRITE_LAYER_STATUS statp); + + +int toku_db_del(DB *db, DB_TXN *txn, DBT *key, u_int32_t flags, BOOL holds_ydb_lock); +int toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags, BOOL holds_ydb_lock); +int autotxn_db_del(DB* db, DB_TXN* txn, DBT* key, u_int32_t flags); +int autotxn_db_put(DB* db, DB_TXN* txn, DBT* key, DBT* data, u_int32_t flags); +int autotxn_db_update(DB *db, DB_TXN *txn, const 
DBT *key, const DBT *update_function_extra, u_int32_t flags); +int autotxn_db_update_broadcast(DB *db, DB_TXN *txn, const DBT *update_function_extra, u_int32_t flags); +int env_put_multiple( + DB_ENV *env, + DB *src_db, + DB_TXN *txn, + const DBT *src_key, const DBT *src_val, + uint32_t num_dbs, + DB **db_array, + DBT *keys, DBT *vals, + uint32_t *flags_array + ); +int env_del_multiple( + DB_ENV *env, + DB *src_db, + DB_TXN *txn, + const DBT *src_key, + const DBT *src_val, + uint32_t num_dbs, + DB **db_array, + DBT *keys, + uint32_t *flags_array + ); +int env_update_multiple( + DB_ENV *env, + DB *src_db, + DB_TXN *txn, + DBT *old_src_key, DBT *old_src_data, + DBT *new_src_key, DBT *new_src_data, + uint32_t num_dbs, + DB **db_array, + uint32_t* flags_array, + uint32_t num_keys, DBT keys[], + uint32_t num_vals, DBT vals[] + ); + + + +#if defined(__cplusplus) +} +#endif + +#endif