/*
   Copyright (c) 2012, Monty Program Ab

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#ifdef USE_PRAGMA_IMPLEMENTATION
#pragma implementation        // gcc: Class implementation
#endif

#define MYSQL_SERVER 1

#include <my_config.h>

/* The C++ file's header */
#include "./ha_rocksdb.h"

/* C++ standard header files */
#include <set>
#include <queue>
#include <string>
#include <vector>

/* MySQL includes */
#include "./debug_sync.h"
#include "./my_bit.h"
#include "./my_stacktrace.h"
#include "./sql_table.h"
#include <mysys_err.h>
#include <mysql/psi/mysql_table.h>
#ifdef MARIAROCKS_NOT_YET
#include <mysql/thread_pool_priv.h>
#endif

/* RocksDB includes */
#include "rocksdb/compaction_filter.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/utilities/checkpoint.h"
#include "rocksdb/utilities/convenience.h"
#include "rocksdb/utilities/flashcache.h"
#include "rocksdb/utilities/memory_util.h"

/* MyRocks includes */
#include "./event_listener.h"
#include "./ha_rocksdb_proto.h"
#include "./logger.h"
#include "./rdb_cf_manager.h"
#include "./rdb_cf_options.h"
#include "./rdb_datadic.h"
#include "./rdb_i_s.h"
#include "./rdb_index_merge.h"
#include "./rdb_mutex_wrapper.h"
#include "./rdb_threads.h"

#ifdef TARGET_OS_LINUX
extern my_bool cachedev_enabled;
#endif /* TARGET_OS_LINUX */

// Internal MySQL APIs not exposed in any header.
extern "C"
{
/**
  Mark transaction to rollback and mark error as fatal to a sub-statement.
  @param  thd   Thread handle
  @param  all   TRUE <=> rollback main transaction.
*/
void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);

/**
 *   Get the user thread's binary logging format
 *   @param thd  user thread
 *   @return Value to be used as index into the binlog_format_names array
*/
int thd_binlog_format(const MYSQL_THD thd);

/**
 *   Check if binary logging is filtered for thread's current db.
 *   @param  thd   Thread handle
 *   @retval 1 the query is not filtered, 0 otherwise.
*/
bool thd_binlog_filter_ok(const MYSQL_THD thd);
}

namespace myrocks {

static st_global_stats global_stats;
static st_export_stats export_stats;

/**
  Updates row counters based on the table type and operation type.
*/
void ha_rocksdb::update_row_stats(operation_type type) {
  DBUG_ASSERT(type < ROWS_MAX);
  // Find if we are modifying system databases.
  if (table->s && m_tbl_def->m_is_mysql_system_table)
    global_stats.system_rows[type].inc();
  else
    global_stats.rows[type].inc();
}

void dbug_dump_database(rocksdb::DB *db);
static handler *rocksdb_create_handler(my_core::handlerton *hton,
                                       my_core::TABLE_SHARE *table_arg,
                                       my_core::MEM_ROOT *mem_root);

bool can_use_bloom_filter(THD *thd,
                          const std::shared_ptr<const Rdb_key_def>& kd,
                          const rocksdb::Slice &eq_cond,
                          const bool use_all_keys,
                          bool is_ascending);

///////////////////////////////////////////////////////////
// Parameters and settings
///////////////////////////////////////////////////////////
static char * rocksdb_default_cf_options;
static char * rocksdb_override_cf_options;
Rdb_cf_options rocksdb_cf_options_map;

///////////////////////////////////////////////////////////
// Globals
///////////////////////////////////////////////////////////
handlerton *rocksdb_hton;

rocksdb::TransactionDB *rdb= nullptr;

static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
static std::unique_ptr<rocksdb::Env> flashcache_aware_env;
static std::shared_ptr<Rdb_tbl_prop_coll_factory>
  properties_collector_factory;

Rdb_dict_manager dict_manager;
Rdb_cf_manager cf_manager;
Rdb_ddl_manager ddl_manager;
Rdb_binlog_manager binlog_manager;


/**
  MyRocks background thread control
  N.B. This is besides RocksDB's own background threads
       (@see rocksdb::CancelAllBackgroundWork())
*/

static Rdb_background_thread rdb_bg_thread;


// List of table names (using regex) that are exceptions to the strict
// collation check requirement.
Regex_list_handler *rdb_collation_exceptions;

static const char* const ERRSTR_ROLLBACK_ONLY
  = "This transaction was rolled back and cannot be "
    "committed. Only supported operation is to roll it back, "
    "so all pending changes will be discarded. "
    "Please restart another transaction.";


static void
rocksdb_flush_all_memtables()
{
  Rdb_cf_manager& cf_manager= rdb_get_cf_manager();
  for (auto cf_handle : cf_manager.get_all_cf()) {
    rdb->Flush(rocksdb::FlushOptions(), cf_handle);
  }
}

static void
rocksdb_compact_column_family_stub(THD* thd,
                                   struct st_mysql_sys_var* var,
                                   void* var_ptr,
                                   const void* save)
{
}

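/*
  Check function for the rocksdb_compact_cf system variable: assigning a
  column family name (e.g. SET GLOBAL rocksdb_compact_cf='default') triggers
  a manual CompactRange() over the whole key range of that column family.
*/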
static int
rocksdb_compact_column_family(THD* thd,
                              struct st_mysql_sys_var* var,
                              void* var_ptr,
                              struct st_mysql_value* value)
{
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  if (const char* cf = value->val_str(value, buff, &len)) {
    bool is_automatic;
    auto cfh = cf_manager.get_cf(cf, "", nullptr, &is_automatic);
    if (cfh != nullptr && rdb != nullptr) {
      sql_print_information("RocksDB: Manual compaction of column family: %s\n", cf);
      rdb->CompactRange(rocksdb::CompactRangeOptions(), cfh, nullptr, nullptr);
    }
  }
  return 0;
}

///////////////////////////////////////////////////////////
// Hash map: table name => open table handler
///////////////////////////////////////////////////////////

namespace  // anonymous namespace = not visible outside this source file
{

struct Rdb_open_tables_map
{
  /* Hash table used to track the handlers of open tables */
  my_core::HASH m_hash;
  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

  void init_hash(void)
  {
    (void) my_hash_init(&m_hash, my_core::system_charset_info, 32, 0, 0,
                        (my_hash_get_key) Rdb_open_tables_map::get_hash_key,
                        0, 0);
  }

  void free_hash(void)
  {
    my_hash_free(&m_hash);
  }

  static uchar* get_hash_key(Rdb_table_handler *table_handler,
                             size_t *length,
                             my_bool not_used __attribute__((__unused__)));

  Rdb_table_handler* get_table_handler(const char *table_name);
  void release_table_handler(Rdb_table_handler *table_handler);

  std::vector<std::string> get_table_names(void) const;
};

}  // anonymous namespace

static Rdb_open_tables_map rdb_open_tables;

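/*
  Strip any trailing '/' characters from a directory path, e.g.
  "/data/checkpoints///" becomes "/data/checkpoints".
*/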
static std::string rdb_normalize_dir(std::string dir)
{
  while (dir.size() > 0 && dir.back() == '/')
  {
    dir.resize(dir.size() - 1);
  }
  return dir;
}

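/*
  Check function for the rocksdb_create_checkpoint system variable.
  Assigning a directory (e.g. SET GLOBAL rocksdb_create_checkpoint='/path')
  creates a RocksDB checkpoint (a consistent on-disk snapshot of the live
  database) in that directory.
*/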
static int rocksdb_create_checkpoint(
    THD* thd __attribute__((__unused__)),
    struct st_mysql_sys_var* var __attribute__((__unused__)),
    void* save __attribute__((__unused__)),
    struct st_mysql_value* value)
{
  char buf[512];
  int len = sizeof(buf);
  const char* checkpoint_dir_raw= value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      std::string checkpoint_dir= rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint* checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        if (status.ok()) {
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
        } else {
          my_printf_error(
              ER_UNKNOWN_ERROR,
              "RocksDB: Failed to create checkpoint directory. status %d %s",
              MYF(0), status.code(), status.ToString().c_str());
        }
        delete checkpoint;
      } else {
        std::string err_text(status.ToString());
        my_printf_error(ER_UNKNOWN_ERROR,
            "RocksDB: failed to initialize checkpoint. status %d %s\n",
            MYF(0), status.code(), err_text.c_str());
      }
      return status.code();
    }
  }
  return HA_ERR_INTERNAL_ERROR;
}

/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only */
static void
rocksdb_create_checkpoint_stub(THD* thd,
                               struct st_mysql_sys_var* var,
                               void* var_ptr,
                               const void* save)
{
}

static void
rocksdb_force_flush_memtable_now_stub(THD* thd,
                                      struct st_mysql_sys_var* var,
                                      void* var_ptr,
                                      const void* save)
{
}

static int
rocksdb_force_flush_memtable_now(THD* thd,
                                 struct st_mysql_sys_var* var,
                                 void* var_ptr,
                                 struct st_mysql_value* value)
{
  sql_print_information("RocksDB: Manual memtable flush\n");
  rocksdb_flush_all_memtables();
  return 0;
}

static void rocksdb_drop_index_wakeup_thread(
    my_core::THD* thd __attribute__((__unused__)),
    struct st_mysql_sys_var* var __attribute__((__unused__)),
    void* var_ptr __attribute__((__unused__)),
    const void* save);

static my_bool rocksdb_pause_background_work= 0;
static mysql_mutex_t rdb_sysvars_mutex;

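/*
  Update handler for rocksdb_pause_background_work: pause or resume RocksDB's
  background compaction/flush work. Guarded by rdb_sysvars_mutex so that
  PauseBackgroundWork()/ContinueBackgroundWork() calls stay balanced.
*/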
static void rocksdb_set_pause_background_work(
    my_core::THD* thd __attribute__((__unused__)),
    struct st_mysql_sys_var* var __attribute__((__unused__)),
    void* var_ptr __attribute__((__unused__)),
    const void* save)
{
  mysql_mutex_lock(&rdb_sysvars_mutex);
  bool pause_requested= *static_cast<const bool*>(save);
  if (rocksdb_pause_background_work != pause_requested) {
    if (pause_requested) {
      rdb->PauseBackgroundWork();
    } else {
      rdb->ContinueBackgroundWork();
    }
    rocksdb_pause_background_work= pause_requested;
  }
  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

static void
rocksdb_set_compaction_options(THD* thd,
                               struct st_mysql_sys_var* var,
                               void* var_ptr,
                               const void* save);

static void
rocksdb_set_table_stats_sampling_pct(THD* thd,
                                     struct st_mysql_sys_var* var,
                                     void* var_ptr,
                                     const void* save);

static void
rocksdb_set_rate_limiter_bytes_per_sec(THD* thd,
                                       struct st_mysql_sys_var* var,
                                       void* var_ptr,
                                       const void* save);

static void rdb_set_collation_exception_list(const char *exception_list);
static void
rocksdb_set_collation_exception_list(THD* thd,
                                     struct st_mysql_sys_var* var,
                                     void* var_ptr,
                                     const void* save);

static void
rocksdb_set_bulk_load(THD* thd,
                      struct st_mysql_sys_var* var __attribute__((__unused__)),
                      void* var_ptr,
                      const void* save);

//////////////////////////////////////////////////////////////////////////////
// Options definitions
//////////////////////////////////////////////////////////////////////////////
static long long rocksdb_block_cache_size;
/* Use unsigned long long instead of uint64_t because of MySQL compatibility */
static unsigned long long  // NOLINT(runtime/int)
    rocksdb_rate_limiter_bytes_per_sec;
static uint64_t rocksdb_info_log_level;
static char * rocksdb_wal_dir;
static uint64_t rocksdb_index_type;
static char rocksdb_background_sync;
static uint32_t rocksdb_debug_optimizer_n_rows;
static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
static uint32_t rocksdb_wal_recovery_mode;
static uint32_t rocksdb_access_hint_on_compaction_start;
static char * rocksdb_compact_cf_name;
static char * rocksdb_checkpoint_name;
static my_bool rocksdb_signal_drop_index_thread;
static my_bool rocksdb_strict_collation_check= 1;
static my_bool rocksdb_disable_2pc= 0;
static char * rocksdb_strict_collation_exceptions;
static my_bool rocksdb_collect_sst_properties= 1;
static my_bool rocksdb_force_flush_memtable_now_var= 0;
static uint64_t rocksdb_number_stat_computes= 0;
static uint32_t rocksdb_seconds_between_stat_computes= 3600;
static long long rocksdb_compaction_sequential_deletes= 0l;
static long long rocksdb_compaction_sequential_deletes_window= 0l;
static long long rocksdb_compaction_sequential_deletes_file_size= 0l;
static uint32_t rocksdb_validate_tables = 1;
static char * rocksdb_datadir;
static uint32_t rocksdb_table_stats_sampling_pct;
static my_bool rocksdb_enable_bulk_load_api= 1;
static my_bool rpl_skip_tx_api_var= 0;

std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);

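/*
  Build the baseline rocksdb::DBOptions used to open the database. These
  defaults are then exposed, and can be overridden, through the rocksdb_*
  server variables defined below.
*/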
static rocksdb::DBOptions rdb_init_rocksdb_db_options(void)
{
  rocksdb::DBOptions o;

  o.create_if_missing= true;
  o.listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
  o.info_log_level= rocksdb::InfoLogLevel::INFO_LEVEL;
  o.max_subcompactions= DEFAULT_SUBCOMPACTIONS;

  return o;
}

static rocksdb::DBOptions rocksdb_db_options= rdb_init_rocksdb_db_options();
static rocksdb::BlockBasedTableOptions rocksdb_tbl_options;

static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;

/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
static const char* info_log_level_names[] = {
  "debug_level",
  "info_level",
  "warn_level",
  "error_level",
  "fatal_level",
  NullS
};

static TYPELIB info_log_level_typelib = {
  array_elements(info_log_level_names) - 1,
  "info_log_level_typelib",
  info_log_level_names,
  nullptr
};

static void
rocksdb_set_rocksdb_info_log_level(THD* thd,
                                   struct st_mysql_sys_var* var,
                                   void* var_ptr,
                                   const void* save)
{
  mysql_mutex_lock(&rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t*>(save);
  rocksdb_db_options.info_log->SetInfoLogLevel(
      static_cast<const rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

static const char* index_type_names[] = {
  "kBinarySearch",
  "kHashSearch",
  NullS
};

static TYPELIB index_type_typelib = {
  array_elements(index_type_names) - 1,
  "index_type_typelib",
  index_type_names,
  nullptr
};

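/*
  The MYSQL_THDVAR_* macros below define session-scoped variables (settable
  per connection, e.g. SET SESSION rocksdb_lock_wait_timeout=2), while the
  MYSQL_SYSVAR_* macros define global variables, most of which map directly
  onto RocksDB DBOptions/BlockBasedTableOptions fields.
*/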
//TODO: 0 means don't wait at all, and we don't support it yet?
static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
  "Number of seconds to wait for lock",
  nullptr, nullptr, /*default*/ 1, /*min*/ 1, /*max*/ 1024*1024*1024, 0);

static MYSQL_THDVAR_BOOL(bulk_load, PLUGIN_VAR_RQCMDARG,
  "Use bulk-load mode for inserts. This enables both "
  "rocksdb_skip_unique_check and rocksdb_commit_in_the_middle.",
  nullptr, rocksdb_set_bulk_load, FALSE);

static MYSQL_SYSVAR_BOOL(enable_bulk_load_api,
  rocksdb_enable_bulk_load_api,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "Enables using SstFileWriter for bulk loading",
  nullptr, nullptr, rocksdb_enable_bulk_load_api);

static MYSQL_THDVAR_STR(skip_unique_check_tables,
  PLUGIN_VAR_RQCMDARG|PLUGIN_VAR_MEMALLOC,
  "Skip unique constraint checking for the specified tables", nullptr, nullptr,
  ".*");

static MYSQL_THDVAR_BOOL(skip_unique_check, PLUGIN_VAR_RQCMDARG,
  "Skip unique constraint checking for all tables", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
  "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
  "update and delete",
  nullptr, nullptr, FALSE);

static MYSQL_THDVAR_STR(read_free_rpl_tables,
  PLUGIN_VAR_RQCMDARG|PLUGIN_VAR_MEMALLOC,
  "List of tables that will use read-free replication on the slave "
  "(i.e. not look up a row during replication)", nullptr, nullptr, "");

static MYSQL_SYSVAR_BOOL(
  rpl_skip_tx_api,
  rpl_skip_tx_api_var,
  PLUGIN_VAR_RQCMDARG,
  "Use write batches for replication thread instead of tx api", nullptr,
  nullptr, FALSE);

static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
  "Skip using bloom filter for reads", nullptr, nullptr, FALSE);

static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
  "Maximum number of locks a transaction can have",
  nullptr, nullptr, /*default*/ 1024*1024*1024, /*min*/ 1,
  /*max*/ 1024*1024*1024, 0);

static MYSQL_THDVAR_BOOL(lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
  "Take and hold locks on rows that are scanned but not updated",
  nullptr, nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
  "Max #records in a batch for bulk-load mode",
  nullptr, nullptr, /*default*/ 1000, /*min*/ 1, /*max*/ 1024*1024*1024, 0);

static MYSQL_THDVAR_ULONGLONG(merge_buf_size, PLUGIN_VAR_RQCMDARG,
  "Size to allocate for merge sort buffers written out to disk "
  "during inplace index creation.",
  nullptr, nullptr,
  /* default (64MB) */ (ulonglong) 67108864,
  /* min (100B) */ 100,
  /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
  "Size that we have to work with during combine (reading from disk) phase "
  "of external sort during fast index creation.",
  nullptr, nullptr,
  /* default (1GB) */ (ulonglong) 1073741824,
  /* min (100B) */ 100,
  /* max */ SIZE_T_MAX, 1);

static MYSQL_SYSVAR_BOOL(create_if_missing,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.create_if_missing),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::create_if_missing for RocksDB",
  nullptr, nullptr, rocksdb_db_options.create_if_missing);

static MYSQL_SYSVAR_BOOL(create_missing_column_families,
  *reinterpret_cast<my_bool*>(
    &rocksdb_db_options.create_missing_column_families),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::create_missing_column_families for RocksDB",
  nullptr, nullptr, rocksdb_db_options.create_missing_column_families);

static MYSQL_SYSVAR_BOOL(error_if_exists,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.error_if_exists),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::error_if_exists for RocksDB",
  nullptr, nullptr, rocksdb_db_options.error_if_exists);

static MYSQL_SYSVAR_BOOL(paranoid_checks,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.paranoid_checks),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::paranoid_checks for RocksDB",
  nullptr, nullptr, rocksdb_db_options.paranoid_checks);

static MYSQL_SYSVAR_ULONGLONG(rate_limiter_bytes_per_sec,
  rocksdb_rate_limiter_bytes_per_sec,
  PLUGIN_VAR_RQCMDARG,
  "DBOptions::rate_limiter bytes_per_sec for RocksDB",
  nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
  /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ENUM(info_log_level,
  rocksdb_info_log_level,
  PLUGIN_VAR_RQCMDARG,
  "Filter level for info logs to be written to the mysqld error log. "
  "Valid values include 'debug_level', 'info_level', 'warn_level', "
  "'error_level' and 'fatal_level'.",
  nullptr, rocksdb_set_rocksdb_info_log_level,
  rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);

static MYSQL_THDVAR_INT(perf_context_level,
  PLUGIN_VAR_RQCMDARG,
  "Perf Context Level for rocksdb internal timer stat collection",
  nullptr, nullptr,
  /* default */ rocksdb::PerfLevel::kUninitialized,
  /* min */ rocksdb::PerfLevel::kUninitialized,
  /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(wal_recovery_mode,
  rocksdb_wal_recovery_mode,
  PLUGIN_VAR_RQCMDARG,
  "DBOptions::wal_recovery_mode for RocksDB",
  nullptr, nullptr, 2,
  /* min */ 0L, /* max */ 3, 0);

static MYSQL_SYSVAR_ULONG(compaction_readahead_size,
  rocksdb_db_options.compaction_readahead_size,
  PLUGIN_VAR_RQCMDARG,
  "DBOptions::compaction_readahead_size for RocksDB",
  nullptr, nullptr, rocksdb_db_options.compaction_readahead_size,
  /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(new_table_reader_for_compaction_inputs,
  *reinterpret_cast<my_bool*>
    (&rocksdb_db_options.new_table_reader_for_compaction_inputs),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::new_table_reader_for_compaction_inputs for RocksDB",
  nullptr, nullptr, rocksdb_db_options.new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(access_hint_on_compaction_start,
  rocksdb_access_hint_on_compaction_start,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::access_hint_on_compaction_start for RocksDB",
  nullptr, nullptr, 1,
  /* min */ 0L, /* max */ 3, 0);

static MYSQL_SYSVAR_BOOL(allow_concurrent_memtable_write,
  *reinterpret_cast<my_bool*>(
    &rocksdb_db_options.allow_concurrent_memtable_write),
  PLUGIN_VAR_RQCMDARG,
  "DBOptions::allow_concurrent_memtable_write for RocksDB",
  nullptr, nullptr, rocksdb_db_options.allow_concurrent_memtable_write);

static MYSQL_SYSVAR_BOOL(enable_write_thread_adaptive_yield,
  *reinterpret_cast<my_bool*>(
    &rocksdb_db_options.enable_write_thread_adaptive_yield),
  PLUGIN_VAR_RQCMDARG,
  "DBOptions::enable_write_thread_adaptive_yield for RocksDB",
  nullptr, nullptr, rocksdb_db_options.enable_write_thread_adaptive_yield);

static MYSQL_SYSVAR_INT(max_open_files,
  rocksdb_db_options.max_open_files,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_open_files for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_open_files,
  /* min */ -1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_total_wal_size,
  rocksdb_db_options.max_total_wal_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_total_wal_size for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_total_wal_size,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(disabledatasync,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.disableDataSync),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::disableDataSync for RocksDB",
  nullptr, nullptr, rocksdb_db_options.disableDataSync);

static MYSQL_SYSVAR_BOOL(use_fsync,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.use_fsync),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::use_fsync for RocksDB",
  nullptr, nullptr, rocksdb_db_options.use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::wal_dir for RocksDB",
  nullptr, nullptr, rocksdb_db_options.wal_dir.c_str());

static MYSQL_SYSVAR_ULONG(delete_obsolete_files_period_micros,
  rocksdb_db_options.delete_obsolete_files_period_micros,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::delete_obsolete_files_period_micros for RocksDB",
  nullptr, nullptr, rocksdb_db_options.delete_obsolete_files_period_micros,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(base_background_compactions,
  rocksdb_db_options.base_background_compactions,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::base_background_compactions for RocksDB",
  nullptr, nullptr, rocksdb_db_options.base_background_compactions,
  /* min */ -1, /* max */ MAX_BACKGROUND_COMPACTIONS, 0);

static MYSQL_SYSVAR_INT(max_background_compactions,
  rocksdb_db_options.max_background_compactions,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_background_compactions for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_background_compactions,
  /* min */ 1, /* max */ MAX_BACKGROUND_COMPACTIONS, 0);

static MYSQL_SYSVAR_INT(max_background_flushes,
  rocksdb_db_options.max_background_flushes,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_background_flushes for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_background_flushes,
  /* min */ 1, /* max */ MAX_BACKGROUND_FLUSHES, 0);

static MYSQL_SYSVAR_UINT(max_subcompactions,
  rocksdb_db_options.max_subcompactions,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_subcompactions for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_subcompactions,
  /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_ULONG(max_log_file_size,
  rocksdb_db_options.max_log_file_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_log_file_size for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_log_file_size,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(log_file_time_to_roll,
  rocksdb_db_options.log_file_time_to_roll,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::log_file_time_to_roll for RocksDB",
  nullptr, nullptr, rocksdb_db_options.log_file_time_to_roll,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(keep_log_file_num,
  rocksdb_db_options.keep_log_file_num,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::keep_log_file_num for RocksDB",
  nullptr, nullptr, rocksdb_db_options.keep_log_file_num,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(max_manifest_file_size,
  rocksdb_db_options.max_manifest_file_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::max_manifest_file_size for RocksDB",
  nullptr, nullptr, rocksdb_db_options.max_manifest_file_size,
  /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_INT(table_cache_numshardbits,
  rocksdb_db_options.table_cache_numshardbits,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::table_cache_numshardbits for RocksDB",
  nullptr, nullptr, rocksdb_db_options.table_cache_numshardbits,
  /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_ttl_seconds,
  rocksdb_db_options.WAL_ttl_seconds,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::WAL_ttl_seconds for RocksDB",
  nullptr, nullptr, rocksdb_db_options.WAL_ttl_seconds,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_size_limit_mb,
  rocksdb_db_options.WAL_size_limit_MB,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::WAL_size_limit_MB for RocksDB",
  nullptr, nullptr, rocksdb_db_options.WAL_size_limit_MB,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(manifest_preallocation_size,
  rocksdb_db_options.manifest_preallocation_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::manifest_preallocation_size for RocksDB",
  nullptr, nullptr, rocksdb_db_options.manifest_preallocation_size,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(allow_os_buffer,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.allow_os_buffer),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::allow_os_buffer for RocksDB",
  nullptr, nullptr, rocksdb_db_options.allow_os_buffer);

static MYSQL_SYSVAR_BOOL(allow_mmap_reads,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.allow_mmap_reads),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::allow_mmap_reads for RocksDB",
  nullptr, nullptr, rocksdb_db_options.allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(allow_mmap_writes,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.allow_mmap_writes),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::allow_mmap_writes for RocksDB",
  nullptr, nullptr, rocksdb_db_options.allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(is_fd_close_on_exec,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.is_fd_close_on_exec),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::is_fd_close_on_exec for RocksDB",
  nullptr, nullptr, rocksdb_db_options.is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
  rocksdb_db_options.stats_dump_period_sec,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::stats_dump_period_sec for RocksDB",
  nullptr, nullptr, rocksdb_db_options.stats_dump_period_sec,
  /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(advise_random_on_open,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.advise_random_on_open),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::advise_random_on_open for RocksDB",
  nullptr, nullptr, rocksdb_db_options.advise_random_on_open);

static MYSQL_SYSVAR_ULONG(db_write_buffer_size,
  rocksdb_db_options.db_write_buffer_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::db_write_buffer_size for RocksDB",
  nullptr, nullptr, rocksdb_db_options.db_write_buffer_size,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(use_adaptive_mutex,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.use_adaptive_mutex),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::use_adaptive_mutex for RocksDB",
  nullptr, nullptr, rocksdb_db_options.use_adaptive_mutex);

static MYSQL_SYSVAR_ULONG(bytes_per_sync,
  rocksdb_db_options.bytes_per_sync,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::bytes_per_sync for RocksDB",
  nullptr, nullptr, rocksdb_db_options.bytes_per_sync,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_ULONG(wal_bytes_per_sync,
  rocksdb_db_options.wal_bytes_per_sync,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::wal_bytes_per_sync for RocksDB",
  nullptr, nullptr, rocksdb_db_options.wal_bytes_per_sync,
  /* min */ 0L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(enable_thread_tracking,
  *reinterpret_cast<my_bool*>(&rocksdb_db_options.enable_thread_tracking),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "DBOptions::enable_thread_tracking for RocksDB",
  nullptr, nullptr, rocksdb_db_options.enable_thread_tracking);

static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "block_cache size for RocksDB",
  nullptr, nullptr, /* RocksDB's default is 8 MB: */ 8*1024*1024L,
  /* min */ 1024L, /* max */ LONGLONG_MAX, /* Block size */ 1024L);

static MYSQL_SYSVAR_BOOL(cache_index_and_filter_blocks,
  *reinterpret_cast<my_bool*>(
    &rocksdb_tbl_options.cache_index_and_filter_blocks),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
  nullptr, nullptr, true);

// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index blocks' handles checked
// out (i.e. won't call ShardedLRUCache::Release), so the parsed-out objects
// are never evicted from the LRU cache; in effect they are pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
static MYSQL_SYSVAR_BOOL(pin_l0_filter_and_index_blocks_in_cache,
  *reinterpret_cast<my_bool*>(
    &rocksdb_tbl_options.pin_l0_filter_and_index_blocks_in_cache),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "pin_l0_filter_and_index_blocks_in_cache for RocksDB",
  nullptr, nullptr, true);

static MYSQL_SYSVAR_ENUM(index_type,
  rocksdb_index_type,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::index_type for RocksDB",
  nullptr, nullptr,
  (uint64_t)rocksdb_tbl_options.index_type, &index_type_typelib);

static MYSQL_SYSVAR_BOOL(hash_index_allow_collision,
  *reinterpret_cast<my_bool*>(&rocksdb_tbl_options.hash_index_allow_collision),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::hash_index_allow_collision for RocksDB",
  nullptr, nullptr, rocksdb_tbl_options.hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(no_block_cache,
  *reinterpret_cast<my_bool*>(&rocksdb_tbl_options.no_block_cache),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::no_block_cache for RocksDB",
  nullptr, nullptr, rocksdb_tbl_options.no_block_cache);

static MYSQL_SYSVAR_ULONG(block_size,
  rocksdb_tbl_options.block_size,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::block_size for RocksDB",
  nullptr, nullptr, rocksdb_tbl_options.block_size,
  /* min */ 1L, /* max */ LONG_MAX, 0);

static MYSQL_SYSVAR_INT(block_size_deviation,
  rocksdb_tbl_options.block_size_deviation,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::block_size_deviation for RocksDB",
  nullptr, nullptr, rocksdb_tbl_options.block_size_deviation,
  /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(block_restart_interval,
  rocksdb_tbl_options.block_restart_interval,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::block_restart_interval for RocksDB",
  nullptr, nullptr, rocksdb_tbl_options.block_restart_interval,
  /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(whole_key_filtering,
  *reinterpret_cast<my_bool*>(&rocksdb_tbl_options.whole_key_filtering),
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "BlockBasedTableOptions::whole_key_filtering for RocksDB",
  nullptr, nullptr, rocksdb_tbl_options.whole_key_filtering);

static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "default cf options for RocksDB",
  nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "option overrides per cf for RocksDB",
  nullptr, nullptr, "");

static MYSQL_SYSVAR_BOOL(background_sync,
  rocksdb_background_sync,
  PLUGIN_VAR_RQCMDARG,
  "turns on background syncs for RocksDB",
  nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(write_sync,
  PLUGIN_VAR_RQCMDARG,
  "WriteOptions::sync for RocksDB",
  nullptr, nullptr, rocksdb::WriteOptions().sync);

static MYSQL_THDVAR_BOOL(write_disable_wal,
  PLUGIN_VAR_RQCMDARG,
  "WriteOptions::disableWAL for RocksDB",
  nullptr, nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(write_ignore_missing_column_families,
  PLUGIN_VAR_RQCMDARG,
  "WriteOptions::ignore_missing_column_families for RocksDB",
  nullptr, nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache,
  PLUGIN_VAR_RQCMDARG,
  "Skip filling block cache on read requests",
  nullptr, nullptr, FALSE);

static MYSQL_THDVAR_BOOL(unsafe_for_binlog,
  PLUGIN_VAR_RQCMDARG,
  "Allow statement-based binary logging, which may break consistency",
  nullptr, nullptr, FALSE);

static MYSQL_THDVAR_UINT(records_in_range,
  PLUGIN_VAR_RQCMDARG,
  "Used to override the result of records_in_range(). Set to a positive "
  "number to override",
  nullptr, nullptr, 0,
  /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range,
  PLUGIN_VAR_RQCMDARG,
  "Used to override the result of records_in_range() when FORCE INDEX is used.",
  nullptr, nullptr, 0,
  /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(debug_optimizer_n_rows,
  rocksdb_debug_optimizer_n_rows,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
  "Test only to override rocksdb estimates of table size in a memtable",
  nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(debug_optimizer_no_zero_cardinality,
  rocksdb_debug_optimizer_no_zero_cardinality,
  PLUGIN_VAR_RQCMDARG,
  "If the computed cardinality is zero, override it with some value",
  nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
  PLUGIN_VAR_RQCMDARG,
  "Compact column family",
  rocksdb_compact_column_family, rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
  PLUGIN_VAR_RQCMDARG,
  "Checkpoint directory",
  rocksdb_create_checkpoint, rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
  rocksdb_signal_drop_index_thread,
  PLUGIN_VAR_RQCMDARG,
  "Wake up drop index thread",
  nullptr, rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work,
  rocksdb_pause_background_work,
  PLUGIN_VAR_RQCMDARG,
  "Disable all rocksdb background operations",
  nullptr, rocksdb_set_pause_background_work, FALSE);

static MYSQL_SYSVAR_BOOL(disable_2pc,
  rocksdb_disable_2pc,
  PLUGIN_VAR_RQCMDARG,
  "Disable two phase commit for MyRocks",
  nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check,
  rocksdb_strict_collation_check,
  PLUGIN_VAR_RQCMDARG,
  "Enforce case sensitive collation for MyRocks indexes",
  nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
  rocksdb_strict_collation_exceptions,
  PLUGIN_VAR_RQCMDARG|PLUGIN_VAR_MEMALLOC,
  "List of tables (using regex) that are excluded "
  "from the case sensitive collation enforcement",
  nullptr, rocksdb_set_collation_exception_list, "");

static MYSQL_SYSVAR_BOOL(collect_sst_properties,
  rocksdb_collect_sst_properties,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "Enables collecting SST file properties on each flush",
  nullptr, nullptr, rocksdb_collect_sst_properties);

static MYSQL_SYSVAR_BOOL(
  force_flush_memtable_now,
  rocksdb_force_flush_memtable_now_var,
  PLUGIN_VAR_RQCMDARG,
  "Forces a memtable flush, which may block all write requests, so be careful",
  rocksdb_force_flush_memtable_now,
  rocksdb_force_flush_memtable_now_stub, FALSE);

static MYSQL_THDVAR_BOOL(
  flush_memtable_on_analyze,
  PLUGIN_VAR_RQCMDARG,
  "Forces memtable flush on ANALYZE TABLE to get accurate cardinality",
  nullptr, nullptr, true);

static MYSQL_SYSVAR_UINT(seconds_between_stat_computes,
  rocksdb_seconds_between_stat_computes,
  PLUGIN_VAR_RQCMDARG,
  "Sets the number of seconds to wait between optimizer stats recomputations. "
  "Only changed indexes will be refreshed.",
  nullptr, nullptr, rocksdb_seconds_between_stat_computes,
  /* min */ 0L, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_LONGLONG(
  compaction_sequential_deletes,
  rocksdb_compaction_sequential_deletes,
  PLUGIN_VAR_RQCMDARG,
  "RocksDB will trigger compaction for the file if it has more than this "
  "number of sequential deletes per window",
  nullptr, rocksdb_set_compaction_options,
  DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
  /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
  compaction_sequential_deletes_window,
  rocksdb_compaction_sequential_deletes_window,
  PLUGIN_VAR_RQCMDARG,
  "Size of the window for counting rocksdb_compaction_sequential_deletes",
  nullptr, rocksdb_set_compaction_options,
  DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
  /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

static MYSQL_SYSVAR_LONGLONG(
  compaction_sequential_deletes_file_size,
  rocksdb_compaction_sequential_deletes_file_size,
  PLUGIN_VAR_RQCMDARG,
  "Minimum file size required for compaction_sequential_deletes",
  nullptr, rocksdb_set_compaction_options, 0L,
  /* min */ -1L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(compaction_sequential_deletes_count_sd,
  rocksdb_compaction_sequential_deletes_count_sd,
  PLUGIN_VAR_RQCMDARG,
  "Count SingleDelete operations as rocksdb_compaction_sequential_deletes",
  nullptr, nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_THDVAR_INT(checksums_pct,
  PLUGIN_VAR_RQCMDARG,
  "Percentage of rows to be checksummed",
  nullptr, nullptr, 100,
  /* min */ 0, /* max */ 100, 0);

static MYSQL_THDVAR_BOOL(store_checksums,
  PLUGIN_VAR_RQCMDARG,
  "Include checksums when writing index/table records",
  nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_checksums,
  PLUGIN_VAR_RQCMDARG,
  "Verify checksums when reading index/table records",
  nullptr, nullptr, false /* default value */);

static MYSQL_SYSVAR_UINT(validate_tables,
  rocksdb_validate_tables,
  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
  "Verify all .frm files match all RocksDB tables (0 means no verification, "
  "1 means verify and fail on error, and 2 means verify but continue)",
  nullptr, nullptr, 1 /* default value */, 0 /* min value */,
  2 /* max value */, 0);

static MYSQL_SYSVAR_STR(datadir,
  rocksdb_datadir,
  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
  "RocksDB data directory",
  nullptr, nullptr, "./.rocksdb");

static MYSQL_SYSVAR_UINT(
  table_stats_sampling_pct,
  rocksdb_table_stats_sampling_pct,
  PLUGIN_VAR_RQCMDARG,
  "Percentage of entries to sample when collecting statistics about table "
  "properties. Specify either 0 to sample everything or percentage ["
  STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".."
  STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
  "By default " STRINGIFY_ARG(RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% of entries "
  "are sampled.",
  nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
  RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
  /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);

static const longlong ROCKSDB_WRITE_BUFFER_SIZE_DEFAULT= 4194304;
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE= 100;

static struct st_mysql_sys_var* rocksdb_system_variables[]= {
  MYSQL_SYSVAR(lock_wait_timeout),
  MYSQL_SYSVAR(max_row_locks),
  MYSQL_SYSVAR(lock_scanned_rows),
  MYSQL_SYSVAR(bulk_load),
  MYSQL_SYSVAR(skip_unique_check_tables),
  MYSQL_SYSVAR(skip_unique_check),
  MYSQL_SYSVAR(commit_in_the_middle),
  MYSQL_SYSVAR(read_free_rpl_tables),
  MYSQL_SYSVAR(rpl_skip_tx_api),
  MYSQL_SYSVAR(bulk_load_size),
  MYSQL_SYSVAR(merge_buf_size),
  MYSQL_SYSVAR(enable_bulk_load_api),
  MYSQL_SYSVAR(merge_combine_read_size),
  MYSQL_SYSVAR(skip_bloom_filter_on_read),

  MYSQL_SYSVAR(create_if_missing),
  MYSQL_SYSVAR(create_missing_column_families),
  MYSQL_SYSVAR(error_if_exists),
  MYSQL_SYSVAR(paranoid_checks),
  MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
  MYSQL_SYSVAR(info_log_level),
  MYSQL_SYSVAR(max_open_files),
  MYSQL_SYSVAR(max_total_wal_size),
  MYSQL_SYSVAR(disabledatasync),
  MYSQL_SYSVAR(use_fsync),
  MYSQL_SYSVAR(wal_dir),
  MYSQL_SYSVAR(delete_obsolete_files_period_micros),
  MYSQL_SYSVAR(base_background_compactions),
  MYSQL_SYSVAR(max_background_compactions),
  MYSQL_SYSVAR(max_background_flushes),
  MYSQL_SYSVAR(max_log_file_size),
  MYSQL_SYSVAR(max_subcompactions),
  MYSQL_SYSVAR(log_file_time_to_roll),
  MYSQL_SYSVAR(keep_log_file_num),
  MYSQL_SYSVAR(max_manifest_file_size),
  MYSQL_SYSVAR(table_cache_numshardbits),
  MYSQL_SYSVAR(wal_ttl_seconds),
  MYSQL_SYSVAR(wal_size_limit_mb),
  MYSQL_SYSVAR(manifest_preallocation_size),
  MYSQL_SYSVAR(allow_os_buffer),
  MYSQL_SYSVAR(allow_mmap_reads),
  MYSQL_SYSVAR(allow_mmap_writes),
  MYSQL_SYSVAR(is_fd_close_on_exec),
  MYSQL_SYSVAR(stats_dump_period_sec),
  MYSQL_SYSVAR(advise_random_on_open),
  MYSQL_SYSVAR(db_write_buffer_size),
  MYSQL_SYSVAR(use_adaptive_mutex),
  MYSQL_SYSVAR(bytes_per_sync),
  MYSQL_SYSVAR(wal_bytes_per_sync),
  MYSQL_SYSVAR(enable_thread_tracking),
  MYSQL_SYSVAR(perf_context_level),
  MYSQL_SYSVAR(wal_recovery_mode),
  MYSQL_SYSVAR(access_hint_on_compaction_start),
  MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
  MYSQL_SYSVAR(compaction_readahead_size),
  MYSQL_SYSVAR(allow_concurrent_memtable_write),
  MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

  MYSQL_SYSVAR(block_cache_size),
  MYSQL_SYSVAR(cache_index_and_filter_blocks),
  MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
  MYSQL_SYSVAR(index_type),
  MYSQL_SYSVAR(hash_index_allow_collision),
  MYSQL_SYSVAR(no_block_cache),
  MYSQL_SYSVAR(block_size),
  MYSQL_SYSVAR(block_size_deviation),
  MYSQL_SYSVAR(block_restart_interval),
  MYSQL_SYSVAR(whole_key_filtering),

  MYSQL_SYSVAR(default_cf_options),
  MYSQL_SYSVAR(override_cf_options),

  MYSQL_SYSVAR(background_sync),

  MYSQL_SYSVAR(write_sync),
  MYSQL_SYSVAR(write_disable_wal),
  MYSQL_SYSVAR(write_ignore_missing_column_families),

  MYSQL_SYSVAR(skip_fill_cache),
  MYSQL_SYSVAR(unsafe_for_binlog),

  MYSQL_SYSVAR(records_in_range),
  MYSQL_SYSVAR(force_index_records_in_range),
  MYSQL_SYSVAR(debug_optimizer_n_rows),
  MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),

  MYSQL_SYSVAR(compact_cf),
  MYSQL_SYSVAR(signal_drop_index_thread),
  MYSQL_SYSVAR(pause_background_work),
  MYSQL_SYSVAR(disable_2pc),
  MYSQL_SYSVAR(strict_collation_check),
  MYSQL_SYSVAR(strict_collation_exceptions),
  MYSQL_SYSVAR(collect_sst_properties),
  MYSQL_SYSVAR(force_flush_memtable_now),
  MYSQL_SYSVAR(flush_memtable_on_analyze),
  MYSQL_SYSVAR(seconds_between_stat_computes),

  MYSQL_SYSVAR(compaction_sequential_deletes),
  MYSQL_SYSVAR(compaction_sequential_deletes_window),
  MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
  MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),

  MYSQL_SYSVAR(datadir),
  MYSQL_SYSVAR(create_checkpoint),

  MYSQL_SYSVAR(checksums_pct),
  MYSQL_SYSVAR(store_checksums),
  MYSQL_SYSVAR(verify_checksums),

  MYSQL_SYSVAR(validate_tables),
  MYSQL_SYSVAR(table_stats_sampling_pct),
  nullptr
};

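/*
  Build a rocksdb::WriteOptions object from the current session's
  rocksdb_write_sync, rocksdb_write_disable_wal and
  rocksdb_write_ignore_missing_column_families settings.
*/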
static rocksdb::WriteOptions rdb_get_rocksdb_write_options(my_core::THD* thd)
{
  rocksdb::WriteOptions opt;

  opt.sync= THDVAR(thd, write_sync);
  opt.disableWAL= THDVAR(thd, write_disable_wal);
  opt.ignore_missing_column_families=
      THDVAR(thd, write_ignore_missing_column_families);

  return opt;
}

///////////////////////////////////////////////////////////////////////////////////////////

/**
  @brief
  Function used by the open-tables hash to extract the key for an entry.
*/

uchar* Rdb_open_tables_map::get_hash_key(
    Rdb_table_handler *table_handler, size_t *length,
    my_bool not_used __attribute__((__unused__)))
{
  *length= table_handler->m_table_name_length;
  return reinterpret_cast<uchar*>(table_handler->m_table_name);
}


/*
  The following is needed as an argument for mysql_stage_register,
  irrespective of whether we're compiling with P_S or not.
*/
PSI_stage_info stage_waiting_on_row_lock= { 0, "Waiting for row lock", 0};

#ifdef HAVE_PSI_INTERFACE
static PSI_thread_key rdb_background_psi_thread_key;
static PSI_thread_key rdb_drop_idx_psi_thread_key;

static PSI_stage_info *all_rocksdb_stages[]=
{
  & stage_waiting_on_row_lock
};


static my_core::PSI_mutex_key rdb_psi_open_tbls_mutex_key,
  rdb_signal_bg_psi_mutex_key, rdb_signal_drop_idx_psi_mutex_key,
  rdb_collation_data_mutex_key,
  rdb_mem_cmp_space_mutex_key,
  key_mutex_tx_list, rdb_sysvars_psi_mutex_key;

static PSI_mutex_info all_rocksdb_mutexes[]=
{
  { &rdb_psi_open_tbls_mutex_key, "open tables", PSI_FLAG_GLOBAL},
  { &rdb_signal_bg_psi_mutex_key, "stop background", PSI_FLAG_GLOBAL},
  { &rdb_signal_drop_idx_psi_mutex_key, "signal drop index", PSI_FLAG_GLOBAL},
  { &rdb_collation_data_mutex_key, "collation data init", PSI_FLAG_GLOBAL},
  { &rdb_mem_cmp_space_mutex_key, "collation space char data init",
    PSI_FLAG_GLOBAL},
  { &key_mutex_tx_list, "tx_list", PSI_FLAG_GLOBAL},
  { &rdb_sysvars_psi_mutex_key, "setting sysvar", PSI_FLAG_GLOBAL},
};

static PSI_rwlock_key key_rwlock_collation_exception_list;
static PSI_rwlock_key key_rwlock_read_free_rpl_tables;
static PSI_rwlock_key key_rwlock_skip_unique_check_tables;

static PSI_rwlock_info all_rocksdb_rwlocks[]=
{
  { &key_rwlock_collation_exception_list, "collation_exception_list",
    PSI_FLAG_GLOBAL},
  { &key_rwlock_read_free_rpl_tables, "read_free_rpl_tables", PSI_FLAG_GLOBAL},
  { &key_rwlock_skip_unique_check_tables, "skip_unique_check_tables",
    PSI_FLAG_GLOBAL},
};

PSI_cond_key rdb_signal_bg_psi_cond_key, rdb_signal_drop_idx_psi_cond_key;

static PSI_cond_info all_rocksdb_conds[]=
{
  { &rdb_signal_bg_psi_cond_key, "cond signal background", PSI_FLAG_GLOBAL},
  { &rdb_signal_drop_idx_psi_cond_key, "cond signal drop index",
    PSI_FLAG_GLOBAL},
};

static PSI_thread_info all_rocksdb_threads[]=
{
  { &rdb_background_psi_thread_key, "background", PSI_FLAG_GLOBAL},
  { &rdb_drop_idx_psi_thread_key, "drop index", PSI_FLAG_GLOBAL},
};

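/*
  Register MyRocks mutexes, rwlocks, stages and threads with the
  Performance Schema under the "rocksdb" category, so they appear in
  performance_schema instrumentation tables.
*/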
static void init_rocksdb_psi_keys()
{
  const char* category= "rocksdb";
  int count;

  if (PSI_server == nullptr)
    return;

  count= array_elements(all_rocksdb_mutexes);
  PSI_server->register_mutex(category, all_rocksdb_mutexes, count);

  count= array_elements(all_rocksdb_rwlocks);
  PSI_server->register_rwlock(category, all_rocksdb_rwlocks, count);

  count= array_elements(all_rocksdb_conds);
  // TODO Disabling PFS for conditions due to the bug https://github.com/MySQLOnRocksDB/mysql-5.6/issues/92
  // PSI_server->register_cond(category, all_rocksdb_conds, count);

  count= array_elements(all_rocksdb_stages);
  mysql_stage_register(category, all_rocksdb_stages, count);

  count= array_elements(all_rocksdb_threads);
  mysql_thread_register(category, all_rocksdb_threads, count);
}
#endif


/*
  Drop index thread's control
*/

static Rdb_drop_index_thread rdb_drop_idx_thread;

static void rocksdb_drop_index_wakeup_thread(
    my_core::THD* thd __attribute__((__unused__)),
    struct st_mysql_sys_var* var __attribute__((__unused__)),
    void* var_ptr __attribute__((__unused__)),
    const void* save)
{
  if (*static_cast<const bool*>(save)) {
    rdb_drop_idx_thread.signal();
  }
}

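/*
  Resolve the effective RocksDB perf-context level for this connection:
  prefer the session value of rocksdb_perf_context_level, fall back to the
  global value, and disable perf-context collection if neither is set.
*/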
static inline uint32_t rocksdb_perf_context_level(THD* thd)
{
  DBUG_ASSERT(thd != nullptr);

  int session_perf_context_level= THDVAR(thd, perf_context_level);
  if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
    return session_perf_context_level;
  }

  /*
    Fall back to the global thdvar if the session-specific one was not set
    to a valid value.
  */
  int global_perf_context_level= THDVAR(nullptr, perf_context_level);
  if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
    return global_perf_context_level;
  }

  return rocksdb::PerfLevel::kDisable;
}

/*
  Very short (functor-like) interface to be passed to
  Rdb_transaction::walk_tx_list()
*/
interface Rdb_tx_list_walker
{
  virtual ~Rdb_tx_list_walker() {}
  virtual void process_tran(const Rdb_transaction*) = 0;
};

/*
  This is a helper class that is passed to RocksDB to get notifications when
  a snapshot gets created.
*/
class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier
{
  Rdb_transaction* m_owning_tx;

  void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;

public:
  explicit Rdb_snapshot_notifier(Rdb_transaction* owning_tx)
    : m_owning_tx(owning_tx) {}

  // If the owning Rdb_transaction gets destroyed, we must stop
  // referencing it.
  void detach()
  {
    m_owning_tx = nullptr;
  }
};

/* This is the base class for transactions when interacting with rocksdb. */
class Rdb_transaction
{
protected:
  ulonglong m_write_count= 0;
  ulonglong m_lock_count= 0;

  bool m_is_delayed_snapshot= false;
  bool m_is_two_phase= false;

  THD* m_thd= nullptr;

  rocksdb::ReadOptions m_read_opts;

  static std::multiset<Rdb_transaction*> s_tx_list;
  static mysql_mutex_t s_tx_list_mutex;

  Rdb_io_perf* m_tbl_io_perf;

  bool m_tx_read_only= false;

  int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */

  /* Maximum number of locks the transaction can have */
  ulonglong m_max_row_locks;

  bool m_is_tx_failed= false;
  bool m_rollback_only= false;

  std::shared_ptr<Rdb_snapshot_notifier> m_notifier;

  // This should be used only when updating binlog information.
  virtual rocksdb::WriteBatchBase* get_write_batch()= 0;
  virtual bool commit_no_binlog()= 0;
  virtual rocksdb::Iterator *get_iterator(
      const rocksdb::ReadOptions &options,
      rocksdb::ColumnFamilyHandle* column_family)= 0;


public:
  const char* m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
  const char* m_mysql_gtid;
  String m_detailed_error;
  int64_t m_snapshot_timestamp= 0;
  bool m_ddl_transaction;

  static void init_mutex()
  {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  static void term_mutex()
  {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }

  static void walk_tx_list(Rdb_tx_list_walker* walker)
  {
    mysql_mutex_lock(&s_tx_list_mutex);
    for (auto it : s_tx_list)
      walker->process_tran(it);
    mysql_mutex_unlock(&s_tx_list_mutex);
  }

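  /*
    Convert a non-OK rocksdb::Status into a handler error code, mapping
    lock-wait timeouts to HA_ERR_LOCK_WAIT_TIMEOUT and write conflicts
    (Status::Busy) to HA_ERR_LOCK_DEADLOCK.
  */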
  int set_status_error(THD *thd, const rocksdb::Status &s,
                       const std::shared_ptr<const Rdb_key_def>& kd,
                       Rdb_tbl_def *tbl_def)
  {
    DBUG_ASSERT(!s.ok());

    if (s.IsTimedOut())
    {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error (because it
        is an IGNORE command!) but then will fail an assert, because "error
        code was returned, but no error happened".  Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/);
      m_detailed_error.copy(timeout_message("index",
                                            tbl_def->full_tablename().c_str(),
                                            kd->get_name().c_str()));

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }
    if (s.IsBusy())
    {
      rocksdb_snapshot_conflict_errors++;
      return HA_ERR_LOCK_DEADLOCK;
    }
    /* TODO: who returns HA_ERR_ROCKSDB_TOO_MANY_LOCKS now?? */

    my_error(ER_INTERNAL_ERROR, MYF(0), s.ToString().c_str());
    return HA_ERR_INTERNAL_ERROR;
  }

  THD* get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *io_perf)
  {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that
      hit multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd)))
    {
      m_tbl_io_perf= io_perf;
    }
  }

  void io_perf_end_and_record(void)
  {
    if (m_tbl_io_perf != nullptr)
    {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf= nullptr;
    }
  }

  void io_perf_end_and_record(Rdb_io_perf *io_perf)
  {
    if (m_tbl_io_perf == io_perf)
    {
      io_perf_end_and_record();
    }
  }

  void set_params(int timeout_sec_arg, int max_row_locks_arg)
  {
    m_timeout_sec= timeout_sec_arg;
    m_max_row_locks= max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg)= 0;

  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync)= 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle* column_family,
                            const std::string& rowkey)= 0;

  virtual bool prepare(const rocksdb::TransactionName& name)= 0;

  bool commit_or_rollback()
  {
    bool res;
    if (m_is_tx_failed)
    {
      rollback();
      res= false;
    }
    else
      res= commit();
    return res;
  }

  bool commit()
  {
    if (get_write_count() == 0)
    {
      rollback();
      return false;
    }
    else if (m_rollback_only)
    {
      /*
        Transactions marked as rollback_only are expected to be rolled back
        at prepare(). But there are some exceptions, like the cases below,
        where prepare() is never called and commit() is called instead:
        1. Binlog is disabled
        2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back the transaction is safe. Nothing is
        written to the binlog.
      */
      my_printf_error(ER_UNKNOWN_ERROR, ERRSTR_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    }
    else
    {
      my_core::thd_binlog_pos(m_thd, &m_mysql_log_file_name,
                              &m_mysql_log_offset, &m_mysql_gtid);
      binlog_manager.update(m_mysql_log_file_name,
                            m_mysql_log_offset,
                            m_mysql_gtid, get_write_batch());
      return commit_no_binlog();
    }
  }

  virtual void rollback()= 0;

  void snapshot_created(const rocksdb::Snapshot *snapshot)
  {
    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now)= 0;
  virtual void release_snapshot()= 0;

  bool has_snapshot() const
  {
    return m_read_opts.snapshot != nullptr;
  }

private:
  // The tables we are currently bulk-loading. For a partitioned table this
  // can have more than one entry.
  std::vector<ha_rocksdb*> m_curr_bulk_load;

public:
int finish_bulk_load()
|
|
{
|
|
int rc= 0;
|
|
|
|
std::vector<ha_rocksdb*>::iterator it;
|
|
while ((it = m_curr_bulk_load.begin()) != m_curr_bulk_load.end())
|
|
{
|
|
int rc2= (*it)->finalize_bulk_load();
|
|
if (rc2 != 0 && rc == 0)
|
|
{
|
|
rc= rc2;
|
|
}
|
|
}
|
|
|
|
DBUG_ASSERT(m_curr_bulk_load.size() == 0);
|
|
|
|
return rc;
|
|
}
|
|
|
|
void start_bulk_load(ha_rocksdb* bulk_load)
|
|
{
|
|
/*
|
|
If we already have an open bulk load of a table and the name doesn't
|
|
match the current one, close out the currently running one. This allows
|
|
multiple bulk loads to occur on a partitioned table, but then closes
|
|
them all out when we switch to another table.
|
|
*/
|
|
if (!m_curr_bulk_load.empty() &&
|
|
!bulk_load->same_table(*m_curr_bulk_load[0]))
|
|
{
|
|
auto res= finish_bulk_load();
|
|
SHIP_ASSERT(res == 0);
|
|
}
|
|
|
|
m_curr_bulk_load.push_back(bulk_load);
|
|
}
|
|
|
|
void end_bulk_load(ha_rocksdb* bulk_load)
|
|
{
|
|
for (auto it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end();
|
|
it++)
|
|
{
|
|
if (*it == bulk_load)
|
|
{
|
|
m_curr_bulk_load.erase(it);
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Should not reach here
|
|
SHIP_ASSERT(0);
|
|
}
|
|
|
|
  /*
    Flush the data accumulated so far. This assumes we're doing a bulk insert.

    @detail
      This should work like transaction commit, except that we don't
      synchronize with the binlog (there is no API that would allow us to
      have the binlog flush the changes accumulated so far and return its
      current position)

    @todo
      Add test coverage for what happens when somebody attempts to do bulk
      inserts while inside a multi-statement transaction.
  */
  bool flush_batch()
  {
    if (get_write_count() == 0)
      return false;

    /* Commit the current transaction */
    if (commit_no_binlog())
      return true;

    /* Start another one */
    start_tx();
    return false;
  }
|
|
|
|
virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
const rocksdb::Slice& value)= 0;
|
|
virtual rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key)= 0;
|
|
virtual rocksdb::Status single_delete(
|
|
rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key)= 0;
|
|
|
|
virtual bool has_modifications() const= 0;
|
|
|
|
virtual rocksdb::WriteBatchBase* get_indexed_write_batch()= 0;
|
|
/*
|
|
Return a WriteBatch that one can write to. The writes will skip any
|
|
transaction locking. The writes will NOT be visible to the transaction.
|
|
*/
|
|
rocksdb::WriteBatchBase* get_blind_write_batch()
|
|
{
|
|
return get_indexed_write_batch()->GetWriteBatch();
|
|
}
|
|
|
|
virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
std::string* value) const= 0;
|
|
virtual rocksdb::Status get_for_update(
|
|
rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key, std::string* value)= 0;
|
|
|
|
rocksdb::Iterator *get_iterator(rocksdb::ColumnFamilyHandle* column_family,
|
|
bool skip_bloom_filter,
|
|
bool fill_cache,
|
|
bool read_current= false,
|
|
bool create_snapshot= true)
|
|
{
|
|
// Make sure we are not doing both read_current (which implies we don't
|
|
// want a snapshot) and create_snapshot which makes sure we create
|
|
// a snapshot
|
|
DBUG_ASSERT(!read_current || !create_snapshot);
|
|
|
|
if (create_snapshot)
|
|
acquire_snapshot(true);
|
|
|
|
rocksdb::ReadOptions options= m_read_opts;
|
|
|
|
if (skip_bloom_filter)
|
|
{
|
|
options.total_order_seek= true;
|
|
}
|
|
else
|
|
{
|
|
// With this option, Iterator::Valid() returns false if key
|
|
// is outside of the prefix bloom filter range set at Seek().
|
|
// Must not be set to true if not using bloom filter.
|
|
options.prefix_same_as_start= true;
|
|
}
|
|
options.fill_cache= fill_cache;
|
|
if (read_current)
|
|
{
|
|
options.snapshot= nullptr;
|
|
}
|
|
return get_iterator(options, column_family);
|
|
}
|
|
|
|
virtual bool is_tx_started() const= 0;
|
|
virtual void start_tx()= 0;
|
|
virtual void start_stmt()= 0;
|
|
virtual void rollback_stmt()= 0;
|
|
|
|
void set_tx_failed(bool failed_arg) { m_is_tx_failed= failed_arg; }
|
|
|
|
bool can_prepare() const
|
|
{
|
|
if (m_rollback_only)
|
|
{
|
|
my_printf_error(ER_UNKNOWN_ERROR, ERRSTR_ROLLBACK_ONLY, MYF(0));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
int rollback_to_savepoint(void *savepoint)
|
|
{
|
|
if (has_modifications())
|
|
{
|
|
my_printf_error(ER_UNKNOWN_ERROR,
|
|
"MyRocks currently does not support ROLLBACK TO "
|
|
"SAVEPOINT if modifying rows.",
|
|
MYF(0));
|
|
m_rollback_only= true;
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
This is used by transactions started with "START TRANSACTION WITH "
|
|
"CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
|
|
snapshot has to be created via DB::GetSnapshot(), not via Transaction
|
|
API.
|
|
*/
|
|
bool is_tx_read_only() const
|
|
{
|
|
return m_tx_read_only;
|
|
}
|
|
|
|
bool is_two_phase() const
|
|
{
|
|
return m_is_two_phase;
|
|
}
|
|
|
|
void set_tx_read_only(bool val)
|
|
{
|
|
m_tx_read_only= val;
|
|
}
|
|
|
|
explicit Rdb_transaction(THD *thd): m_thd(thd), m_tbl_io_perf(nullptr)
|
|
{
|
|
mysql_mutex_lock(&s_tx_list_mutex);
|
|
s_tx_list.insert(this);
|
|
mysql_mutex_unlock(&s_tx_list_mutex);
|
|
}
|
|
|
|
virtual ~Rdb_transaction() {
|
|
mysql_mutex_lock(&s_tx_list_mutex);
|
|
s_tx_list.erase(this);
|
|
mysql_mutex_unlock(&s_tx_list_mutex);
|
|
}
|
|
};
|
|
|
|
/*
|
|
This is a rocksdb transaction. Its members represent the current transaction,
|
|
which consists of:
|
|
- the snapshot
|
|
- the changes we've made but are not seeing yet.
|
|
|
|
The changes are made to individual tables, which store them here and then
|
|
this object commits them on commit.
|
|
*/
|
|
class Rdb_transaction_impl : public Rdb_transaction
|
|
{
|
|
rocksdb::Transaction *m_rocksdb_tx= nullptr;
|
|
rocksdb::Transaction *m_rocksdb_reuse_tx= nullptr;
|
|
|
|
public:
|
|
void set_lock_timeout(int timeout_sec_arg) override
|
|
{
|
|
if (m_rocksdb_tx)
|
|
m_rocksdb_tx->SetLockTimeout(m_timeout_sec * 1000);
|
|
}
|
|
|
|
void set_sync(bool sync) override
|
|
{
|
|
m_rocksdb_tx->GetWriteOptions()->sync= sync;
|
|
}
|
|
|
|
void release_lock(rocksdb::ColumnFamilyHandle* column_family,
|
|
const std::string &rowkey) override
|
|
{
|
|
if (!THDVAR(m_thd, lock_scanned_rows))
|
|
{
|
|
m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
|
|
}
|
|
}
|
|
|
|
private:
|
|
void release_tx(void)
|
|
{
|
|
// We are done with the current active transaction object. Preserve it
|
|
// for later reuse.
|
|
DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
|
|
m_rocksdb_reuse_tx= m_rocksdb_tx;
|
|
m_rocksdb_tx= nullptr;
|
|
}
|
|
|
|
bool prepare(const rocksdb::TransactionName& name) override
|
|
{
|
|
rocksdb::Status s;
|
|
s= m_rocksdb_tx->SetName(name);
|
|
if (!s.ok())
|
|
{
|
|
rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
|
|
return false;
|
|
}
|
|
|
|
s= m_rocksdb_tx->Prepare();
|
|
if (!s.ok())
|
|
{
|
|
rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool commit_no_binlog() override
|
|
{
|
|
bool res= false;
|
|
release_snapshot();
|
|
rocksdb::Status s= m_rocksdb_tx->Commit();
|
|
if (!s.ok())
|
|
{
|
|
rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
|
|
res= true;
|
|
}
|
|
|
|
/* Save the transaction object to be reused */
|
|
release_tx();
|
|
|
|
m_write_count= 0;
|
|
m_lock_count= 0;
|
|
set_tx_read_only(false);
|
|
m_rollback_only= false;
|
|
return res;
|
|
}
|
|
|
|
public:
|
|
void rollback() override
|
|
{
|
|
m_write_count= 0;
|
|
m_lock_count= 0;
|
|
m_ddl_transaction= false;
|
|
if (m_rocksdb_tx)
|
|
{
|
|
release_snapshot();
|
|
/* This will also release all of the locks: */
|
|
m_rocksdb_tx->Rollback();
|
|
|
|
/* Save the transaction object to be reused */
|
|
release_tx();
|
|
|
|
set_tx_read_only(false);
|
|
m_rollback_only= false;
|
|
}
|
|
}
|
|
|
|
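  /*
    Snapshot acquisition has three cases: a read-only transaction takes a
    snapshot directly from the DB; otherwise, if acquire_now is true, the
    snapshot is taken from the RocksDB transaction immediately; if not, it
    is deferred until the next operation via SetSnapshotOnNextOperation()
    (tracked by m_is_delayed_snapshot).
  */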
void acquire_snapshot(bool acquire_now) override
|
|
{
|
|
if (m_read_opts.snapshot == nullptr) {
|
|
if (is_tx_read_only()) {
|
|
snapshot_created(rdb->GetSnapshot());
|
|
}
|
|
else if (acquire_now) {
|
|
m_rocksdb_tx->SetSnapshot();
|
|
snapshot_created(m_rocksdb_tx->GetSnapshot());
|
|
}
|
|
else if (!m_is_delayed_snapshot) {
|
|
m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
|
|
m_is_delayed_snapshot = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
void release_snapshot() override
|
|
{
|
|
bool need_clear = m_is_delayed_snapshot;
|
|
|
|
if (m_read_opts.snapshot != nullptr)
|
|
{
|
|
m_snapshot_timestamp = 0;
|
|
if (is_tx_read_only())
|
|
{
|
|
rdb->ReleaseSnapshot(m_read_opts.snapshot);
|
|
need_clear = false;
|
|
}
|
|
else
|
|
{
|
|
need_clear = true;
|
|
}
|
|
m_read_opts.snapshot = nullptr;
|
|
}
|
|
|
|
if (need_clear && m_rocksdb_tx != nullptr)
|
|
m_rocksdb_tx->ClearSnapshot();
|
|
}
|
|
|
|
bool has_snapshot()
|
|
{
|
|
return m_read_opts.snapshot != nullptr;
|
|
}
|
|
|
|
const char *err_too_many_locks=
|
|
"Number of locks held by the transaction exceeded @@rocksdb_max_row_locks";
|
|
|
|
rocksdb::Status put(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
const rocksdb::Slice& value) override
|
|
{
|
|
++m_write_count;
|
|
++m_lock_count;
|
|
if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
|
|
return rocksdb::Status::Aborted(rocksdb::Slice(err_too_many_locks));
|
|
return m_rocksdb_tx->Put(column_family, key, value);
|
|
}
|
|
|
|
rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key) override
|
|
{
|
|
++m_write_count;
|
|
++m_lock_count;
|
|
if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
|
|
return rocksdb::Status::Aborted(rocksdb::Slice(err_too_many_locks));
|
|
return m_rocksdb_tx->Delete(column_family, key);
|
|
}
|
|
|
|
rocksdb::Status single_delete(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key) override
|
|
{
|
|
++m_write_count;
|
|
++m_lock_count;
|
|
if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
|
|
return rocksdb::Status::Aborted(rocksdb::Slice(err_too_many_locks));
|
|
return m_rocksdb_tx->SingleDelete(column_family, key);
|
|
}
|
|
|
|
bool has_modifications() const override
|
|
{
|
|
return m_rocksdb_tx->GetWriteBatch() &&
|
|
m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
|
|
m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
|
|
}
|
|
|
|
rocksdb::WriteBatchBase* get_write_batch() override
|
|
{
|
|
if (is_two_phase())
|
|
{
|
|
return m_rocksdb_tx->GetCommitTimeWriteBatch();
|
|
}
|
|
return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
|
|
}
|
|
|
|
/*
|
|
Return a WriteBatch that one can write to. The writes will skip any
|
|
transaction locking. The writes WILL be visible to the transaction.
|
|
*/
|
|
rocksdb::WriteBatchBase* get_indexed_write_batch() override
|
|
{
|
|
++m_write_count;
|
|
return m_rocksdb_tx->GetWriteBatch();
|
|
}
|
|
|
|
rocksdb::Status get(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
std::string* value) const override
|
|
{
|
|
return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
|
|
}
|
|
|
|
rocksdb::Status get_for_update(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
std::string* value) override
|
|
{
|
|
if (++m_lock_count > m_max_row_locks)
|
|
return rocksdb::Status::Aborted(rocksdb::Slice(err_too_many_locks));
|
|
return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value);
|
|
}
|
|
|
|
rocksdb::Iterator *get_iterator(const rocksdb::ReadOptions &options,
|
|
rocksdb::ColumnFamilyHandle* column_family)
|
|
override
|
|
{
|
|
return m_rocksdb_tx->GetIterator(options, column_family);
|
|
}
|
|
|
|
bool is_tx_started() const override
|
|
{
|
|
return (m_rocksdb_tx != nullptr);
|
|
}
|
|
|
|
void start_tx() override
|
|
{
|
|
rocksdb::TransactionOptions tx_opts;
|
|
rocksdb::WriteOptions write_opts;
|
|
tx_opts.set_snapshot= false;
|
|
tx_opts.lock_timeout= m_timeout_sec * 1000;
|
|
|
|
write_opts.sync= THDVAR(m_thd, write_sync);
|
|
write_opts.disableWAL= THDVAR(m_thd, write_disable_wal);
|
|
write_opts.ignore_missing_column_families=
|
|
THDVAR(m_thd, write_ignore_missing_column_families);
|
|
m_is_two_phase= !rocksdb_disable_2pc;
|
|
|
|
/*
|
|
If m_rocksdb_reuse_tx is null this will create a new transaction object.
|
|
Otherwise it will reuse the existing one.
|
|
*/
|
|
m_rocksdb_tx= rdb->BeginTransaction(write_opts, tx_opts,
|
|
m_rocksdb_reuse_tx);
|
|
m_rocksdb_reuse_tx= nullptr;
|
|
|
|
m_read_opts= rocksdb::ReadOptions();
|
|
|
|
m_ddl_transaction= false;
|
|
}
|
|
|
|
/*
|
|
Start a statement inside a multi-statement transaction.
|
|
|
|
@todo: are we sure this is called once (and not several times) per
|
|
statement start?
|
|
|
|
For hooking to start of statement that is its own transaction, see
|
|
ha_rocksdb::external_lock().
|
|
*/
|
|
void start_stmt() override
|
|
{
|
|
// Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
|
|
acquire_snapshot(false);
|
|
m_rocksdb_tx->SetSavePoint();
|
|
}
|
|
|
|
/*
|
|
This must be called when last statement is rolled back, but the transaction
|
|
continues
|
|
*/
|
|
void rollback_stmt() override
|
|
{
|
|
/* TODO: here we must release the locks taken since the start_stmt() call */
|
|
if (m_rocksdb_tx)
|
|
{
|
|
const rocksdb::Snapshot *org_snapshot = m_rocksdb_tx->GetSnapshot();
|
|
m_rocksdb_tx->RollbackToSavePoint();
|
|
|
|
const rocksdb::Snapshot *cur_snapshot = m_rocksdb_tx->GetSnapshot();
|
|
if (org_snapshot != cur_snapshot)
|
|
{
|
|
if (org_snapshot != nullptr)
|
|
m_snapshot_timestamp = 0;
|
|
|
|
m_read_opts.snapshot = cur_snapshot;
|
|
if (cur_snapshot != nullptr)
|
|
rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
|
|
else
|
|
m_is_delayed_snapshot = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
explicit Rdb_transaction_impl(THD *thd) :
|
|
Rdb_transaction(thd), m_rocksdb_tx(nullptr)
|
|
{
|
|
// Create a notifier that can be called when a snapshot gets generated.
|
|
m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
|
|
}
|
|
|
|
virtual ~Rdb_transaction_impl()
|
|
{
|
|
rollback();
|
|
|
|
// Theoretically the notifier could outlive the Rdb_transaction_impl
|
|
// (because of the shared_ptr), so let it know it can't reference
|
|
// the transaction anymore.
|
|
m_notifier->detach();
|
|
|
|
// Free any transaction memory that is still hanging around.
|
|
delete m_rocksdb_reuse_tx;
|
|
DBUG_ASSERT(m_rocksdb_tx == nullptr);
|
|
}
|
|
};
|
|
|
|
/* This is a rocksdb write batch. This class doesn't hold or wait on any
   transaction locks (skips the rocksdb transaction API) thus giving better
   performance. The commit is done through rdb->GetBaseDB()->Write().

   Currently this is only used for replication threads which are guaranteed
   to be non-conflicting. Any further use of this class should be thought
   through carefully.
*/
|
|
class Rdb_writebatch_impl : public Rdb_transaction
|
|
{
|
|
rocksdb::WriteBatchWithIndex* m_batch;
|
|
rocksdb::WriteOptions write_opts;
|
|
// Called after commit/rollback.
|
|
void reset()
|
|
{
|
|
m_batch->Clear();
|
|
m_read_opts = rocksdb::ReadOptions();
|
|
m_ddl_transaction= false;
|
|
}
|
|
private:
|
|
bool prepare(const rocksdb::TransactionName& name) override
|
|
{
|
|
return true;
|
|
}
|
|
|
|
bool commit_no_binlog() override
|
|
{
|
|
bool res= false;
|
|
release_snapshot();
|
|
rocksdb::Status s= rdb->GetBaseDB()->Write(write_opts,
|
|
m_batch->GetWriteBatch());
|
|
if (!s.ok())
|
|
{
|
|
rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
|
|
res= true;
|
|
}
|
|
reset();
|
|
|
|
m_write_count= 0;
|
|
set_tx_read_only(false);
|
|
m_rollback_only= false;
|
|
return res;
|
|
}
|
|
public:
|
|
void set_lock_timeout(int timeout_sec_arg) override
|
|
{
|
|
// Nothing to do here.
|
|
}
|
|
|
|
void set_sync(bool sync) override
|
|
{
|
|
write_opts.sync= sync;
|
|
}
|
|
|
|
void release_lock(rocksdb::ColumnFamilyHandle* column_family,
|
|
const std::string &rowkey) override
|
|
{
|
|
// Nothing to do here since we don't hold any row locks.
|
|
}
|
|
|
|
void rollback() override
|
|
{
|
|
m_write_count= 0;
|
|
m_lock_count= 0;
|
|
release_snapshot();
|
|
|
|
reset();
|
|
set_tx_read_only(false);
|
|
m_rollback_only= false;
|
|
}
|
|
|
|
void acquire_snapshot(bool acquire_now) override
|
|
{
|
|
if (m_read_opts.snapshot == nullptr)
|
|
snapshot_created(rdb->GetSnapshot());
|
|
}
|
|
|
|
void release_snapshot() override
|
|
{
|
|
if (m_read_opts.snapshot != nullptr)
|
|
{
|
|
rdb->ReleaseSnapshot(m_read_opts.snapshot);
|
|
m_read_opts.snapshot = nullptr;
|
|
}
|
|
}
|
|
|
|
rocksdb::Status put(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
const rocksdb::Slice& value) override
|
|
{
|
|
++m_write_count;
|
|
m_batch->Put(column_family, key, value);
|
|
// Note Put/Delete in write batch doesn't return any error code. We simply
|
|
// return OK here.
|
|
return rocksdb::Status::OK();
|
|
}
|
|
|
|
rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key) override
|
|
{
|
|
++m_write_count;
|
|
m_batch->Delete(column_family, key);
|
|
return rocksdb::Status::OK();
|
|
}
|
|
|
|
rocksdb::Status single_delete(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key) override
|
|
{
|
|
++m_write_count;
|
|
m_batch->SingleDelete(column_family, key);
|
|
return rocksdb::Status::OK();
|
|
}
|
|
|
|
bool has_modifications() const override
|
|
{
|
|
return m_batch->GetWriteBatch()->Count() > 0;
|
|
}
|
|
|
|
rocksdb::WriteBatchBase* get_write_batch() override
|
|
{
|
|
return m_batch;
|
|
}
|
|
|
|
rocksdb::WriteBatchBase* get_indexed_write_batch() override
|
|
{
|
|
++m_write_count;
|
|
return m_batch;
|
|
}
|
|
|
|
rocksdb::Status get(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
std::string* value) const override
|
|
{
|
|
return m_batch->GetFromBatchAndDB(
|
|
rdb, m_read_opts, column_family, key, value);
|
|
}
|
|
|
|
rocksdb::Status get_for_update(rocksdb::ColumnFamilyHandle* column_family,
|
|
const rocksdb::Slice& key,
|
|
std::string* value) override
|
|
{
|
|
return get(column_family, key, value);
|
|
}
|
|
|
|
rocksdb::Iterator *get_iterator(const rocksdb::ReadOptions &options,
|
|
rocksdb::ColumnFamilyHandle* column_family)
|
|
override
|
|
{
|
|
auto it = rdb->NewIterator(options);
|
|
return m_batch->NewIteratorWithBase(it);
|
|
}
|
|
|
|
bool is_tx_started() const override
|
|
{
|
|
return (m_batch != nullptr);
|
|
}
|
|
|
|
void start_tx() override
|
|
{
|
|
reset();
|
|
write_opts.sync= THDVAR(m_thd, write_sync);
|
|
write_opts.disableWAL= THDVAR(m_thd, write_disable_wal);
|
|
write_opts.ignore_missing_column_families=
|
|
THDVAR(m_thd, write_ignore_missing_column_families);
|
|
}
|
|
|
|
void start_stmt() override
|
|
{
|
|
m_batch->SetSavePoint();
|
|
}
|
|
|
|
void rollback_stmt() override
|
|
{
|
|
if (m_batch)
|
|
m_batch->RollbackToSavePoint();
|
|
}
|
|
|
|
explicit Rdb_writebatch_impl(THD *thd) :
|
|
Rdb_transaction(thd), m_batch(nullptr)
|
|
{
|
|
m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(),
|
|
0 , true);
|
|
}
|
|
|
|
virtual ~Rdb_writebatch_impl()
|
|
{
|
|
rollback();
|
|
delete m_batch;
|
|
}
|
|
};
|
|
|
|
void Rdb_snapshot_notifier::SnapshotCreated(const rocksdb::Snapshot *snapshot)
|
|
{
|
|
if (m_owning_tx != nullptr)
|
|
{
|
|
m_owning_tx->snapshot_created(snapshot);
|
|
}
|
|
}
|
|
|
|
std::multiset<Rdb_transaction*> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;

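/*
  Return a reference to the Rdb_transaction* slot that MyRocks keeps in the
  connection's ha_data. The slot stays nullptr until get_or_create_tx()
  below allocates a transaction for the connection.
*/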
static Rdb_transaction* &get_tx_from_thd(THD *thd)
{
  return *reinterpret_cast<Rdb_transaction**>(
      my_core::thd_ha_data(thd, rocksdb_hton));
}
|
|
|
|
namespace {
|
|
|
|
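/*
  RAII helper: on construction it starts collecting RocksDB io_perf /
  perf_context counters for the connection's active transaction (if any),
  and on destruction it records them. Used around the handlerton entry
  points below (commit, rollback, consistent-snapshot start).
*/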
class Rdb_perf_context_guard
|
|
{
|
|
Rdb_io_perf m_io_perf;
|
|
THD *m_thd;
|
|
|
|
public:
|
|
explicit Rdb_perf_context_guard(THD *thd) : m_thd(thd)
|
|
{
|
|
Rdb_transaction*& tx= get_tx_from_thd(m_thd);
|
|
/*
|
|
if perf_context information is already being recorded, this becomes a
|
|
no-op
|
|
*/
|
|
if (tx != nullptr)
|
|
{
|
|
tx->io_perf_start(&m_io_perf);
|
|
}
|
|
}
|
|
|
|
~Rdb_perf_context_guard()
|
|
{
|
|
Rdb_transaction*& tx= get_tx_from_thd(m_thd);
|
|
if (tx != nullptr)
|
|
{
|
|
tx->io_perf_end_and_record();
|
|
}
|
|
}
|
|
};
|
|
|
|
} // anonymous namespace
|
|
|
|
/*
  TODO: maybe, call this in external_lock() and store in ha_rocksdb..
*/

static Rdb_transaction *get_or_create_tx(THD *thd)
{
  Rdb_transaction*& tx= get_tx_from_thd(thd);
  // TODO: this is called too many times.. O(#rows)
  if (tx == nullptr)
  {
    if (rpl_skip_tx_api_var && thd->rli_slave)
      tx= new Rdb_writebatch_impl(thd);
    else
      tx= new Rdb_transaction_impl(thd);
    tx->set_params(THDVAR(thd, lock_wait_timeout),
                   THDVAR(thd, max_row_locks));
    tx->start_tx();
  }
  else
  {
    tx->set_params(THDVAR(thd, lock_wait_timeout),
                   THDVAR(thd, max_row_locks));
    if (!tx->is_tx_started())
    {
      tx->start_tx();
    }
  }

  return tx;
}
|
|
|
|
|
|
static int rocksdb_close_connection(handlerton* hton, THD* thd)
|
|
{
|
|
Rdb_transaction*& tx= get_tx_from_thd(thd);
|
|
if (tx != nullptr)
|
|
{
|
|
int rc= tx->finish_bulk_load();
|
|
if (rc != 0)
|
|
{
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("RocksDB: Error %d finalizing last SST file while "
|
|
"disconnecting", rc);
|
|
abort_with_stack_traces();
|
|
}
|
|
|
|
delete tx;
|
|
tx= nullptr;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Serializes an xid to a string so that it can
 * be used as a rocksdb transaction name
 */
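/*
  Resulting layout (see also rdb_xid_from_string() below):
    8 bytes                            - formatID, in network byte order
    1 byte                             - gtrid_length
    1 byte                             - bqual_length
    gtrid_length + bqual_length bytes  - the payload from XID::data
*/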
static std::string rdb_xid_to_string(const XID& src)
{
  DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);

  std::string buf;
  buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);

  /*
   * expand formatID to fill 8 bytes if it doesn't already
   * then reinterpret bit pattern as unsigned and store in network order
   */
  uchar fidbuf[RDB_FORMATID_SZ];
  int64 signed_fid8= src.formatID;
  uint64 raw_fid8= *reinterpret_cast<uint64*>(&signed_fid8);
  rdb_netbuf_store_uint64(fidbuf, raw_fid8);
  buf.append(reinterpret_cast<const char*>(fidbuf), RDB_FORMATID_SZ);

  buf.push_back(src.gtrid_length);
  buf.push_back(src.bqual_length);
  buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
  return buf;
}
|
|
|
|
|
|
/**
  Called by hton->flush_logs after MySQL group commit prepares a set of
  transactions.
*/
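/*
  A thin wrapper around rocksdb::DB::SyncWAL(); returns 1 if the sync fails
  and 0 otherwise.
*/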
static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)),
                              ulonglong target_lsn __attribute__((__unused__)))
{
  DBUG_ASSERT(rdb != nullptr);
  rocksdb::Status s= rdb->SyncWAL();
  if (!s.ok()) {
    return 1;
  }
  return 0;
}
|
|
|
|
/**
  For a slave, prepare() updates the slave_gtid_info table which tracks the
  replication progress.
*/
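/*
  For a two-phase-capable transaction this also names the RocksDB transaction
  after the serialized XID (see rdb_xid_to_string()) and calls Prepare() on
  it, so that rocksdb_recover() can find the prepared transaction again after
  a crash.
*/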
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx,
|
|
bool async)
|
|
{
|
|
Rdb_transaction*& tx= get_tx_from_thd(thd);
|
|
if (!tx->can_prepare())
|
|
{
|
|
return 1;
|
|
}
|
|
if (prepare_tx ||
|
|
(!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
|
|
/* We were instructed to prepare the whole transaction, or
|
|
this is an SQL statement end and autocommit is on */
|
|
std::vector<st_slave_gtid_info> slave_gtid_info;
|
|
my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
|
|
for (auto it : slave_gtid_info) {
|
|
rocksdb::WriteBatchBase* write_batch = tx->get_blind_write_batch();
|
|
binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch);
|
|
}
|
|
|
|
if (tx->is_two_phase()) {
|
|
if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
|
|
tx->set_sync(false);
|
|
}
|
|
XID xid;
|
|
thd_get_xid(thd, reinterpret_cast<MYSQL_XID*>(&xid));
|
|
if (!tx->prepare(rdb_xid_to_string(xid))) {
|
|
return 1;
|
|
}
|
|
if (thd->durability_property == HA_IGNORE_DURABILITY) {
|
|
/**
|
|
we set the log sequence as '1' just to trigger hton->flush_logs
|
|
*/
|
|
thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
|
|
}
|
|
}
|
|
|
|
DEBUG_SYNC(thd, "rocksdb.prepared");
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
do nothing for prepare/commit by xid
|
|
this is needed to avoid crashes in XA scenarios
|
|
*/
|
|
static int rocksdb_commit_by_xid(handlerton* hton, XID* xid)
|
|
{
|
|
auto name= rdb_xid_to_string(*xid);
|
|
rocksdb::Transaction *trx= rdb->GetTransactionByName(name);
|
|
if (trx == nullptr) {
|
|
return 1;
|
|
}
|
|
rocksdb::Status s= trx->Commit();
|
|
if (!s.ok()) {
|
|
return 1;
|
|
}
|
|
delete trx;
|
|
return 0;
|
|
}
|
|
|
|
static int rocksdb_rollback_by_xid(handlerton* hton __attribute__((__unused__)),
|
|
XID* xid)
|
|
{
|
|
auto name= rdb_xid_to_string(*xid);
|
|
rocksdb::Transaction *trx= rdb->GetTransactionByName(name);
|
|
if (trx == nullptr) {
|
|
return 1;
|
|
}
|
|
rocksdb::Status s= trx->Rollback();
|
|
if (!s.ok()) {
|
|
return 1;
|
|
}
|
|
delete trx;
|
|
return 0;
|
|
}
|
|
|
|
/**
  Rebuilds an XID from a serialized version stored in a string.
*/
static void rdb_xid_from_string(const std::string& src, XID *dst)
{
  DBUG_ASSERT(dst != nullptr);
  uint offset= 0;
  uint64 raw_fid8=
    rdb_netbuf_to_uint64(reinterpret_cast<const uchar*>(src.data()));
  int64 signed_fid8= *reinterpret_cast<int64*>(&raw_fid8);
  dst->formatID= signed_fid8;
  offset += RDB_FORMATID_SZ;
  dst->gtrid_length= src.at(offset);
  offset += RDB_GTRID_SZ;
  dst->bqual_length= src.at(offset);
  offset += RDB_BQUAL_SZ;

  DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);

  src.copy(dst->data, (dst->gtrid_length)+(dst->bqual_length), RDB_XIDHDR_LEN);
}
|
|
|
|
/**
  Reading last committed binary log info from RocksDB system row.
  The info is needed for crash safe slave/master to work.
*/
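/*
  In addition to reading the binlog position, this fills xid_list with the
  XIDs of transactions that RocksDB has prepared but not yet committed, and
  returns how many entries were filled in (at most len).
*/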
static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len,
|
|
char* binlog_file, my_off_t* binlog_pos)
|
|
{
|
|
if (binlog_file && binlog_pos)
|
|
{
|
|
char file_buf[FN_REFLEN+1]= {0};
|
|
my_off_t pos;
|
|
char gtid_buf[FN_REFLEN+1]= {0};
|
|
if (binlog_manager.read(file_buf, &pos, gtid_buf))
|
|
{
|
|
if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos))
|
|
{
|
|
memcpy(binlog_file, file_buf, FN_REFLEN + 1);
|
|
*binlog_pos= pos;
|
|
fprintf(stderr, "RocksDB: Last binlog file position %llu,"
|
|
" file name %s\n", pos, file_buf);
|
|
if (*gtid_buf)
|
|
{
|
|
fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (len == 0 || xid_list == nullptr)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
std::vector<rocksdb::Transaction*> trans_list;
|
|
rdb->GetAllPreparedTransactions(&trans_list);
|
|
|
|
uint count= 0;
|
|
for (auto& trans : trans_list)
|
|
{
|
|
if (count >= len)
|
|
{
|
|
break;
|
|
}
|
|
auto name= trans->GetName();
|
|
rdb_xid_from_string(name, &xid_list[count]);
|
|
count++;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx, bool)
|
|
{
|
|
DBUG_ENTER("rocksdb_commit");
|
|
|
|
/* this will trigger saving of perf_context information */
|
|
Rdb_perf_context_guard guard(thd);
|
|
|
|
  /* note: h->external_lock(F_UNLCK) is called after this function is called */
|
|
Rdb_transaction*& tx= get_tx_from_thd(thd);
|
|
|
|
if (tx != nullptr)
|
|
{
|
|
if (commit_tx ||
|
|
(!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
|
|
{
|
|
/*
|
|
We get here
|
|
- For a COMMIT statement that finishes a multi-statement transaction
|
|
- For a statement that has its own transaction
|
|
*/
|
|
if (tx->commit())
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
We get here when committing a statement within a transaction.
|
|
|
|
We don't need to do anything here. tx->start_stmt() will notify
|
|
Rdb_transaction_impl that another statement has started.
|
|
*/
|
|
tx->set_tx_failed(false);
|
|
}
|
|
|
|
if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED)
|
|
{
|
|
// For READ_COMMITTED, we release any existing snapshot so that we will
|
|
// see any changes that occurred since the last statement.
|
|
tx->release_snapshot();
|
|
}
|
|
}
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
|
|
static int rocksdb_rollback(handlerton* hton, THD* thd, bool rollback_tx)
|
|
{
|
|
Rdb_perf_context_guard guard(thd);
|
|
Rdb_transaction*& tx= get_tx_from_thd(thd);
|
|
|
|
if (tx != nullptr)
|
|
{
|
|
if (rollback_tx)
|
|
{
|
|
/*
|
|
We get here, when
|
|
- ROLLBACK statement is issued.
|
|
|
|
Discard the changes made by the transaction
|
|
*/
|
|
tx->rollback();
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
We get here when
|
|
- a statement with AUTOCOMMIT=1 is being rolled back (because of some
|
|
error)
|
|
- a statement inside a transaction is rolled back
|
|
*/
|
|
|
|
tx->rollback_stmt();
|
|
tx->set_tx_failed(true);
|
|
}
|
|
|
|
if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED)
|
|
{
|
|
// For READ_COMMITTED, we release any existing snapshot so that we will
|
|
// see any changes that occurred since the last statement.
|
|
tx->release_snapshot();
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static bool print_stats(THD* thd,
|
|
std::string const& type,
|
|
std::string const& name,
|
|
std::string const& status,
|
|
stat_print_fn *stat_print)
|
|
{
|
|
return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
|
|
status.c_str(), status.size());
|
|
}
|
|
|
|
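/*
  printf-style formatting into a std::string: a first vsnprintf() pass
  measures the required length, then a second pass formats into a stack
  buffer of that size.
*/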
static std::string format_string(
|
|
const char *format,
|
|
...)
|
|
{
|
|
std::string res;
|
|
va_list args;
|
|
va_list args_copy;
|
|
|
|
va_start(args, format);
|
|
va_copy(args_copy, args);
|
|
|
|
size_t len = vsnprintf(nullptr, 0, format, args) + 1;
|
|
va_end(args);
|
|
|
|
if (len == 0) {
|
|
res = std::string("");
|
|
}
|
|
else {
|
|
char buff[len];
|
|
(void) vsnprintf(buff, len, format, args_copy);
|
|
|
|
res = std::string(buff);
|
|
}
|
|
|
|
va_end(args_copy);
|
|
|
|
return res;
|
|
}
|
|
|
|
class Rdb_snapshot_status : public Rdb_tx_list_walker
|
|
{
|
|
private:
|
|
std::string m_data;
|
|
|
|
static std::string current_timestamp(void)
|
|
{
|
|
static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
|
|
time_t currtime;
|
|
struct tm currtm;
|
|
|
|
time(&currtime);
|
|
|
|
localtime_r(&currtime, &currtm);
|
|
|
|
return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
|
|
currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
|
|
currtm.tm_sec);
|
|
}
|
|
|
|
static std::string get_header(void)
|
|
{
|
|
return
|
|
"\n============================================================\n" +
|
|
current_timestamp() +
|
|
" ROCKSDB TRANSACTION MONITOR OUTPUT\n"
|
|
"============================================================\n"
|
|
"---------\n"
|
|
"SNAPSHOTS\n"
|
|
"---------\n"
|
|
"LIST OF SNAPSHOTS FOR EACH SESSION:\n";
|
|
}
|
|
|
|
static std::string get_footer(void)
|
|
{
|
|
return
|
|
"-----------------------------------------\n"
|
|
"END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
|
|
"=========================================\n";
|
|
}
|
|
|
|
public:
|
|
Rdb_snapshot_status() : m_data(get_header()) {}
|
|
|
|
std::string getResult() { return m_data + get_footer(); }
|
|
|
|
  /* Implement the Rdb_tx_list_walker interface */
|
|
/* Create one row in the snapshot status table */
|
|
void process_tran(const Rdb_transaction *tx) override
|
|
{
|
|
/* Calculate the duration the snapshot has existed */
|
|
int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
|
|
if (snapshot_timestamp != 0)
|
|
{
|
|
int64_t curr_time;
|
|
rdb->GetEnv()->GetCurrentTime(&curr_time);
|
|
|
|
THD* thd = tx->get_thd();
|
|
|
|
m_data += format_string("---SNAPSHOT, ACTIVE %lld sec\n"
|
|
"MySQL thread id %lu, OS thread handle %p\n"
|
|
"lock count %llu, write count %llu\n",
|
|
curr_time - snapshot_timestamp,
|
|
my_core::thd_get_thread_id(thd), thd,
|
|
tx->get_lock_count(), tx->get_write_count());
|
|
}
|
|
}
|
|
};
|
|
|
|
/* Generate the snapshot status table */
static bool rocksdb_show_snapshot_status(handlerton* hton,
                                         THD* thd,
                                         stat_print_fn* stat_print)
{
  Rdb_snapshot_status showStatus;

  Rdb_transaction::walk_tx_list(&showStatus);

  /* Send the result data back to MySQL */
  return print_stats(thd, "SNAPSHOTS", "rocksdb", showStatus.getResult(),
                     stat_print);
}
|
|
|
|
/*
|
|
This is called for SHOW ENGINE ROCKSDB STATUS|LOGS|etc.
|
|
|
|
For now, produce info about live files (which gives an imprecise idea about
|
|
what column families are there)
|
|
*/
|
|
|
|
static bool rocksdb_show_status(handlerton* hton,
|
|
THD* thd,
|
|
stat_print_fn* stat_print,
|
|
enum ha_stat_type stat_type)
|
|
{
|
|
bool res= false;
|
|
if (stat_type == HA_ENGINE_STATUS)
|
|
{
|
|
std::string str;
|
|
|
|
/* Per DB stats */
|
|
if (rdb->GetProperty("rocksdb.dbstats", &str)) {
|
|
res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
|
|
}
|
|
|
|
/* Per column family stats */
|
|
for (auto cf_name : cf_manager.get_cf_names())
|
|
{
|
|
rocksdb::ColumnFamilyHandle* cfh;
|
|
bool is_automatic;
|
|
|
|
/*
|
|
Only the cf name is important. Whether it was generated automatically
|
|
does not matter, so is_automatic is ignored.
|
|
*/
|
|
cfh= cf_manager.get_cf(cf_name.c_str(), "", nullptr, &is_automatic);
|
|
if (cfh == nullptr)
|
|
continue;
|
|
|
|
if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str))
|
|
continue;
|
|
|
|
res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
|
|
}
|
|
|
|
/* Memory Statistics */
|
|
std::vector<rocksdb::DB*> dbs;
|
|
std::unordered_set<const rocksdb::Cache*> cache_set;
|
|
size_t internal_cache_count = 0;
|
|
size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
|
|
char buf[100];
|
|
|
|
dbs.push_back(rdb);
|
|
cache_set.insert(rocksdb_tbl_options.block_cache.get());
|
|
for (const auto& cf_handle : cf_manager.get_all_cf())
|
|
{
|
|
rocksdb::ColumnFamilyDescriptor cf_desc;
|
|
cf_handle->GetDescriptor(&cf_desc);
|
|
auto* table_factory = cf_desc.options.table_factory.get();
|
|
if (table_factory != nullptr)
|
|
{
|
|
std::string tf_name = table_factory->Name();
|
|
if (tf_name.find("BlockBasedTable") != std::string::npos)
|
|
{
|
|
const rocksdb::BlockBasedTableOptions* bbt_opt =
|
|
reinterpret_cast<rocksdb::BlockBasedTableOptions*>(
|
|
table_factory->GetOptions());
|
|
if (bbt_opt != nullptr)
|
|
{
|
|
if (bbt_opt->block_cache.get() != nullptr)
|
|
{
|
|
cache_set.insert(bbt_opt->block_cache.get());
|
|
}
|
|
else
|
|
{
|
|
internal_cache_count++;
|
|
}
|
|
cache_set.insert(bbt_opt->block_cache_compressed.get());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
|
|
str.clear();
|
|
rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(
|
|
dbs, cache_set, &temp_usage_by_type);
|
|
snprintf(buf, sizeof(buf), "\nMemTable Total: %lu",
|
|
temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
|
|
str.append(buf);
|
|
snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %lu",
|
|
temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
|
|
str.append(buf);
|
|
snprintf(buf, sizeof(buf), "\nTable Readers Total: %lu",
|
|
temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
|
|
str.append(buf);
|
|
snprintf(buf, sizeof(buf), "\nCache Total: %lu",
|
|
temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
|
|
str.append(buf);
|
|
snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %lu",
|
|
internal_cache_count * kDefaultInternalCacheSize);
|
|
str.append(buf);
|
|
res |= print_stats(thd, "Memory_Stats", "rocksdb", str, stat_print);
|
|
}
|
|
else if (stat_type == HA_ENGINE_TRX)
|
|
{
|
|
/* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */
|
|
res |= rocksdb_show_snapshot_status(hton, thd, stat_print);
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
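/*
  Register the RocksDB handlerton with the server for the current statement
  and, inside an explicit multi-statement transaction, for the whole
  transaction as well (which also opens a new statement scope in tx).
*/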
static inline void rocksdb_register_tx(handlerton *hton, THD *thd,
                                       Rdb_transaction *tx)
{
  trans_register_ha(thd, FALSE, rocksdb_hton);
  if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
  {
    tx->start_stmt();
    trans_register_ha(thd, TRUE, rocksdb_hton);
  }
}
|
|
|
|
/*
|
|
Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT
|
|
|
|
Features:
|
|
1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
|
|
2. Getting current binlog position in addition to #1.
|
|
|
|
The second feature is done by START TRANSACTION WITH
|
|
CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and
|
|
it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT.
|
|
|
|
- When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT
|
|
takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
|
|
participate in transaction. When executing COMMIT, both InnoDB and
|
|
RocksDB modifications are committed. Remember that XA is not supported yet,
|
|
so mixing engines is not recommended anyway.
|
|
|
|
- When setting engine, START TRANSACTION WITH CONSISTENT.. takes
|
|
snapshot for the specified engine only. But it starts both
|
|
InnoDB and RocksDB transactions.
|
|
*/
|
|
static int rocksdb_start_tx_and_assign_read_view(
|
|
handlerton* hton, /*!< in: RocksDB handlerton */
|
|
THD* thd, /*!< in: MySQL thread handle of the
|
|
user for whom the transaction should
|
|
be committed */
|
|
char* binlog_file, /* out: binlog file for last commit */
|
|
ulonglong* binlog_pos, /* out: binlog pos for last commit */
|
|
char** gtid_executed, /* out: Gtids logged until last commit */
|
|
int* gtid_executed_length) /*out: Length of gtid_executed string */
|
|
{
|
|
Rdb_perf_context_guard guard(thd);
|
|
|
|
ulong const tx_isolation = my_core::thd_tx_isolation(thd);
|
|
|
|
if (tx_isolation != ISO_REPEATABLE_READ)
|
|
{
|
|
my_printf_error(ER_UNKNOWN_ERROR,
|
|
"Only REPEATABLE READ isolation level is supported "
|
|
"for START TRANSACTION WITH CONSISTENT SNAPSHOT "
|
|
"in RocksDB Storage Engine.", MYF(0));
|
|
return 1;
|
|
}
|
|
|
|
if (binlog_file)
|
|
{
|
|
if (binlog_pos && mysql_bin_log.is_open())
|
|
mysql_bin_log_lock_commits();
|
|
else
|
|
return 1;
|
|
}
|
|
|
|
Rdb_transaction* tx= get_or_create_tx(thd);
|
|
DBUG_ASSERT(!tx->has_snapshot());
|
|
tx->set_tx_read_only(true);
|
|
rocksdb_register_tx(hton, thd, tx);
|
|
tx->acquire_snapshot(true);
|
|
|
|
if (binlog_file)
|
|
mysql_bin_log_unlock_commits(binlog_file, binlog_pos, gtid_executed,
|
|
gtid_executed_length);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Dummy SAVEPOINT support. This is needed for long running transactions
|
|
* like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
|
|
* Current SAVEPOINT does not correctly handle ROLLBACK and does not return
|
|
* errors. This needs to be addressed in future versions (Issue#96).
|
|
*/
|
|
static int rocksdb_savepoint(handlerton *hton, THD *thd, void *savepoint)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static int rocksdb_rollback_to_savepoint(handlerton *hton, THD *thd,
|
|
void *savepoint)
|
|
{
|
|
Rdb_transaction*& tx= get_tx_from_thd(thd);
|
|
return tx->rollback_to_savepoint(savepoint);
|
|
}
|
|
|
|
static bool rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *hton,
|
|
THD *thd)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
#ifdef MARIAROCKS_NOT_YET
|
|
/*
|
|
This is called for INFORMATION_SCHEMA
|
|
*/
|
|
static void rocksdb_update_table_stats(
|
|
/* per-table stats callback */
|
|
void (*cb)(const char* db, const char* tbl, bool is_partition,
|
|
my_io_perf_t* r, my_io_perf_t* w, my_io_perf_t* r_blob,
|
|
my_io_perf_t* r_primary, my_io_perf_t* r_secondary,
|
|
page_stats_t *page_stats, comp_stats_t *comp_stats,
|
|
int n_lock_wait, int n_lock_wait_timeout,
|
|
const char* engine))
|
|
{
|
|
my_io_perf_t io_perf_read;
|
|
my_io_perf_t io_perf;
|
|
page_stats_t page_stats;
|
|
comp_stats_t comp_stats;
|
|
std::vector<std::string> tablenames;
|
|
|
|
/*
|
|
Most of these are for innodb, so setting them to 0.
|
|
TODO: possibly separate out primary vs. secondary index reads
|
|
*/
|
|
memset(&io_perf, 0, sizeof(io_perf));
|
|
memset(&page_stats, 0, sizeof(page_stats));
|
|
memset(&comp_stats, 0, sizeof(comp_stats));
|
|
|
|
tablenames= rdb_open_tables.get_table_names();
|
|
|
|
for (const auto& it : tablenames)
|
|
{
|
|
Rdb_table_handler *table_handler;
|
|
std::string str, dbname, tablename, partname;
|
|
char dbname_sys[NAME_LEN + 1];
|
|
char tablename_sys[NAME_LEN + 1];
|
|
bool is_partition;
|
|
|
|
if (rdb_normalize_tablename(it, &str)) {
|
|
/* Function needs to return void because of the interface and we've
|
|
* detected an error which shouldn't happen. There's no way to let
|
|
* caller know that something failed.
|
|
*/
|
|
SHIP_ASSERT(false);
|
|
return;
|
|
}
|
|
|
|
if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
is_partition= (partname.size() != 0);
|
|
|
|
table_handler= rdb_open_tables.get_table_handler(it.c_str());
|
|
if (table_handler == nullptr)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
io_perf_read.bytes= table_handler->m_io_perf_read.bytes.load();
|
|
io_perf_read.requests= table_handler->m_io_perf_read.requests.load();
|
|
|
|
/*
|
|
Convert from rocksdb timer to mysql timer. RocksDB values are
|
|
in nanoseconds, but table statistics expect the value to be
|
|
in my_timer format.
|
|
*/
|
|
io_perf_read.svc_time= my_core::microseconds_to_my_timer(
|
|
table_handler->m_io_perf_read.svc_time.load() / 1000);
|
|
io_perf_read.svc_time_max= my_core::microseconds_to_my_timer(
|
|
table_handler->m_io_perf_read.svc_time_max.load() / 1000);
|
|
io_perf_read.wait_time= my_core::microseconds_to_my_timer(
|
|
table_handler->m_io_perf_read.wait_time.load() / 1000);
|
|
io_perf_read.wait_time_max= my_core::microseconds_to_my_timer(
|
|
table_handler->m_io_perf_read.wait_time_max.load() / 1000);
|
|
io_perf_read.slow_ios= table_handler->m_io_perf_read.slow_ios.load();
|
|
rdb_open_tables.release_table_handler(table_handler);
|
|
|
|
/*
|
|
Table stats expects our database and table name to be in system encoding,
|
|
not filename format. Convert before calling callback.
|
|
*/
|
|
my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
|
|
sizeof(dbname_sys));
|
|
my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
|
|
sizeof(tablename_sys));
|
|
(*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read, &io_perf,
|
|
&io_perf, &io_perf, &io_perf, &page_stats, &comp_stats, 0, 0,
|
|
rocksdb_hton_name);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
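/*
  Compare the options we are about to open the database with against the
  options RocksDB persisted from the previous run, and refuse to start if
  they are incompatible, as a guard against silent data corruption.
*/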
static rocksdb::Status check_rocksdb_options_compatibility(
|
|
const char *dbpath,
|
|
const rocksdb::Options& main_opts,
|
|
const std::vector<rocksdb::ColumnFamilyDescriptor>& cf_descr)
|
|
{
|
|
DBUG_ASSERT(rocksdb_datadir != nullptr);
|
|
|
|
rocksdb::DBOptions loaded_db_opt;
|
|
std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
|
|
rocksdb::Status status = LoadLatestOptions(dbpath,
|
|
rocksdb::Env::Default(), &loaded_db_opt,
|
|
&loaded_cf_descs);
|
|
|
|
// If we're starting from scratch and there are no options saved yet then this
|
|
// is a valid case. Therefore we can't compare the current set of options to
|
|
// anything.
|
|
if (status.IsNotFound()) {
|
|
return rocksdb::Status::OK();
|
|
}
|
|
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
|
|
if (loaded_cf_descs.size() != cf_descr.size()) {
|
|
return rocksdb::Status::NotSupported("Mismatched size of column family " \
|
|
"descriptors.");
|
|
}
|
|
|
|
// Please see RocksDB documentation for more context about why we need to set
|
|
// user-defined functions and pointer-typed options manually.
|
|
for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
|
|
loaded_cf_descs[i].options.compaction_filter =
|
|
cf_descr[i].options.compaction_filter;
|
|
loaded_cf_descs[i].options.compaction_filter_factory =
|
|
cf_descr[i].options.compaction_filter_factory;
|
|
loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
|
|
loaded_cf_descs[i].options.memtable_factory =
|
|
cf_descr[i].options.memtable_factory;
|
|
loaded_cf_descs[i].options.merge_operator =
|
|
cf_descr[i].options.merge_operator;
|
|
loaded_cf_descs[i].options.prefix_extractor =
|
|
cf_descr[i].options.prefix_extractor;
|
|
loaded_cf_descs[i].options.table_factory =
|
|
cf_descr[i].options.table_factory;
|
|
}
|
|
|
|
// This is the essence of the function - determine if it's safe to open the
|
|
// database or not.
|
|
status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(),
|
|
main_opts, loaded_cf_descs);
|
|
|
|
return status;
|
|
}
|
|
|
|
/*
|
|
Storage Engine initialization function, invoked when plugin is loaded.
|
|
*/
|
|
|
|
static int rocksdb_init_func(void *p)
|
|
{
|
|
DBUG_ENTER("rocksdb_init_func");
|
|
|
|
// Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
|
|
static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
|
|
|
|
#ifdef HAVE_PSI_INTERFACE
|
|
init_rocksdb_psi_keys();
|
|
#endif
|
|
|
|
rocksdb_hton= (handlerton *)p;
|
|
mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex,
|
|
MY_MUTEX_INIT_FAST);
|
|
#ifdef HAVE_PSI_INTERFACE
|
|
rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key,
|
|
rdb_signal_bg_psi_cond_key);
|
|
rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
|
|
rdb_signal_drop_idx_psi_cond_key);
|
|
#else
|
|
rdb_bg_thread.init();
|
|
rdb_drop_idx_thread.init();
|
|
#endif
|
|
mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
|
|
MY_MUTEX_INIT_FAST);
|
|
mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
|
|
MY_MUTEX_INIT_FAST);
|
|
|
|
#if defined(HAVE_PSI_INTERFACE)
|
|
rdb_collation_exceptions = new Regex_list_handler(
|
|
key_rwlock_collation_exception_list);
|
|
#else
|
|
rdb_collation_exceptions = new Regex_list_handler();
|
|
#endif
|
|
|
|
mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
|
|
MY_MUTEX_INIT_FAST);
|
|
rdb_open_tables.init_hash();
|
|
Rdb_transaction::init_mutex();
|
|
|
|
rocksdb_hton->state= SHOW_OPTION_YES;
|
|
rocksdb_hton->create= rocksdb_create_handler;
|
|
rocksdb_hton->close_connection= rocksdb_close_connection;
|
|
rocksdb_hton->prepare= rocksdb_prepare;
|
|
rocksdb_hton->commit_by_xid= rocksdb_commit_by_xid;
|
|
rocksdb_hton->rollback_by_xid= rocksdb_rollback_by_xid;
|
|
rocksdb_hton->recover= rocksdb_recover;
|
|
rocksdb_hton->commit= rocksdb_commit;
|
|
rocksdb_hton->rollback= rocksdb_rollback;
|
|
rocksdb_hton->db_type= DB_TYPE_ROCKSDB;
|
|
rocksdb_hton->show_status= rocksdb_show_status;
|
|
rocksdb_hton->start_consistent_snapshot=
|
|
rocksdb_start_tx_and_assign_read_view;
|
|
rocksdb_hton->savepoint_set= rocksdb_savepoint;
|
|
rocksdb_hton->savepoint_rollback= rocksdb_rollback_to_savepoint;
|
|
rocksdb_hton->savepoint_rollback_can_release_mdl=
|
|
rocksdb_rollback_to_savepoint_can_release_mdl;
|
|
#ifdef MARIAROCKS_NOT_YET
|
|
rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
|
|
#endif // MARIAROCKS_NOT_YET
|
|
rocksdb_hton->flush_logs= rocksdb_flush_wal;
|
|
|
|
rocksdb_hton->flags= HTON_TEMPORARY_NOT_SUPPORTED |
|
|
HTON_SUPPORTS_EXTENDED_KEYS |
|
|
HTON_CAN_RECREATE;
|
|
|
|
DBUG_ASSERT(!mysqld_embedded);
|
|
|
|
rocksdb_stats= rocksdb::CreateDBStatistics();
|
|
rocksdb_db_options.statistics= rocksdb_stats;
|
|
|
|
if (rocksdb_rate_limiter_bytes_per_sec != 0) {
|
|
rocksdb_rate_limiter.reset(rocksdb::NewGenericRateLimiter(
|
|
rocksdb_rate_limiter_bytes_per_sec));
|
|
rocksdb_db_options.rate_limiter= rocksdb_rate_limiter;
|
|
}
|
|
|
|
std::shared_ptr<Rdb_logger> myrocks_logger= std::make_shared<Rdb_logger>();
|
|
rocksdb::Status s= rocksdb::CreateLoggerFromOptions(
|
|
rocksdb_datadir, rocksdb_db_options, &rocksdb_db_options.info_log);
|
|
if (s.ok()) {
|
|
myrocks_logger->SetRocksDBLogger(rocksdb_db_options.info_log);
|
|
}
|
|
|
|
rocksdb_db_options.info_log= myrocks_logger;
|
|
myrocks_logger->SetInfoLogLevel(
|
|
static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
|
|
rocksdb_db_options.wal_dir= rocksdb_wal_dir;
|
|
|
|
rocksdb_db_options.wal_recovery_mode=
|
|
static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
|
|
|
|
rocksdb_db_options.access_hint_on_compaction_start=
|
|
static_cast<rocksdb::Options::AccessHint>
|
|
(rocksdb_access_hint_on_compaction_start);
|
|
|
|
if (rocksdb_db_options.allow_mmap_reads &&
|
|
!rocksdb_db_options.allow_os_buffer)
|
|
{
|
|
// allow_mmap_reads implies allow_os_buffer and RocksDB will not open if
|
|
// mmap_reads is on and os_buffer is off. (NO_LINT_DEBUG)
|
|
sql_print_error("RocksDB: Can't disable allow_os_buffer "
|
|
"if allow_mmap_reads is enabled\n");
|
|
rdb_open_tables.free_hash();
|
|
DBUG_RETURN(1);
|
|
}
|
|
|
|
std::vector<std::string> cf_names;
|
|
rocksdb::Status status;
|
|
status= rocksdb::DB::ListColumnFamilies(rocksdb_db_options, rocksdb_datadir,
|
|
&cf_names);
|
|
if (!status.ok())
|
|
{
|
|
/*
|
|
When we start on an empty datadir, ListColumnFamilies returns IOError,
|
|
and RocksDB doesn't provide any way to check what kind of error it was.
|
|
Checking system errno happens to work right now.
|
|
*/
|
|
if (status.IsIOError() && errno == ENOENT)
|
|
{
|
|
sql_print_information("RocksDB: Got ENOENT when listing column families");
|
|
sql_print_information("RocksDB: assuming that we're creating a new database");
|
|
}
|
|
else
|
|
{
|
|
std::string err_text= status.ToString();
|
|
sql_print_error("RocksDB: Error listing column families: %s", err_text.c_str());
|
|
rdb_open_tables.free_hash();
|
|
DBUG_RETURN(1);
|
|
}
|
|
}
|
|
else
|
|
sql_print_information("RocksDB: %ld column families found", cf_names.size());
|
|
|
|
std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
|
|
std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
|
|
|
|
rocksdb_tbl_options.index_type=
|
|
(rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
|
|
|
|
if (!rocksdb_tbl_options.no_block_cache) {
|
|
rocksdb_tbl_options.block_cache=
|
|
rocksdb::NewLRUCache(rocksdb_block_cache_size);
|
|
}
|
|
// Using newer BlockBasedTable format version for better compression
|
|
// and better memory allocation.
|
|
// See: https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
|
|
  rocksdb_tbl_options.format_version= 2;

  if (rocksdb_collect_sst_properties) {
    properties_collector_factory = std::make_shared
      <Rdb_tbl_prop_coll_factory>(
        &ddl_manager
      );

    rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);

    mysql_mutex_lock(&rdb_sysvars_mutex);

    DBUG_ASSERT(rocksdb_table_stats_sampling_pct
                <= RDB_TBL_STATS_SAMPLE_PCT_MAX);
    properties_collector_factory->SetTableStatsSamplingPct(
      rocksdb_table_stats_sampling_pct);

    mysql_mutex_unlock(&rdb_sysvars_mutex);
  }

  if (!rocksdb_cf_options_map.init(ROCKSDB_WRITE_BUFFER_SIZE_DEFAULT,
                                   rocksdb_tbl_options,
                                   properties_collector_factory,
                                   rocksdb_default_cf_options,
                                   rocksdb_override_cf_options))
  {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize CF options map.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  /*
    If there are no column families, we're creating a new database.
    Create one column family named "default".
  */
  if (cf_names.size() == 0)
    cf_names.push_back(DEFAULT_CF_NAME);

  std::vector<int> compaction_enabled_cf_indices;
  sql_print_information("RocksDB: Column Families at start:");
  for (size_t i = 0; i < cf_names.size(); ++i)
  {
    rocksdb::ColumnFamilyOptions opts;
    rocksdb_cf_options_map.get_cf_options(cf_names[i], &opts);

    sql_print_information(" cf=%s", cf_names[i].c_str());
    sql_print_information(" write_buffer_size=%ld", opts.write_buffer_size);
    sql_print_information(" target_file_size_base=%" PRIu64,
                          opts.target_file_size_base);

    /*
      Temporarily disable compactions to prevent a race condition where
      compaction starts before the compaction filter is ready.
    */
    if (!opts.disable_auto_compactions)
    {
      compaction_enabled_cf_indices.push_back(i);
      opts.disable_auto_compactions = true;
    }
    cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
  }

  rocksdb::Options main_opts(rocksdb_db_options,
                             rocksdb_cf_options_map.get_defaults());

#ifdef MARIAROCKS_NOT_YET
  /*
    Flashcache configuration:
    When running on Flashcache, mysqld opens the Flashcache device before
    initializing storage engines and stores its file descriptor in the
    cachedev_fd global variable.
    RocksDB has a Flashcache-aware configuration. When it is enabled,
    RocksDB adds its background threads to the Flashcache blacklist, which
    makes sense for Flashcache use cases.
  */
  if (cachedev_enabled)
  {
    flashcache_aware_env=
      rocksdb::NewFlashcacheAwareEnv(rocksdb::Env::Default(),
                                     cachedev_fd);
    if (flashcache_aware_env.get() == nullptr)
    {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Failed to open flashcache device at fd %d",
                      cachedev_fd);
      rdb_open_tables.free_hash();
      DBUG_RETURN(1);
    }
    sql_print_information("RocksDB: Disabling flashcache on background "
                          "writer threads, fd %d", cachedev_fd);
    main_opts.env= flashcache_aware_env.get();
  }
#endif

  main_opts.env->SetBackgroundThreads(main_opts.max_background_flushes,
                                      rocksdb::Env::Priority::HIGH);
  main_opts.env->SetBackgroundThreads(main_opts.max_background_compactions,
                                      rocksdb::Env::Priority::LOW);
  rocksdb::TransactionDBOptions tx_db_options;
  tx_db_options.transaction_lock_timeout= 2;  // 2 seconds
  tx_db_options.custom_mutex_factory= std::make_shared<Rdb_mutex_factory>();

  status= check_rocksdb_options_compatibility(rocksdb_datadir, main_opts,
                                              cf_descr);

  // We won't start if we determine that there's a chance of data corruption
  // because of incompatible options.
  if (!status.ok()) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: compatibility check against existing database "
                    "options failed. %s", status.ToString().c_str());
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  status= rocksdb::TransactionDB::Open(main_opts, tx_db_options,
                                       rocksdb_datadir, cf_descr,
                                       &cf_handles, &rdb);

  if (!status.ok())
  {
    std::string err_text= status.ToString();
    sql_print_error("RocksDB: Error opening instance: %s", err_text.c_str());
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }
  cf_manager.init(&rocksdb_cf_options_map, &cf_handles);

  if (dict_manager.init(rdb->GetBaseDB(), &cf_manager))
  {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize data dictionary.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  if (binlog_manager.init(&dict_manager))
  {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize binlog manager.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables))
  {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Failed to initialize DDL manager.");
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  Rdb_sst_info::init(rdb);

  /*
    Enable auto compaction: everything the compaction filter needs has
    finished initializing.
  */
  std::vector<rocksdb::ColumnFamilyHandle*> compaction_enabled_cf_handles;
  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
  for (auto index : compaction_enabled_cf_indices)
  {
    compaction_enabled_cf_handles.push_back(cf_handles[index]);
  }

  status= rdb->EnableAutoCompaction(compaction_enabled_cf_handles);

  if (!status.ok())
  {
    std::string err_text= status.ToString();
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Error enabling compaction: %s", err_text.c_str());
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  auto err= rdb_bg_thread.create_thread(
#ifdef HAVE_PSI_INTERFACE
    rdb_background_psi_thread_key
#endif
  );
  if (err != 0) {
    sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
                    err);
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  err= rdb_drop_idx_thread.create_thread(
#ifdef HAVE_PSI_INTERFACE
    rdb_drop_idx_psi_thread_key
#endif
  );
  if (err != 0) {
    sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
                    err);
    rdb_open_tables.free_hash();
    DBUG_RETURN(1);
  }

  rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);

  if (rocksdb_pause_background_work) {
    rdb->PauseBackgroundWork();
  }

  sql_print_information("RocksDB instance opened");
  DBUG_RETURN(0);
}

/*
  Storage Engine deinitialization function, invoked when the plugin is
  unloaded.
*/

static int rocksdb_done_func(void *p)
{
  int error= 0;
  DBUG_ENTER("rocksdb_done_func");

  // Signal the drop index thread to stop.
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables so that no data is lost, even if the WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work.
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err= rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err= rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)",
                    err);
  }

  if (rdb_open_tables.m_hash.records)
  {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error= 1;
  }

  rdb_open_tables.free_hash();
  mysql_mutex_destroy(&rdb_open_tables.m_mutex);
  mysql_mutex_destroy(&rdb_sysvars_mutex);

  delete rdb_collation_exceptions;
  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  for (auto& it : rdb_collation_data)
  {
    delete it;
    it= nullptr;
  }

  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb= nullptr;

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but improves shutdown time.
  // Don't disown when running under valgrind.
#ifndef HAVE_purify
  if (rocksdb_tbl_options.block_cache)
  {
    rocksdb_tbl_options.block_cache->DisownData();
  }
#endif /* HAVE_purify */

  DBUG_RETURN(error);
}

/**
  @brief
  Look up (and, if needed, create) the Rdb_table_handler that is shared by
  all ha_rocksdb instances opened against the same table. The structure
  holds the pieces used for table-level locking, so every open handler
  needs a reference to one.
*/

Rdb_table_handler* Rdb_open_tables_map::get_table_handler(
    const char *table_name)
{
  Rdb_table_handler *table_handler;
  uint length;
  char *tmp_name;

  DBUG_ASSERT(table_name != nullptr);
  length=(uint) strlen(table_name);

  // First, look up the table in the hash map.
  mysql_mutex_lock(&m_mutex);
  if (!(table_handler= reinterpret_cast<Rdb_table_handler*>(
          my_hash_search(&m_hash, reinterpret_cast<const uchar*>(table_name),
                         length))))
  {
    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    if (!(table_handler= reinterpret_cast<Rdb_table_handler*>(
            my_multi_malloc(MYF(MY_WME | MY_ZEROFILL),
                            &table_handler, sizeof(*table_handler),
                            &tmp_name, length+1,
                            NullS))))
    {
      // Allocating a new Rdb_table_handler and a new table name failed.
      mysql_mutex_unlock(&m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count= 0;
    table_handler->m_table_name_length= length;
    table_handler->m_table_name= tmp_name;
    strmov(table_handler->m_table_name, table_name);

    if (my_hash_insert(&m_hash, reinterpret_cast<uchar*>(table_handler)))
    {
      // Inserting into the hash map failed.
      mysql_mutex_unlock(&m_mutex);
      my_free(table_handler);
      return nullptr;
    }

    thr_lock_init(&table_handler->m_thr_lock);
    my_io_perf_atomic_init(&table_handler->m_io_perf_read);
  }
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  table_handler->m_ref_count++;

  mysql_mutex_unlock(&m_mutex);

  return table_handler;
}

std::vector<std::string> rdb_get_open_table_names(void)
{
  return rdb_open_tables.get_table_names();
}


std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const
{
  ulong i;
  const Rdb_table_handler *table_handler;
  std::vector<std::string> names;

  mysql_mutex_lock(&m_mutex);
  for (i = 0;
       (table_handler= reinterpret_cast<const Rdb_table_handler*>(
          my_hash_const_element(&m_hash, i)));
       i++)
  {
    DBUG_ASSERT(table_handler != nullptr);
    names.push_back(table_handler->m_table_name);
  }
  DBUG_ASSERT(i == m_hash.records);
  mysql_mutex_unlock(&m_mutex);

  return names;
}

void ha_rocksdb::load_auto_incr_value()
{
  int save_active_index= active_index;
  active_index= table->s->next_number_index;
  uint8 save_table_status= table->status;

  /*
    load_auto_incr_value() may be called by statements that
    do not execute implicit commits (e.g. SHOW CREATE TABLE).
    index_last() creates a snapshot. When a snapshot is created
    here, it has to be released as well. (GitHub issue#189)
  */
  Rdb_transaction *tx= get_or_create_tx(table->in_use);
  bool is_new_snapshot= !tx->has_snapshot();

  // Do a lookup. We only need the index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is
  // not set appropriately and a non-index-only lookup will not read the value)
  bool save_keyread_only= m_keyread_only;
  m_keyread_only= true;

  if (!index_last(table->record[0]))
    update_auto_incr_val();

  m_keyread_only= save_keyread_only;
  if (is_new_snapshot)
  {
    tx->release_snapshot();
  }

  table->status= save_table_status;
  active_index= save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();
}

/* Get the auto_increment value from table->record[0]. */
/*
  TODO(alexyang): No existing support for auto_increment on non-pk columns, see
  end of ha_rocksdb::create. Also see the open issue here:
  https://github.com/facebook/mysql-5.6/issues/153
*/
void ha_rocksdb::update_auto_incr_val()
{
  Field *field;
  longlong new_val;
  field= table->key_info[table->s->next_number_index].key_part[0].field;

  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->read_set);
  new_val= field->val_int() + 1;
  dbug_tmp_restore_column_map(table->read_set, old_map);

  longlong auto_incr_val= m_tbl_def->m_auto_incr_val;
  while (auto_incr_val < new_val &&
         !m_tbl_def->m_auto_incr_val.compare_exchange_weak(
             auto_incr_val, new_val))
  {
    // Do nothing - just loop until auto_incr_val is >= new_val or
    // we successfully set it
  }
}
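
/*
  Note on the loop above: m_tbl_def->m_auto_incr_val is an atomic counter, so
  compare_exchange_weak() either raises it to new_val or reloads the current
  value into auto_incr_val and retries. A rough sketch of the intended effect
  (hypothetical concurrent inserts): if one thread computes new_val=11 and
  another computes new_val=12, the counter ends up at 12 regardless of which
  CAS lands last - it only ever moves forward, without any table-level lock.
*/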

int ha_rocksdb::load_hidden_pk_value()
{
  int save_active_index= active_index;
  active_index= m_tbl_def->m_key_count - 1;
  uint8 save_table_status= table->status;

  Rdb_transaction *tx= get_or_create_tx(table->in_use);
  bool is_new_snapshot= !tx->has_snapshot();

  // Do a lookup.
  if (!index_last(table->record[0]))
  {
    /*
      Decode PK field from the key
    */
    longlong hidden_pk_id= 0;
    if (read_hidden_pk_id_from_rowkey(&hidden_pk_id))
    {
      if (is_new_snapshot)
      {
        tx->release_snapshot();
      }
      return HA_ERR_INTERNAL_ERROR;
    }

    hidden_pk_id++;
    longlong old = m_tbl_def->m_hidden_pk_val;
    while (old < hidden_pk_id &&
           !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id))
    {}
  }

  if (is_new_snapshot)
  {
    tx->release_snapshot();
  }

  table->status= save_table_status;
  active_index= save_active_index;

  release_scan_iterator();

  return 0;
}

/* Get the next hidden PK value from m_tbl_def->m_hidden_pk_val. */
longlong ha_rocksdb::update_hidden_pk_val()
{
  DBUG_ASSERT(has_hidden_pk(table));
  longlong new_val;
  new_val= m_tbl_def->m_hidden_pk_val++;
  return new_val;
}


/* Get the id of the hidden pk from m_last_rowkey */
int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong* hidden_pk_id)
{
  DBUG_ASSERT(hidden_pk_id != nullptr);
  DBUG_ASSERT(table != nullptr);
  DBUG_ASSERT(has_hidden_pk(table));

  rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());

  // Get the hidden primary key from the old key slice
  Rdb_string_reader reader(&rowkey_slice);
  if (!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))
    return 1;

  const int length= Field_longlong::PACK_LENGTH;
  const uchar *from= reinterpret_cast<const uchar*>(reader.read(length));
  if (from == nullptr)
  {
    return 1; /* Mem-comparable image doesn't have enough bytes */
  }

  *hidden_pk_id= rdb_netbuf_read_uint64(&from);
  return 0;
}
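
/*
  Layout of a hidden-PK row key, as read by read_hidden_pk_id_from_rowkey()
  above (sizes come from the constants used there):

    [ Rdb_key_def::INDEX_NUMBER_SIZE bytes: index number             ]
    [ Field_longlong::PACK_LENGTH bytes:    hidden PK id, big-endian ]

  Both parts are stored most-significant-byte first, so a plain byte-wise
  comparison by RocksDB orders rows by (index number, hidden PK id).
*/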

/**
  @brief
  Free lock controls. We call this whenever we close a table. If the table had
  the last reference to the table_handler, then we free the memory associated
  with it.
*/

void Rdb_open_tables_map::release_table_handler(
    Rdb_table_handler *table_handler)
{
  mysql_mutex_lock(&m_mutex);

  DBUG_ASSERT(table_handler->m_ref_count > 0);
  if (!--table_handler->m_ref_count)
  {
    // Last reference was released. Tear down the hash entry.
    auto ret __attribute__((__unused__)) =
      my_hash_delete(&m_hash, reinterpret_cast<uchar*>(table_handler));
    DBUG_ASSERT(!ret);  // the hash entry must actually be found and deleted
    my_core::thr_lock_delete(&table_handler->m_thr_lock);
    my_free(table_handler);
  }

  mysql_mutex_unlock(&m_mutex);
}


static handler* rocksdb_create_handler(my_core::handlerton *hton,
                                       my_core::TABLE_SHARE *table_arg,
                                       my_core::MEM_ROOT *mem_root)
{
  return new (mem_root) ha_rocksdb(hton, table_arg);
}

ha_rocksdb::ha_rocksdb(my_core::handlerton *hton,
                       my_core::TABLE_SHARE *table_arg)
  : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr),
    m_scan_it_skips_bloom(false), m_tbl_def(nullptr),
    m_pk_descr(nullptr), m_key_descr_arr(nullptr),
    m_pk_can_be_decoded(false),
    m_maybe_unpack_info(false),
    m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr),
    m_sk_packed_tuple(nullptr),
    m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr),
    m_sk_packed_tuple_old(nullptr),
    m_pack_buffer(nullptr), m_lock_rows(RDB_LOCK_NONE),
    m_keyread_only(FALSE), m_encoder_arr(nullptr),
    m_row_checksums_checked(0),
    m_in_rpl_delete_rows(false), m_in_rpl_update_rows(false),
    m_force_skip_unique_check(false)
{}


static const char *ha_rocksdb_exts[] = {
  NullS
};


const char **ha_rocksdb::bas_ext() const
{
  return ha_rocksdb_exts;
}

bool ha_rocksdb::same_table(const ha_rocksdb& other) const
{
  return m_tbl_def->base_tablename() == other.m_tbl_def->base_tablename();
}

#ifdef MARIAROCKS_NOT_YET
bool ha_rocksdb::init_with_fields()
{
  uint pk= table_share->primary_key;
  if (pk != MAX_KEY)
  {
    uint key_parts= table_share->key_info[pk].user_defined_key_parts;
    check_keyread_allowed(pk /*PK*/, key_parts-1, true);
  }
  else
    m_pk_can_be_decoded= false;

  cached_table_flags= table_flags();
  return false; /* Ok */
}
#endif

/**
  Convert a record from table->record[0] form into a form that can be written
  into rocksdb.

  @param pk_packed_slice      Packed PK tuple. We need it in order to compute
                              and store its CRC.
  @param packed_rec      OUT  Data slice with record data.
*/

void ha_rocksdb::convert_record_to_storage_format(
    const rocksdb::Slice& pk_packed_slice,
    Rdb_string_writer *pk_unpack_info,
    rocksdb::Slice *packed_rec)
{
  DBUG_ASSERT_IMP(m_maybe_unpack_info, pk_unpack_info);
  m_storage_record.length(0);

  /* All NULL bits are initially 0 */
  m_storage_record.fill(m_null_bytes_in_rec, 0);

  // If a primary key may have non-empty unpack_info for certain values
  // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block
  // itself was prepared in Rdb_key_def::pack_record.
  if (m_maybe_unpack_info &&
      m_pk_descr->m_kv_format_version >=
      Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1)
  {
    m_storage_record.append(reinterpret_cast<char*>(pk_unpack_info->ptr()),
                            pk_unpack_info->get_current_pos());
  }

  for (uint i=0; i < table->s->fields; i++)
  {
    /* Don't pack decodable PK key parts */
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL)
    {
      continue;
    }

    Field *field= table->field[i];
    if (m_encoder_arr[i].maybe_null())
    {
      char *data= (char*)m_storage_record.ptr();
      if (field->is_null())
      {
        data[m_encoder_arr[i].m_null_offset]|= m_encoder_arr[i].m_null_mask;
        /* Don't write anything for NULL values */
        continue;
      }
    }

    if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB)
    {
      my_core::Field_blob *blob= (my_core::Field_blob*)field;
      /* Get the number of bytes needed to store the length */
      uint length_bytes= blob->pack_length() - portable_sizeof_char_ptr;

      /* Store the length of the value */
      m_storage_record.append(reinterpret_cast<char*>(blob->ptr), length_bytes);

      /* Store the blob value itself */
      char *data_ptr;
      memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar**));
      m_storage_record.append(data_ptr, blob->get_length());
    }
    else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR)
    {
      Field_varstring* field_var= (Field_varstring*)field;
      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1)
      {
        data_len= field_var->ptr[0];
      }
      else
      {
        DBUG_ASSERT(field_var->length_bytes==2);
        data_len= uint2korr(field_var->ptr);
      }
      m_storage_record.append(reinterpret_cast<char*>(field_var->ptr),
                              field_var->length_bytes + data_len);
    }
    else
    {
      /* Copy the field data */
      uint len= field->pack_length_in_rec();
      m_storage_record.append(reinterpret_cast<char*>(field->ptr), len);
    }
  }

  if (should_store_checksums())
  {
    uint32_t key_crc32= my_core::crc32(0,
                                       rdb_slice_to_uchar_ptr(&pk_packed_slice),
                                       pk_packed_slice.size());
    uint32_t val_crc32= my_core::crc32(
        0, rdb_mysql_str_to_uchar_str(&m_storage_record),
        m_storage_record.length());
    uchar key_crc_buf[RDB_CHECKSUM_SIZE];
    uchar val_crc_buf[RDB_CHECKSUM_SIZE];
    rdb_netbuf_store_uint32(key_crc_buf, key_crc32);
    rdb_netbuf_store_uint32(val_crc_buf, val_crc32);
    m_storage_record.append((const char*)&RDB_CHECKSUM_DATA_TAG, 1);
    m_storage_record.append((const char*)key_crc_buf, RDB_CHECKSUM_SIZE);
    m_storage_record.append((const char*)val_crc_buf, RDB_CHECKSUM_SIZE);
  }

  *packed_rec= rocksdb::Slice(m_storage_record.ptr(),
                              m_storage_record.length());
}
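
/*
  Rough sketch of the value ("storage record") produced by
  convert_record_to_storage_format() above:

    [ null-bit bytes    ]  m_null_bytes_in_rec bytes, one bit per nullable
                           column
    [ unpack_info block ]  optional, only when m_maybe_unpack_info and the
                           new PK format are in use
    [ per-column data   ]  only columns with STORE_ALL; VARCHARs keep their
                           length prefix, BLOBs are stored as length prefix
                           plus inlined payload
    [ checksum chunk    ]  optional: RDB_CHECKSUM_DATA_TAG byte plus the key
                           and value CRC32s
*/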

/*
  @brief
  Set up which fields will be unpacked when reading rows

  @detail
  Two special cases when we still unpack all fields:
  - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE).
  - When @@rocksdb_verify_checksums is ON (In this mode, we need to read all
    fields to find whether there is a row checksum at the end. We could skip
    the fields instead of decoding them, but currently we do decoding.)

  @seealso
    ha_rocksdb::setup_field_converters()
    ha_rocksdb::convert_record_from_storage_format()
*/
void ha_rocksdb::setup_read_decoders()
{
  m_decoders_vect.clear();

  int last_useful= 0;
  int skip_size= 0;

  for (uint i= 0; i < table->s->fields; i++)
  {
    // We only need the decoder if the whole record is stored.
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL)
    {
      continue;
    }

    if (m_lock_rows == RDB_LOCK_WRITE || m_verify_checksums ||
        bitmap_is_set(table->read_set, table->field[i]->field_index))
    {
      // We will need to decode this field
      m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size});
      last_useful= m_decoders_vect.size();
      skip_size= 0;
    }
    else
    {
      if (m_encoder_arr[i].uses_variable_len_encoding() ||
          m_encoder_arr[i].maybe_null())
      {
        // For a variable-length field, we need to read the data and skip it
        m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size});
        skip_size= 0;
      }
      else
      {
        // A fixed-width field can be skipped without looking at it.
        // Add the appropriate skip_size to the next field.
        skip_size += m_encoder_arr[i].m_pack_length_in_rec;
      }
    }
  }

  // It could be that the last few elements are varchars that just do
  // skipping. Remove them.
  m_decoders_vect.erase(m_decoders_vect.begin() + last_useful,
                        m_decoders_vect.end());
}
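
/*
  Hypothetical example of what setup_read_decoders() produces. Assume a table

    (a INT NOT NULL, b INT NOT NULL, c VARCHAR(10))

  where only column c is in table->read_set and none of the special cases
  apply: a and b are fixed-width and not needed, so they only accumulate
  skip_size (4 + 4 bytes for the two INTs), and c gets a single entry
  { encoder-of-c, decode=true, skip=8 }. Any trailing skip-only entries are
  then removed by the final erase() call.
*/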

#ifndef NDEBUG
void dbug_append_garbage_at_end(std::string &on_disk_rec)
{
  on_disk_rec.append("abc");
}

void dbug_truncate_record(std::string &on_disk_rec)
{
  on_disk_rec.resize(0);
}

void dbug_modify_rec_varchar12(std::string &on_disk_rec)
{
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0", 1);
  // Then, add a valid VARCHAR(12) value.
  res.append("\xC", 1);
  res.append("123456789ab", 12);

  on_disk_rec.assign(res);
}


void dbug_modify_key_varchar8(String &on_disk_rec)
{
  std::string res;
  // The key starts with index number
  res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE);

  // Then, a mem-comparable form of a varchar(8) value.
  res.append("ABCDE\0\0\0\xFC", 9);
  on_disk_rec.length(0);
  on_disk_rec.append(res.data(), res.size());
}

#endif

int ha_rocksdb::convert_record_from_storage_format(const rocksdb::Slice *key,
                                                   uchar *buf)
{
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(m_retrieved_record););

  rocksdb::Slice retrieved_rec_slice(&m_retrieved_record.front(),
                                     m_retrieved_record.size());
  return convert_record_from_storage_format(key, &retrieved_rec_slice, buf);
}

/*
  @brief
  Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
  storage format into buf (which can be table->record[0] or table->record[1]).

  @param key   Table record's key in mem-comparable form.
  @param buf   Store record in table->record[0] format here

  @detail
    If the table has blobs, the unpacked data in buf may keep pointers to the
    data in this->m_retrieved_record.

    The key is only needed to check its checksum value (the checksum is in
    m_retrieved_record).

  @seealso
    ha_rocksdb::setup_read_decoders()  Sets up data structures which tell which
    columns to decode.

  @return
    0      OK
    other  Error unpacking the data
*/

int ha_rocksdb::convert_record_from_storage_format(const rocksdb::Slice *key,
                                                   const rocksdb::Slice *value,
                                                   uchar *buf)
{
  DBUG_ASSERT(key != nullptr);
  DBUG_ASSERT(buf != nullptr);

  Rdb_string_reader reader(value);
  my_ptrdiff_t ptr_diff= buf - table->record[0];

  /*
    Decode PK fields from the key
  */
  DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1",
                  dbug_modify_key_varchar8(m_last_rowkey););

  const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(),
                                    m_last_rowkey.length());
  const char *unpack_info= nullptr;
  uint16 unpack_info_len= 0;
  rocksdb::Slice unpack_slice;

  /* Other fields are decoded from the value */
  const char *null_bytes= nullptr;
  if (m_null_bytes_in_rec && !(null_bytes= reader.read(m_null_bytes_in_rec)))
  {
    return HA_ERR_INTERNAL_ERROR;
  }

  if (m_maybe_unpack_info && m_pk_descr->m_kv_format_version >=
      Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1)
  {
    unpack_info= reader.read(RDB_UNPACK_HEADER_SIZE);

    if (!unpack_info || unpack_info[0] != RDB_UNPACK_DATA_TAG)
    {
      return HA_ERR_INTERNAL_ERROR;
    }

    unpack_info_len= rdb_netbuf_to_uint16(
        reinterpret_cast<const uchar *>(unpack_info + 1));
    unpack_slice= rocksdb::Slice(unpack_info, unpack_info_len);

    reader.read(unpack_info_len - RDB_UNPACK_HEADER_SIZE);
  }

  if (m_pk_descr->unpack_record(table, buf, &rowkey_slice,
                                unpack_info ? &unpack_slice : nullptr,
                                false /* verify_checksum */))
  {
    return HA_ERR_INTERNAL_ERROR;
  }

  for (auto it= m_decoders_vect.begin(); it != m_decoders_vect.end(); it++)
  {
    const Rdb_field_encoder* const field_dec= it->m_field_enc;
    const bool decode= it->m_decode;
    bool isNull = field_dec->maybe_null() &&
      ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0);

    Field *field= table->field[field_dec->m_field_index];

    /* Skip the bytes we need to skip */
    if (it->m_skip && !reader.read(it->m_skip))
      return HA_ERR_INTERNAL_ERROR;

    if (isNull)
    {
      if (decode)
      {
        /* This sets the NULL-bit of this record */
        field->set_null(ptr_diff);
        /*
          Besides that, set the field value to the default value. CHECKSUM
          TABLE depends on this.
        */
        uint field_offset= field->ptr - table->record[0];
        memcpy(buf + field_offset,
               table->s->default_values + field_offset,
               field->pack_length());
      }
      continue;
    }
    else
    {
      if (decode)
        field->set_notnull(ptr_diff);
    }

    if (field_dec->m_field_type == MYSQL_TYPE_BLOB)
    {
      my_core::Field_blob *blob= (my_core::Field_blob*)field;
      /* Get the number of bytes needed to store the length */
      uint length_bytes= blob->pack_length() - portable_sizeof_char_ptr;

      blob->move_field_offset(ptr_diff);

      const char *data_len_str;
      if (!(data_len_str= reader.read(length_bytes)))
      {
        blob->move_field_offset(-ptr_diff);
        return HA_ERR_INTERNAL_ERROR;
      }

      memcpy(blob->ptr, data_len_str, length_bytes);

      uint32 data_len= blob->get_length((uchar*)data_len_str, length_bytes);
      const char *blob_ptr;
      if (!(blob_ptr= reader.read(data_len)))
      {
        blob->move_field_offset(-ptr_diff);
        return HA_ERR_INTERNAL_ERROR;
      }

      if (decode)
      {
        // set the 8-byte pointer to 0, like innodb does (relevant for 32-bit
        // platforms)
        memset(blob->ptr + length_bytes, 0, 8);
        memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar**));
        blob->move_field_offset(-ptr_diff);
      }
    }
    else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR)
    {
      Field_varstring* field_var= (Field_varstring*)field;
      const char *data_len_str;
      if (!(data_len_str= reader.read(field_var->length_bytes)))
        return HA_ERR_INTERNAL_ERROR;

      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1)
      {
        data_len= (uchar)data_len_str[0];
      }
      else
      {
        DBUG_ASSERT(field_var->length_bytes == 2);
        data_len= uint2korr(data_len_str);
      }
      if (data_len > field->field_length)
      {
        /* The data on disk is longer than table DDL allows? */
        return HA_ERR_INTERNAL_ERROR;
      }
      if (!reader.read(data_len))
        return HA_ERR_INTERNAL_ERROR;

      if (decode)
      {
        memcpy(field_var->ptr + ptr_diff, data_len_str,
               field_var->length_bytes + data_len);
      }
    }
    else
    {
      const char *data_bytes;
      uint len= field_dec->m_pack_length_in_rec;
      if (len > 0)
      {
        if ((data_bytes= reader.read(len)) == nullptr)
        {
          return HA_ERR_INTERNAL_ERROR;
        }
        if (decode)
          memcpy(field->ptr + ptr_diff, data_bytes, len);
      }
    }
  }

  if (m_verify_checksums)
  {
    if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE &&
        reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG)
    {
      uint32_t stored_key_chksum=
        rdb_netbuf_to_uint32((const uchar*)reader.read(RDB_CHECKSUM_SIZE));
      uint32_t stored_val_chksum=
        rdb_netbuf_to_uint32((const uchar*)reader.read(RDB_CHECKSUM_SIZE));

      uint32_t computed_key_chksum=
        my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
      uint32_t computed_val_chksum=
        my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
                       value->size() - RDB_CHECKSUM_CHUNK_SIZE);

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1",
                      stored_key_chksum++;);

      if (stored_key_chksum != computed_key_chksum)
      {
        m_pk_descr->report_checksum_mismatch(true, key->data(), key->size());
        return HA_ERR_INTERNAL_ERROR;
      }

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2",
                      stored_val_chksum++;);
      if (stored_val_chksum != computed_val_chksum)
      {
        m_pk_descr->report_checksum_mismatch(false, value->data(),
                                             value->size());
        return HA_ERR_INTERNAL_ERROR;
      }

      m_row_checksums_checked++;
    }
    if (reader.remaining_bytes())
      return HA_ERR_INTERNAL_ERROR;
  }

  return 0;
}
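
/*
  The optional checksum chunk verified above occupies the last
  RDB_CHECKSUM_CHUNK_SIZE bytes of the value:

    [ 1 byte                  RDB_CHECKSUM_DATA_TAG ]
    [ RDB_CHECKSUM_SIZE bytes key CRC32             ]
    [ RDB_CHECKSUM_SIZE bytes value CRC32           ]

  The value CRC32 covers everything before the chunk, which is why the
  crc32() call above excludes the last RDB_CHECKSUM_CHUNK_SIZE bytes.
*/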
void ha_rocksdb::get_storage_type(Rdb_field_encoder *encoder, uint kp)
|
|
{
|
|
Field *field= table->key_info[table->s->primary_key].key_part[kp].field;
|
|
|
|
if (field->real_type() == MYSQL_TYPE_NEWDECIMAL)
|
|
{
|
|
// Index-only is supported for DECIMAL columns.
|
|
// A DECIMAL value can be restored from its mem-comparable form.
|
|
// This works for both the old data format and the new data format.
|
|
if (m_pk_descr->m_kv_format_version >=
|
|
Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1)
|
|
{
|
|
// New format, don't store Decimal value in the row.
|
|
encoder->m_storage_type= Rdb_field_encoder::STORE_NONE;
|
|
}
|
|
else
|
|
{
|
|
// Old format. Pretend there's no way to unpack the decimal from
|
|
// its mem-comparable form.
|
|
encoder->m_storage_type= Rdb_field_encoder::STORE_ALL;
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (m_pk_descr->m_kv_format_version >=
|
|
Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1)
|
|
{
|
|
// STORE_SOME uses unpack_info, so our key must be of version
|
|
// higher than PRIMARY_FORMAT_VERSION_UPDATE1 to use this
|
|
// feature.
|
|
if (m_pk_descr->has_unpack_info(kp))
|
|
{
|
|
DBUG_ASSERT(m_pk_descr->can_unpack(kp));
|
|
encoder->m_storage_type= Rdb_field_encoder::STORE_SOME;
|
|
m_maybe_unpack_info= true;
|
|
}
|
|
else if (m_pk_descr->can_unpack(kp))
|
|
{
|
|
encoder->m_storage_type= Rdb_field_encoder::STORE_NONE;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
    // For old versions, we can only use STORE_NONE if the key part is
    // unpackable and has no unpack_info.
|
|
if (m_pk_descr->can_unpack(kp) && !m_pk_descr->has_unpack_info(kp))
|
|
{
|
|
encoder->m_storage_type= Rdb_field_encoder::STORE_NONE;
|
|
}
|
|
}
|
|
}
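
/*
  Summary of the storage types assigned above (see Rdb_field_encoder):
    STORE_NONE - the column is not written to the value part at all; it is
                 restored from the mem-comparable key image.
    STORE_SOME - the column is not written either, but restoring it needs
                 the unpack_info block, so m_maybe_unpack_info is raised.
    STORE_ALL  - the default; the full column value is written to the value
                 part of the key/value pair.
*/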
|
|
|
|
/*
|
|
Setup data needed to convert table->record[] to and from record storage format.
|
|
|
|
@seealso
|
|
ha_rocksdb::convert_record_to_storage_format,
|
|
ha_rocksdb::convert_record_from_storage_format
|
|
*/
|
|
|
|
void ha_rocksdb::setup_field_converters()
|
|
{
|
|
uint i;
|
|
uint null_bytes= 0;
|
|
uchar cur_null_mask= 0x1;
|
|
|
|
DBUG_ASSERT(m_encoder_arr == nullptr);
|
|
m_encoder_arr= static_cast<Rdb_field_encoder*>(
|
|
my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0)));
|
|
if (m_encoder_arr == nullptr)
|
|
{
|
|
return;
|
|
}
|
|
|
|
for (i= 0; i < table->s->fields; i++)
|
|
{
|
|
Field *field= table->field[i];
|
|
m_encoder_arr[i].m_storage_type= Rdb_field_encoder::STORE_ALL;
|
|
|
|
/*
|
|
Check if this field is
|
|
- a part of primary key, and
|
|
- it can be decoded back from its key image.
|
|
If both hold, we don't need to store this field in the value part of
|
|
RocksDB's key-value pair.
|
|
|
|
If hidden pk exists, we skip this check since the field will never be
|
|
part of the hidden pk.
|
|
*/
|
|
if (!has_hidden_pk(table) &&
|
|
field->part_of_key.is_set(table->s->primary_key))
|
|
{
|
|
KEY *pk_info= &table->key_info[table->s->primary_key];
|
|
for (uint kp= 0; kp < pk_info->user_defined_key_parts; kp++)
|
|
{
|
|
/* key_part->fieldnr is counted from 1 */
|
|
if (field->field_index + 1 == pk_info->key_part[kp].fieldnr)
|
|
{
|
|
get_storage_type(&m_encoder_arr[i], kp);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
m_encoder_arr[i].m_field_type= field->real_type();
|
|
m_encoder_arr[i].m_field_index= i;
|
|
m_encoder_arr[i].m_pack_length_in_rec= field->pack_length_in_rec();
|
|
|
|
if (field->real_maybe_null())
|
|
{
|
|
m_encoder_arr[i].m_null_mask= cur_null_mask;
|
|
m_encoder_arr[i].m_null_offset= null_bytes;
|
|
if (cur_null_mask == 0x80)
|
|
{
|
|
cur_null_mask= 0x1;
|
|
null_bytes++;
|
|
}
|
|
else
|
|
cur_null_mask= cur_null_mask << 1;
|
|
}
|
|
else
|
|
{
|
|
m_encoder_arr[i].m_null_mask= 0;
|
|
}
|
|
}
|
|
|
|
/* Count the last, unfinished NULL-bits byte */
|
|
if (cur_null_mask != 0x1)
|
|
null_bytes++;
|
|
|
|
m_null_bytes_in_rec= null_bytes;
|
|
}
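
/*
  Null-bitmap packing used above: nullable columns are assigned bits in the
  order they appear in the table, eight per byte, starting at mask 0x1. For
  example, a table with three nullable columns uses a single null byte with
  masks 0x1, 0x2 and 0x4, and m_null_bytes_in_rec becomes 1.
*/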
|
|
|
|
int ha_rocksdb::alloc_key_buffers(const TABLE* table_arg,
|
|
const Rdb_tbl_def* tbl_def_arg)
|
|
{
|
|
DBUG_ENTER("ha_rocksdb::alloc_key_buffers");
|
|
DBUG_ASSERT(m_pk_tuple == nullptr);
|
|
|
|
std::shared_ptr<Rdb_key_def>* kd_arr= tbl_def_arg->m_key_descr_arr;
|
|
|
|
uint key_len= 0;
|
|
m_pk_descr= kd_arr[pk_index(table_arg, tbl_def_arg)];
|
|
if (has_hidden_pk(table_arg))
|
|
{
|
|
m_pk_key_parts= 1;
|
|
}
|
|
else
|
|
{
|
|
m_pk_key_parts=
|
|
table->key_info[table->s->primary_key].user_defined_key_parts;
|
|
key_len= table->key_info[table->s->primary_key].key_length;
|
|
}
|
|
|
|
// move this into get_table_handler() ??
|
|
m_pk_descr->setup(table_arg, tbl_def_arg);
|
|
|
|
m_pk_tuple= reinterpret_cast<uchar*>(my_malloc(key_len, MYF(0)));
|
|
if (m_pk_tuple == nullptr)
|
|
{
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
uint pack_key_len= m_pk_descr->max_storage_fmt_length();
|
|
m_pk_packed_tuple= reinterpret_cast<uchar*>(my_malloc(pack_key_len, MYF(0)));
|
|
if (m_pk_packed_tuple == nullptr)
|
|
{
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
/* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
|
|
uint max_packed_sk_len= pack_key_len;
|
|
for (uint i= 0; i < table_arg->s->keys; i++)
|
|
{
|
|
if (i == table_arg->s->primary_key) /* Primary key was processed above */
|
|
continue;
|
|
|
|
// TODO: move this into get_table_handler() ??
|
|
kd_arr[i]->setup(table_arg, tbl_def_arg);
|
|
|
|
uint packed_len= kd_arr[i]->max_storage_fmt_length();
|
|
if (packed_len > max_packed_sk_len)
|
|
{
|
|
max_packed_sk_len= packed_len;
|
|
}
|
|
}
|
|
|
|
if (!(m_sk_packed_tuple=
|
|
reinterpret_cast<uchar*>(my_malloc(max_packed_sk_len, MYF(0)))) ||
|
|
!(m_sk_match_prefix_buf=
|
|
reinterpret_cast<uchar*>(my_malloc(max_packed_sk_len, MYF(0)))) ||
|
|
!(m_sk_packed_tuple_old=
|
|
reinterpret_cast<uchar*>(my_malloc(max_packed_sk_len, MYF(0)))) ||
|
|
!(m_end_key_packed_tuple=
|
|
reinterpret_cast<uchar*>(my_malloc(max_packed_sk_len, MYF(0)))) ||
|
|
!((m_pack_buffer=
|
|
reinterpret_cast<uchar*>(my_malloc(max_packed_sk_len, MYF(0))))))
|
|
{
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
void ha_rocksdb::free_key_buffers()
|
|
{
|
|
my_free(m_pk_tuple);
|
|
m_pk_tuple= nullptr;
|
|
|
|
my_free(m_pk_packed_tuple);
|
|
m_pk_packed_tuple= nullptr;
|
|
|
|
my_free(m_sk_packed_tuple);
|
|
m_sk_packed_tuple= nullptr;
|
|
|
|
my_free(m_sk_match_prefix_buf);
|
|
m_sk_match_prefix_buf= nullptr;
|
|
|
|
my_free(m_sk_packed_tuple_old);
|
|
m_sk_packed_tuple_old= nullptr;
|
|
|
|
my_free(m_end_key_packed_tuple);
|
|
m_end_key_packed_tuple= nullptr;
|
|
|
|
my_free(m_pack_buffer);
|
|
m_pack_buffer= nullptr;
|
|
}
|
|
|
|
#ifdef MARIAROCKS_NOT_YET
|
|
void ha_rocksdb::set_use_read_free_rpl(const char* whitelist)
|
|
{
|
|
DBUG_ASSERT(whitelist != nullptr);
|
|
|
|
#if defined(HAVE_PSI_INTERFACE)
|
|
Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables);
|
|
#else
|
|
Regex_list_handler regex_handler;
|
|
#endif
|
|
|
|
if (!regex_handler.set_patterns(whitelist))
|
|
{
|
|
warn_about_bad_patterns(®ex_handler, "read_free_rpl_tables");
|
|
}
|
|
|
|
m_use_read_free_rpl = regex_handler.matches(m_tbl_def->base_tablename());
|
|
}
|
|
#endif
|
|
|
|
void ha_rocksdb::set_skip_unique_check_tables(const char* whitelist)
|
|
{
|
|
DBUG_ASSERT(whitelist != nullptr);
|
|
|
|
#if defined(HAVE_PSI_INTERFACE)
|
|
Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables);
|
|
#else
|
|
Regex_list_handler regex_handler;
|
|
#endif
|
|
|
|
if (!regex_handler.set_patterns(whitelist))
|
|
{
|
|
warn_about_bad_patterns(®ex_handler, "skip_unique_check_tables");
|
|
}
|
|
|
|
m_skip_unique_check= regex_handler.matches(m_tbl_def->base_tablename());
|
|
}
|
|
|
|
int ha_rocksdb::open(const char *name, int mode, uint test_if_locked)
|
|
{
|
|
DBUG_ENTER("ha_rocksdb::open");
|
|
|
|
if (m_table_handler != nullptr)
|
|
{
|
|
rdb_open_tables.release_table_handler(m_table_handler);
|
|
}
|
|
m_table_handler= rdb_open_tables.get_table_handler(name);
|
|
if (m_table_handler == nullptr)
|
|
{
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
|
|
nullptr);
|
|
m_io_perf.init(&m_table_handler->m_table_perf_context,
|
|
&m_table_handler->m_io_perf_read, &stats);
|
|
|
|
/*
|
|
    note: m_pk_descr may be non-NULL here, when using partitions. It seems
    to be OK to discard it.
|
|
*/
|
|
std::string fullname;
|
|
if (rdb_normalize_tablename(name, &fullname))
|
|
{
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
|
|
m_tbl_def= ddl_manager.find(fullname);
|
|
if (m_tbl_def == nullptr)
|
|
{
|
|
my_error(ER_INTERNAL_ERROR, MYF(0),
|
|
"Attempt to open a table that is not present in RocksDB-SE data "
|
|
"dictionary");
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
m_lock_rows= RDB_LOCK_NONE;
|
|
|
|
m_key_descr_arr= m_tbl_def->m_key_descr_arr;
|
|
|
|
/*
|
|
Full table scan actually uses primary key
|
|
(UPDATE needs to know this, otherwise it will go into infinite loop on
|
|
queries like "UPDATE tbl SET pk=pk+100")
|
|
*/
|
|
key_used_on_scan= table->s->primary_key;
|
|
|
|
int err;
|
|
if ((err = alloc_key_buffers(table, m_tbl_def)))
|
|
{
|
|
DBUG_RETURN(err);
|
|
}
|
|
|
|
setup_field_converters();
|
|
|
|
info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
|
|
|
|
/*
|
|
The following load_XXX code calls row decode functions, and they do
|
|
that without having done ::external_lock() or index_init()/rnd_init().
|
|
(Note: this also means we're doing a read when there was no
|
|
setup_field_converters() call)
|
|
|
|
Initialize the necessary variables for them:
|
|
*/
|
|
m_verify_checksums= false;
|
|
|
|
/* TODO: move the following to where TABLE_SHARE is opened: */
|
|
if (table->found_next_number_field)
|
|
load_auto_incr_value();
|
|
|
|
if (has_hidden_pk(table) && load_hidden_pk_value())
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
|
|
/* Index block size in MyRocks: used by MySQL in query optimization */
|
|
stats.block_size = rocksdb_tbl_options.block_size;
|
|
|
|
#ifdef MARIAROCKS_NOT_YET // MDEV-10976
|
|
/* Determine at open whether we can use Read Free Replication or not */
|
|
set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables));
|
|
#endif
|
|
|
|
/* Determine at open whether we should skip unique checks for this table */
|
|
set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
|
|
int ha_rocksdb::close(void)
|
|
{
|
|
DBUG_ENTER("ha_rocksdb::close");
|
|
|
|
m_pk_descr= nullptr;
|
|
m_key_descr_arr= nullptr;
|
|
|
|
free_key_buffers();
|
|
|
|
my_free(m_encoder_arr);
|
|
m_encoder_arr= nullptr;
|
|
|
|
rdb_open_tables.release_table_handler(m_table_handler);
|
|
m_table_handler= nullptr;
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
bool ha_rocksdb::get_error_message(const int error, String *buf)
|
|
{
|
|
static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
|
|
"HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
|
|
static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
|
|
"HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");
|
|
|
|
DBUG_ASSERT(error > 0);
|
|
DBUG_ASSERT(error <= HA_ERR_ROCKSDB_LAST);
|
|
DBUG_ASSERT(buf != nullptr);
|
|
|
|
Rdb_transaction* tx = get_tx_from_thd(ha_thd());
|
|
bool temp_error = false;
|
|
|
|
switch (error) {
|
|
case HA_ERR_ROCKSDB_PK_REQUIRED:
|
|
buf->append("Table must have a PRIMARY KEY.");
|
|
break;
|
|
case HA_ERR_ROCKSDB_UNIQUE_NOT_SUPPORTED:
|
|
buf->append("Unique indexes are not supported.");
|
|
break;
|
|
case HA_ERR_ROCKSDB_TOO_MANY_LOCKS:
|
|
buf->append("Number of locks held reached @@rocksdb_max_row_locks.");
|
|
break;
|
|
case HA_ERR_LOCK_WAIT_TIMEOUT:
|
|
DBUG_ASSERT(tx != nullptr);
|
|
buf->append(tx->m_detailed_error);
|
|
temp_error = true;
|
|
break;
|
|
default:
|
|
      // We can be called with values below HA_ERR_FIRST because most MySQL
      // internal functions simply return 1 in case of an error.
|
|
break;
|
|
}
|
|
|
|
return temp_error;
|
|
}
|
|
|
|
/* MyRocks supports only the following collations for indexed columns */
|
|
static const std::set<const my_core::CHARSET_INFO *> RDB_INDEX_COLLATIONS=
|
|
{&my_charset_bin, &my_charset_utf8_bin, &my_charset_latin1_bin};
|
|
|
|
|
|
static bool rdb_is_index_collation_supported(const my_core::Field *field)
|
|
{
|
|
my_core::enum_field_types type= field->real_type();
|
|
/* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
|
|
if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
|
|
type == MYSQL_TYPE_BLOB)
|
|
{
|
|
return RDB_INDEX_COLLATIONS.find(field->charset()) !=
|
|
RDB_INDEX_COLLATIONS.end();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
Create structures needed for storing data in rocksdb. This is called when the
|
|
table is created. The structures will be shared by all TABLE* objects.
|
|
|
|
@param
|
|
table_arg Table with definition
|
|
db_table "dbname.tablename"
|
|
len strlen of the above
|
|
tbl_def_arg tbl_def whose key_descr is being created/populated
|
|
old_tbl_def_arg tbl_def from which keys are being copied over from
|
|
(for use during inplace alter)
|
|
|
|
@return
|
|
0 - Ok
|
|
other - error, either given table ddl is not supported by rocksdb or OOM.
|
|
*/
|
|
int ha_rocksdb::create_key_defs(const TABLE *table_arg,
|
|
Rdb_tbl_def *tbl_def_arg,
|
|
const TABLE *old_table_arg /* = nullptr */,
|
|
const Rdb_tbl_def *old_tbl_def_arg
|
|
/* = nullptr */)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(table_arg->s != nullptr);
|
|
|
|
DBUG_ENTER("ha_rocksdb::create_key_defs");
|
|
|
|
uint i;
|
|
|
|
/*
|
|
These need to be one greater than MAX_INDEXES since the user can create
|
|
MAX_INDEXES secondary keys and no primary key which would cause us
|
|
to generate a hidden one.
|
|
*/
|
|
std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
|
|
|
|
/*
|
|
NOTE: All new column families must be created before new index numbers are
|
|
allocated to each key definition. See below for more details.
|
|
http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
|
|
*/
|
|
if (create_cfs(table_arg, tbl_def_arg, &cfs))
|
|
{
|
|
DBUG_RETURN(1);
|
|
};
|
|
|
|
if (!old_tbl_def_arg)
|
|
{
|
|
/*
|
|
old_tbl_def doesn't exist. this means we are in the process of creating
|
|
a new table.
|
|
|
|
Get the index numbers (this will update the next_index_number)
|
|
and create Rdb_key_def structures.
|
|
*/
|
|
for (i= 0; i < tbl_def_arg->m_key_count; i++)
|
|
{
|
|
if (create_key_def(table_arg, i, tbl_def_arg,
|
|
&m_key_descr_arr[i], cfs[i]))
|
|
{
|
|
DBUG_RETURN(1);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
old_tbl_def exists. This means we are creating a new tbl_def as part of
|
|
in-place alter table. Copy over existing keys from the old_tbl_def and
|
|
generate the necessary new key definitions if any.
|
|
*/
|
|
if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
|
|
old_tbl_def_arg, cfs))
|
|
{
|
|
DBUG_RETURN(1);
|
|
}
|
|
}
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
/*
|
|
Checks index parameters and creates column families needed for storing data
|
|
in rocksdb if necessary.
|
|
|
|
@param in
|
|
table_arg Table with definition
|
|
db_table Table name
|
|
tbl_def_arg Table def structure being populated
|
|
|
|
@param out
|
|
cfs CF info for each key definition in 'key_info' order
|
|
|
|
@return
|
|
0 - Ok
|
|
other - error
|
|
*/
|
|
int ha_rocksdb::create_cfs(const TABLE *table_arg, Rdb_tbl_def *tbl_def_arg,
|
|
std::array<struct key_def_cf_info, MAX_INDEXES + 1>* cfs)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(table_arg->s != nullptr);
|
|
|
|
DBUG_ENTER("ha_rocksdb::create_cfs");
|
|
|
|
char tablename_sys[NAME_LEN + 1];
|
|
|
|
my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
|
|
tablename_sys, sizeof(tablename_sys));
|
|
|
|
/*
|
|
The first loop checks the index parameters and creates
|
|
column families if necessary.
|
|
*/
|
|
for (uint i= 0; i < tbl_def_arg->m_key_count; i++)
|
|
{
|
|
rocksdb::ColumnFamilyHandle* cf_handle;
|
|
|
|
if (rocksdb_strict_collation_check &&
|
|
!is_hidden_pk(i, table_arg, tbl_def_arg) &&
|
|
tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0)
|
|
{
|
|
for (uint part= 0; part < table_arg->key_info[i].ext_key_parts; part++)
|
|
{
|
|
if (!rdb_is_index_collation_supported(
|
|
table_arg->key_info[i].key_part[part].field) &&
|
|
!rdb_collation_exceptions->matches(tablename_sys))
|
|
{
|
|
std::string collation_err;
|
|
for (auto coll : RDB_INDEX_COLLATIONS)
|
|
{
|
|
if (collation_err != "")
|
|
{
|
|
collation_err += ", ";
|
|
}
|
|
collation_err += coll->name;
|
|
}
|
|
my_printf_error(ER_UNKNOWN_ERROR,
|
|
"Unsupported collation on string indexed "
|
|
"column %s.%s Use binary collation (%s).", MYF(0),
|
|
tbl_def_arg->full_tablename().c_str(),
|
|
table_arg->key_info[i].key_part[part].field->field_name,
|
|
collation_err.c_str());
|
|
DBUG_RETURN(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
index comment has Column Family name. If there was no comment, we get
|
|
NULL, and it means use the default column family.
|
|
*/
|
|
const char *comment = get_key_comment(i, table_arg, tbl_def_arg);
|
|
const char *key_name = get_key_name(i, table_arg, tbl_def_arg);
|
|
|
|
if (looks_like_per_index_cf_typo(comment))
|
|
{
|
|
my_error(ER_NOT_SUPPORTED_YET, MYF(0),
|
|
"column family name looks like a typo of $per_index_cf");
|
|
DBUG_RETURN(1);
|
|
}
|
|
/* Prevent create from using the system column family */
|
|
if (comment && strcmp(DEFAULT_SYSTEM_CF_NAME, comment) == 0)
|
|
{
|
|
my_error(ER_WRONG_ARGUMENTS, MYF(0),
|
|
"column family not valid for storing index data");
|
|
DBUG_RETURN(1);
|
|
}
|
|
bool is_auto_cf_flag;
|
|
cf_handle = cf_manager.get_or_create_cf(rdb, comment,
|
|
tbl_def_arg->full_tablename(),
|
|
key_name, &is_auto_cf_flag);
|
|
if (!cf_handle)
|
|
DBUG_RETURN(1);
|
|
|
|
auto& cf = (*cfs)[i];
|
|
cf.cf_handle = cf_handle;
|
|
cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(comment);
|
|
cf.is_auto_cf = is_auto_cf_flag;
|
|
}
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
/*
|
|
Create key definition needed for storing data in rocksdb during ADD index
|
|
inplace operations.
|
|
|
|
@param in
|
|
table_arg Table with definition
|
|
tbl_def_arg New table def structure being populated
|
|
old_tbl_def_arg Old(current) table def structure
|
|
cfs Struct array which contains column family information
|
|
|
|
@return
|
|
0 - Ok
|
|
other - error, either given table ddl is not supported by rocksdb or OOM.
|
|
*/
|
|
int ha_rocksdb::create_inplace_key_defs(const TABLE *table_arg,
|
|
Rdb_tbl_def *tbl_def_arg,
|
|
const TABLE *old_table_arg,
|
|
const Rdb_tbl_def *old_tbl_def_arg,
|
|
const std::array<key_def_cf_info, MAX_INDEXES + 1>& cfs)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(tbl_def_arg != nullptr);
|
|
DBUG_ASSERT(old_tbl_def_arg != nullptr);
|
|
|
|
  DBUG_ENTER("create_inplace_key_defs");
|
|
|
|
std::shared_ptr<Rdb_key_def>* old_key_descr=
|
|
old_tbl_def_arg->m_key_descr_arr;
|
|
std::shared_ptr<Rdb_key_def>* new_key_descr=
|
|
tbl_def_arg->m_key_descr_arr;
|
|
std::unordered_map<std::string, uint> old_key_pos =
|
|
get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
|
|
old_tbl_def_arg);
|
|
|
|
uint i;
|
|
for (i= 0; i < tbl_def_arg->m_key_count; i++)
|
|
{
|
|
auto it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));
|
|
if (it != old_key_pos.end())
|
|
{
|
|
/*
|
|
Found matching index in old table definition, so copy it over to the
|
|
new one created.
|
|
*/
|
|
const std::shared_ptr<Rdb_key_def>& okd=
|
|
old_key_descr[it->second];
|
|
|
|
uint16 index_dict_version= 0;
|
|
uchar index_type= 0;
|
|
uint16 kv_version= 0;
|
|
GL_INDEX_ID gl_index_id= okd->get_gl_index_id();
|
|
if (!dict_manager.get_index_info(gl_index_id, &index_dict_version,
|
|
&index_type, &kv_version))
|
|
{
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("RocksDB: Could not get index information "
|
|
"for Index Number (%u,%u), table %s",
|
|
gl_index_id.cf_id, gl_index_id.index_id,
|
|
old_tbl_def_arg->full_tablename().c_str());
|
|
DBUG_RETURN(1);
|
|
}
|
|
|
|
/*
|
|
We can't use the copy constructor because we need to update the
|
|
keynr within the pack_info for each field and the keyno of the keydef
|
|
itself.
|
|
*/
|
|
new_key_descr[i]= std::make_shared<Rdb_key_def>(
|
|
okd->get_index_number(),
|
|
i,
|
|
okd->get_cf(),
|
|
index_dict_version,
|
|
index_type,
|
|
kv_version,
|
|
okd->m_is_reverse_cf,
|
|
okd->m_is_auto_cf,
|
|
okd->m_name.c_str(),
|
|
dict_manager.get_stats(gl_index_id));
|
|
}
|
|
else if (create_key_def(table_arg, i, tbl_def_arg,
|
|
&new_key_descr[i], cfs[i]))
|
|
{
|
|
DBUG_RETURN(1);
|
|
}
|
|
|
|
DBUG_ASSERT(new_key_descr[i] != nullptr);
|
|
new_key_descr[i]->setup(table_arg, tbl_def_arg);
|
|
}
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
|
|
const TABLE* table_arg,
|
|
const Rdb_tbl_def* tbl_def_arg,
|
|
const TABLE* old_table_arg,
|
|
const Rdb_tbl_def* old_tbl_def_arg)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(old_table_arg != nullptr);
|
|
DBUG_ASSERT(tbl_def_arg != nullptr);
|
|
DBUG_ASSERT(old_tbl_def_arg != nullptr);
|
|
|
|
DBUG_ENTER("get_old_key_positions");
|
|
|
|
std::shared_ptr<Rdb_key_def>* old_key_descr=
|
|
old_tbl_def_arg->m_key_descr_arr;
|
|
std::unordered_map<std::string, uint> old_key_pos;
|
|
std::unordered_map<std::string, uint> new_key_pos;
|
|
uint i;
|
|
|
|
for (i= 0; i < tbl_def_arg->m_key_count; i++)
|
|
{
|
|
new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
|
|
}
|
|
|
|
for (i= 0; i < old_tbl_def_arg->m_key_count; i++)
|
|
{
|
|
if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg))
|
|
{
|
|
old_key_pos[old_key_descr[i]->m_name] = i;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
In case of matching key name, need to check key parts of keys as well,
|
|
in case a simultaneous drop + add is performed, where the key name is the
|
|
same but the key parts are different.
|
|
|
|
Example:
|
|
CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
|
|
ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
|
|
*/
|
|
const KEY* old_key = &old_table_arg->key_info[i];
|
|
auto it = new_key_pos.find(old_key->name);
|
|
if (it == new_key_pos.end())
|
|
{
|
|
continue;
|
|
}
|
|
|
|
KEY* new_key = &table_arg->key_info[it->second];
|
|
|
|
if (!compare_key_parts(old_key, new_key))
|
|
{
|
|
old_key_pos[old_key->name] = i;
|
|
}
|
|
}
|
|
|
|
DBUG_RETURN(old_key_pos);
|
|
}
|
|
|
|
/* Check two keys to ensure that key parts within keys match */
|
|
int ha_rocksdb::compare_key_parts(const KEY* old_key, const KEY* new_key)
|
|
{
|
|
DBUG_ASSERT(old_key != nullptr);
|
|
DBUG_ASSERT(new_key != nullptr);
|
|
|
|
DBUG_ENTER("compare_key_parts");
|
|
|
|
/* Skip if key parts do not match, as it is a different key */
|
|
if (new_key->user_defined_key_parts != old_key->user_defined_key_parts)
|
|
{
|
|
DBUG_RETURN(1);
|
|
}
|
|
|
|
/* Check to see that key parts themselves match */
|
|
for (uint i= 0; i < old_key->user_defined_key_parts; i++)
|
|
{
|
|
if (strcmp(old_key->key_part[i].field->field_name,
|
|
new_key->key_part[i].field->field_name) != 0)
|
|
{
|
|
DBUG_RETURN(1);
|
|
}
|
|
}
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
/*
|
|
Create key definition needed for storing data in rocksdb.
|
|
This can be called either during CREATE table or doing ADD index operations.
|
|
|
|
@param in
|
|
table_arg Table with definition
|
|
i Position of index being created inside table_arg->key_info
|
|
tbl_def_arg Table def structure being populated
|
|
cf_info Struct which contains column family information
|
|
|
|
@param out
|
|
new_key_def Newly created index definition.
|
|
|
|
@return
|
|
0 - Ok
|
|
other - error, either given table ddl is not supported by rocksdb or OOM.
|
|
*/
|
|
int ha_rocksdb::create_key_def(const TABLE *table_arg, uint i,
|
|
const Rdb_tbl_def* tbl_def_arg,
|
|
std::shared_ptr<Rdb_key_def>* new_key_def,
|
|
const struct key_def_cf_info& cf_info)
|
|
{
|
|
DBUG_ENTER("create_key_def");
|
|
DBUG_ASSERT(new_key_def != nullptr);
|
|
DBUG_ASSERT(*new_key_def == nullptr);
|
|
|
|
uint index_id= ddl_manager.get_and_update_next_number(&dict_manager);
|
|
uint16_t index_dict_version= Rdb_key_def::INDEX_INFO_VERSION_LATEST;
|
|
uchar index_type;
|
|
uint16_t kv_version;
|
|
|
|
if (is_hidden_pk(i, table_arg, tbl_def_arg))
|
|
{
|
|
index_type= Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
|
|
kv_version= Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
|
|
}
|
|
else if (i == table_arg->s->primary_key)
|
|
{
|
|
index_type= Rdb_key_def::INDEX_TYPE_PRIMARY;
|
|
uint16 pk_latest_version= Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
|
|
DBUG_EXECUTE_IF("MYROCKS_FORMAT_VERSION_INITIAL",
|
|
{pk_latest_version=
|
|
Rdb_key_def::PRIMARY_FORMAT_VERSION_INITIAL;
|
|
});
|
|
kv_version= pk_latest_version;
|
|
}
|
|
else
|
|
{
|
|
index_type= Rdb_key_def::INDEX_TYPE_SECONDARY;
|
|
uint16 sk_latest_version= Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
|
|
DBUG_EXECUTE_IF("MYROCKS_FORMAT_VERSION_INITIAL",
|
|
{sk_latest_version=
|
|
Rdb_key_def::SECONDARY_FORMAT_VERSION_INITIAL;
|
|
});
|
|
kv_version= sk_latest_version;
|
|
}
|
|
|
|
const char *key_name = get_key_name(i, table_arg, m_tbl_def);
|
|
*new_key_def= std::make_shared<Rdb_key_def>(
|
|
index_id, i, cf_info.cf_handle, index_dict_version, index_type,
|
|
kv_version, cf_info.is_reverse_cf, cf_info.is_auto_cf, key_name);
|
|
|
|
DBUG_RETURN(0);
|
|
}
|
|
|
|
int rdb_normalize_tablename(const std::string& tablename,
|
|
std::string* strbuf)
|
|
{
|
|
DBUG_ASSERT(strbuf != nullptr);
|
|
|
|
if (tablename.size() < 2 || tablename[0] != '.' || tablename[1] != '/')
|
|
{
|
|
DBUG_ASSERT(0); // We were not passed table name?
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
|
|
size_t pos = tablename.find_first_of('/', 2);
|
|
if (pos == std::string::npos)
|
|
{
|
|
DBUG_ASSERT(0); // We were not passed table name?
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
|
|
*strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
|
|
|
|
return 0;
|
|
}
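
/*
  Example of the normalization done above (path form passed in by the SQL
  layer on the left):

    "./test/t1"       -> "test.t1"
    "./test/t1#P#p0"  -> "test.t1#P#p0"  (the partition suffix is kept;
                                          rdb_split_normalized_tablename()
                                          below splits it off)
*/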
|
|
|
|
/*
|
|
Check to see if the user's original statement includes foreign key
|
|
references
|
|
*/
|
|
bool ha_rocksdb::contains_foreign_key(THD* thd)
|
|
{
|
|
bool success;
|
|
const char* str = thd_query_string(thd)->str;
|
|
|
|
DBUG_ASSERT(str != nullptr);
|
|
|
|
while (*str != '\0')
|
|
{
|
|
// Scan from our current pos looking for 'FOREIGN'
|
|
str = rdb_find_in_string(str, "FOREIGN", &success);
|
|
if (!success)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
    // Skip past the found "FOREIGN"
|
|
str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
|
|
DBUG_ASSERT(success);
|
|
|
|
if (!my_isspace(&my_charset_bin, *str))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
// See if the next token is 'KEY'
|
|
str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
|
|
if (!success)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
// See if the next token is '('
|
|
str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
|
|
if (!success)
|
|
{
|
|
// There is an optional index id after 'FOREIGN KEY', skip it
|
|
str = rdb_skip_id(&my_charset_bin, str);
|
|
|
|
// Now check for '(' again
|
|
str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
|
|
}
|
|
|
|
// If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
|
|
// a foreign key clause.
|
|
return success;
|
|
}
|
|
|
|
// We never found a valid foreign key clause
|
|
return false;
|
|
}
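
/*
  Note: the scan above works on the raw statement text, looking for the token
  sequence FOREIGN KEY [<identifier>] '('. For example, it flags

    CREATE TABLE t1 (a INT, FOREIGN KEY fk_a (a) REFERENCES t2(a));

  A statement that merely mentions the word FOREIGN is not flagged unless
  "KEY" and "(" follow it, though a string or comment containing the full
  clause would likely still match, since the text is not parsed as SQL.
*/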
|
|
|
|
/**
|
|
@brief
|
|
splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
|
|
the <dbname>, <tablename> and <part_no> components.
|
|
|
|
@param dbbuf returns database name/table_schema
|
|
@param tablebuf returns tablename
|
|
@param partitionbuf returns partition suffix if there is one
|
|
@return 0 on success, non-zero on failure to split
|
|
*/
|
|
int rdb_split_normalized_tablename(const std::string& fullname,
|
|
std::string* db,
|
|
std::string* table,
|
|
std::string* partition)
|
|
{
|
|
DBUG_ASSERT(!fullname.empty());
|
|
|
|
#define RDB_PARTITION_STR "#P#"
|
|
|
|
/* Normalize returns dbname.tablename. */
|
|
size_t dotpos = fullname.find('.');
|
|
|
|
/* Invalid table name? */
|
|
if (dotpos == std::string::npos)
|
|
{
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
|
|
// Table must have a database name associated with it.
|
|
DBUG_ASSERT(dotpos > 0);
|
|
|
|
if (db != nullptr)
|
|
{
|
|
*db = fullname.substr(0, dotpos);
|
|
}
|
|
|
|
dotpos++;
|
|
|
|
size_t partpos = fullname.find(RDB_PARTITION_STR, dotpos,
|
|
strlen(RDB_PARTITION_STR));
|
|
|
|
if (partpos != std::string::npos)
|
|
{
|
|
DBUG_ASSERT(partpos >= dotpos);
|
|
|
|
if (table != nullptr)
|
|
{
|
|
*table = fullname.substr(dotpos, partpos - dotpos);
|
|
}
|
|
|
|
if (partition != nullptr)
|
|
{
|
|
*partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
|
|
}
|
|
}
|
|
else if (table != nullptr)
|
|
{
|
|
*table = fullname.substr(dotpos);
|
|
}
|
|
|
|
return 0;
|
|
}
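
/*
  Worked example (illustrative only, not part of the build):

    std::string db, tbl, part;
    rdb_split_normalized_tablename("test.t1#P#p0", &db, &tbl, &part);
    // db == "test", tbl == "t1", part == "p0"

  For a name without the "#P#" marker, e.g. "test.t1", the partition output
  is left untouched.
*/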
|
|
|
|
|
|
/**
  @brief
  create() is called to create a table. The variable name will have the name
  of the table.

  @details
  When create() is called you do not need to worry about
  opening the table. Also, the .frm file will have already been
  created so adjusting create_info is not necessary. You can overwrite
  the .frm file at this point if you wish to change the table
  definition, but there are no methods currently provided for doing
  so.

  Called from handler.cc by ha_create_table().

  @see
  ha_create_table() in handler.cc
*/
|
|
|
|
int ha_rocksdb::create(const char *name, TABLE *table_arg,
|
|
HA_CREATE_INFO *create_info)
|
|
{
|
|
int res;
|
|
std::string str;
|
|
DBUG_ENTER("ha_rocksdb::create");
|
|
|
|
THD *thd = my_core::thd_get_current_thd();
|
|
|
|
if (get_table_if_exists(name)) {
|
|
if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
|
|
if (delete_table(name)) {
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
} else {
|
|
if (rdb_normalize_tablename(name, &str)) {
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
my_printf_error(ER_UNKNOWN_ERROR,
|
|
"Table '%s' does not exist, but metadata information exists inside "
|
|
"MyRocks. This is a sign of data inconsistency. Please check if "
|
|
"'%s.frm' exists, and try to restore it if it does not exist.",
|
|
MYF(0), str.c_str(), name);
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
}
|
|
|
|
  /*
    Construct dbname.tablename ourselves, because partitioning
    passes strings like "./test/t14#P#p0" for individual partitions,
    while table_arg->s->table_name has none of that.
  */
|
|
if (rdb_normalize_tablename(name, &str))
|
|
{
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
if (contains_foreign_key(thd))
|
|
{
|
|
my_printf_error(ER_NOT_SUPPORTED_YET, "MyRocks does not currently support "
|
|
"foreign key constraints", MYF(0));
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
/*
|
|
TODO(alexyang): Temporarily disable unique indexes support when there is no
|
|
Primary Key
|
|
*/
|
|
for (uint keyno= 0; keyno < table_arg->s->keys; keyno++)
|
|
{
|
|
if ((table_arg->key_info[keyno].flags & HA_NOSAME) &&
|
|
has_hidden_pk(table_arg))
|
|
{
|
|
my_printf_error(ER_UNKNOWN_ERROR,
|
|
"Unique index support is disabled when the table has no primary key. ",
|
|
MYF(0));
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
}
|
|
|
|
bool write_err= false;
|
|
std::unique_ptr<rocksdb::WriteBatch> wb= dict_manager.begin();
|
|
rocksdb::WriteBatch *batch= wb.get();
|
|
|
|
/* Create table/key descriptions and put them into the data dictionary */
|
|
m_tbl_def= new Rdb_tbl_def(str);
|
|
|
|
uint n_keys= table_arg->s->keys;
|
|
|
|
/*
|
|
If no primary key found, create a hidden PK and place it inside table
|
|
definition
|
|
*/
|
|
if (has_hidden_pk(table_arg))
|
|
{
|
|
n_keys += 1;
|
|
}
|
|
|
|
m_key_descr_arr= new std::shared_ptr<Rdb_key_def>[n_keys];
|
|
m_tbl_def->m_key_count= n_keys;
|
|
m_tbl_def->m_key_descr_arr= m_key_descr_arr;
|
|
|
|
if ((res= create_key_defs(table_arg, m_tbl_def)))
|
|
{
|
|
goto error;
|
|
}
|
|
|
|
m_pk_descr= m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
|
|
|
|
dict_manager.lock();
|
|
write_err= ddl_manager.put_and_write(m_tbl_def, batch)
|
|
|| dict_manager.commit(batch);
|
|
dict_manager.unlock();
|
|
|
|
if (write_err)
|
|
{
|
|
goto error;
|
|
}
|
|
|
|
if (create_info->auto_increment_value)
|
|
m_tbl_def->m_auto_incr_val= create_info->auto_increment_value;
|
|
|
|
/*
|
|
We only support auto_increment at start of the PRIMARY KEY.
|
|
*/
|
|
//Field *field;
|
|
//if ((field= table_arg->next_number_field))
|
|
/* TODO mdcallag: disable this for now to let UNIQUE indexes kind of work
|
|
if ((field= table_arg->found_next_number_field))
|
|
{
|
|
int pk= table_arg->s->primary_key;
|
|
Field *pk_field= table_arg->key_info[pk].key_part[0].field;
|
|
if (field->field_index != pk_field->field_index)
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
*/
|
|
DBUG_RETURN(0);
|
|
|
|
error:
|
|
/* Delete what we have allocated so far */
|
|
delete m_tbl_def;
|
|
m_tbl_def= nullptr;
|
|
m_key_descr_arr= nullptr;
|
|
|
|
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
|
|
}
|
|
|
|
|
|
/**
|
|
@note
|
|
This function is used only when the table has not yet been opened, and
|
|
keyread_allowed bitmap doesn't have the correct values yet.
|
|
|
|
See comment in ha_rocksdb::index_flags() for details.
|
|
*/
|
|
|
|
bool ha_rocksdb::check_keyread_allowed(uint inx, uint part, bool all_parts) const
|
|
{
|
|
bool res= true;
|
|
KEY *key_info= &table_share->key_info[inx];
|
|
|
|
Rdb_field_packing dummy1;
|
|
res= dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
|
|
key_info->key_part[part].length);
|
|
|
|
if (res && all_parts)
|
|
{
|
|
for (uint i= 0; i < part; i++)
|
|
{
|
|
Field *field;
|
|
if ((field= key_info->key_part[i].field))
|
|
{
|
|
Rdb_field_packing dummy;
|
|
if (!dummy.setup(nullptr, field, inx, i, key_info->key_part[i].length))
|
|
{
|
|
/* Cannot do index-only reads for this column */
|
|
res= false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
uint pk= table_share->primary_key;
|
|
if (inx == pk && all_parts &&
|
|
part + 1 == table_share->key_info[pk].user_defined_key_parts)
|
|
{
|
|
m_pk_can_be_decoded= res;
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
int ha_rocksdb::read_key_exact(const std::shared_ptr<const Rdb_key_def>& kd,
|
|
rocksdb::Iterator *iter,
|
|
bool full_key_match,
|
|
const rocksdb::Slice& key_slice) const
|
|
{
|
|
DBUG_ASSERT(kd != nullptr);
|
|
DBUG_ASSERT(iter != nullptr);
|
|
|
|
/*
|
|
We are looking for the first record such that
|
|
index_tuple= lookup_tuple.
|
|
lookup_tuple may be a prefix of the index.
|
|
*/
|
|
if (kd->m_is_reverse_cf)
|
|
{
|
|
if (!full_key_match)
|
|
{
|
|
if (!iter->Valid())
|
|
iter->SeekToLast();
|
|
else
|
|
iter->Prev();
|
|
}
|
|
}
|
|
|
|
if (!iter->Valid() || !kd->value_matches_prefix(iter->key(), key_slice))
|
|
{
|
|
/*
|
|
Got a record that is not equal to the lookup value, or even a record
|
|
from another table.index.
|
|
*/
|
|
return HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int ha_rocksdb::read_before_key(const std::shared_ptr<const Rdb_key_def>& kd,
|
|
bool full_key_match,
|
|
const rocksdb::Slice& key_slice)
|
|
{
|
|
DBUG_ASSERT(kd != nullptr);
|
|
|
|
/*
|
|
We are looking for record with the biggest t.key such that
|
|
t.key < lookup_tuple.
|
|
*/
|
|
if (kd->m_is_reverse_cf)
|
|
{
|
|
if (m_scan_it->Valid() && full_key_match &&
|
|
kd->value_matches_prefix(m_scan_it->key(), key_slice))
|
|
{
|
|
/* We are using full key and we've hit an exact match */
|
|
m_scan_it->Next();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (m_scan_it->Valid())
|
|
m_scan_it->Prev();
|
|
else
|
|
m_scan_it->SeekToLast();
|
|
}
|
|
|
|
return m_scan_it->Valid() ? 0 : HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
|
|
int ha_rocksdb::read_after_key(const std::shared_ptr<const Rdb_key_def>& kd,
|
|
bool full_key_match,
|
|
const rocksdb::Slice& key_slice)
|
|
{
|
|
DBUG_ASSERT(kd != nullptr);
|
|
|
|
/*
|
|
We are looking for the first record such that
|
|
|
|
index_tuple $GT lookup_tuple
|
|
|
|
with HA_READ_AFTER_KEY, $GT = '>',
|
|
with HA_READ_KEY_OR_NEXT, $GT = '>='
|
|
*/
|
|
if (kd->m_is_reverse_cf)
|
|
{
|
|
if (!m_scan_it->Valid())
|
|
{
|
|
m_scan_it->SeekToLast();
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
We should step back
|
|
- when not using full extended key
|
|
- when using full extended key and when we've got an exact match
|
|
*/
|
|
if (!full_key_match ||
|
|
!kd->value_matches_prefix(m_scan_it->key(), key_slice))
|
|
{
|
|
m_scan_it->Prev();
|
|
}
|
|
}
|
|
}
|
|
|
|
return m_scan_it->Valid() ? 0 : HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
|
|
int ha_rocksdb::position_to_correct_key(
|
|
const std::shared_ptr<const Rdb_key_def>& kd,
|
|
enum ha_rkey_function find_flag,
|
|
bool full_key_match,
|
|
const uchar* key,
|
|
key_part_map keypart_map,
|
|
const rocksdb::Slice& key_slice,
|
|
bool* move_forward)
|
|
{
|
|
int rc= 0;
|
|
|
|
*move_forward= true;
|
|
|
|
switch (find_flag) {
|
|
case HA_READ_KEY_EXACT:
|
|
rc= read_key_exact(kd, m_scan_it, full_key_match, key_slice);
|
|
break;
|
|
case HA_READ_BEFORE_KEY:
|
|
*move_forward= false;
|
|
rc= read_before_key(kd, full_key_match, key_slice);
|
|
if (rc == 0 && !kd->covers_key(m_scan_it->key()))
|
|
{
|
|
/* The record we've got is not from this index */
|
|
rc= HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
break;
|
|
case HA_READ_AFTER_KEY:
|
|
case HA_READ_KEY_OR_NEXT:
|
|
rc= read_after_key(kd, full_key_match, key_slice);
|
|
if (rc == 0 && !kd->covers_key(m_scan_it->key()))
|
|
{
|
|
/* The record we've got is not from this index */
|
|
rc= HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
break;
|
|
case HA_READ_KEY_OR_PREV:
|
|
case HA_READ_PREFIX:
|
|
/* This flag is not used by the SQL layer, so we don't support it yet. */
|
|
rc= HA_ERR_UNSUPPORTED;
|
|
break;
|
|
case HA_READ_PREFIX_LAST:
|
|
case HA_READ_PREFIX_LAST_OR_PREV:
|
|
*move_forward= false;
|
|
/*
|
|
Find the last record with the specified index prefix lookup.
|
|
- HA_READ_PREFIX_LAST requires that the record has the
|
|
prefix=lookup (if there are no such records,
|
|
HA_ERR_KEY_NOT_FOUND should be returned).
|
|
- HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
|
|
records with prefix=lookup, we should return the last record
|
|
before that.
|
|
*/
|
|
rc= read_before_key(kd, full_key_match, key_slice);
|
|
if (rc == 0)
|
|
{
|
|
const rocksdb::Slice& rkey= m_scan_it->key();
|
|
if (!kd->covers_key(rkey))
|
|
{
|
|
/* The record we've got is not from this index */
|
|
rc= HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
else if (find_flag == HA_READ_PREFIX_LAST)
|
|
{
|
|
uint size = kd->pack_index_tuple(table, m_pack_buffer,
|
|
m_sk_packed_tuple, key,
|
|
keypart_map);
|
|
rocksdb::Slice lookup_tuple(
|
|
reinterpret_cast<char*>(m_sk_packed_tuple), size);
|
|
|
|
// We need to compare the key we've got with the original search prefix.
|
|
if (!kd->value_matches_prefix(rkey, lookup_tuple))
|
|
{
|
|
rc= HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
DBUG_ASSERT(0);
|
|
break;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
int ha_rocksdb::calc_eq_cond_len(const std::shared_ptr<const Rdb_key_def>& kd,
|
|
enum ha_rkey_function find_flag,
|
|
const rocksdb::Slice& slice,
|
|
int bytes_changed_by_succ,
|
|
const key_range *end_key,
|
|
uint* end_key_packed_size)
|
|
{
|
|
if (find_flag == HA_READ_KEY_EXACT)
|
|
return slice.size();
|
|
|
|
if (find_flag == HA_READ_PREFIX_LAST)
|
|
{
|
|
/*
|
|
We have made the kd->successor(m_sk_packed_tuple) call above.
|
|
|
|
The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
|
|
*/
|
|
return slice.size() - bytes_changed_by_succ;
|
|
}
|
|
|
|
if (end_key)
|
|
{
|
|
*end_key_packed_size= kd->pack_index_tuple(table, m_pack_buffer,
|
|
m_end_key_packed_tuple,
|
|
end_key->key,
|
|
end_key->keypart_map);
|
|
|
|
/*
|
|
Calculating length of the equal conditions here. 4 byte index id is
|
|
included.
|
|
Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
|
|
WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
|
|
WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
|
|
Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
|
|
WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
|
|
*/
|
|
rocksdb::Slice end_slice(reinterpret_cast<char*>(m_end_key_packed_tuple),
|
|
*end_key_packed_size);
|
|
return slice.difference_offset(end_slice);
|
|
}
|
|
|
|
/*
|
|
On range scan without any end key condition, there is no
|
|
eq cond, and eq cond length is the same as index_id size (4 bytes).
|
|
Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
|
|
WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
|
|
*/
|
|
return Rdb_key_def::INDEX_NUMBER_SIZE;
|
|
}
|
|
|
|
int ha_rocksdb::read_row_from_primary_key(uchar *buf)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
|
|
int rc;
|
|
const rocksdb::Slice& rkey= m_scan_it->key();
|
|
uint pk_size= rkey.size();
|
|
const char *pk_data= rkey.data();
|
|
|
|
memcpy(m_pk_packed_tuple, pk_data, pk_size);
|
|
m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
|
|
|
|
if (m_lock_rows != RDB_LOCK_NONE)
|
|
{
|
|
/* We need to put a lock and re-read */
|
|
rc= get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
|
|
}
|
|
else
|
|
{
|
|
/* Unpack from the row we've read */
|
|
const rocksdb::Slice& value = m_scan_it->value();
|
|
rc= convert_record_from_storage_format(&rkey, &value, buf);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
int ha_rocksdb::read_row_from_secondary_key(uchar *buf,
|
|
const std::shared_ptr<const Rdb_key_def>& kd, bool move_forward)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
DBUG_ASSERT(kd != nullptr);
|
|
|
|
int rc= 0;
|
|
uint pk_size;
|
|
|
|
if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE && !has_hidden_pk(table))
|
|
{
|
|
/* Get the key columns and primary key value */
|
|
const rocksdb::Slice& rkey= m_scan_it->key();
|
|
pk_size= kd->get_primary_key_tuple(table, m_pk_descr, &rkey,
|
|
m_pk_packed_tuple);
|
|
const rocksdb::Slice& value= m_scan_it->value();
|
|
if (pk_size == RDB_INVALID_KEY_LEN)
|
|
{
|
|
rc= HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
else
|
|
{
|
|
rc= try_keyonly_read_from_sk(buf, kd, rkey, value, pk_size);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (kd->m_is_reverse_cf)
|
|
move_forward= !move_forward;
|
|
|
|
rc= find_icp_matching_index_rec(move_forward, buf);
|
|
if (!rc)
|
|
{
|
|
const rocksdb::Slice& rkey= m_scan_it->key();
|
|
pk_size= kd->get_primary_key_tuple(table, m_pk_descr, &rkey,
|
|
m_pk_packed_tuple);
|
|
if (pk_size == RDB_INVALID_KEY_LEN)
|
|
{
|
|
rc= HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
else
|
|
{
|
|
rc= get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!rc)
|
|
{
|
|
m_last_rowkey.copy((const char*)m_pk_packed_tuple, pk_size,
|
|
&my_charset_bin);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/**
  @note
  Some data types require unpack_info in order to do index-only reads.

  However, support for this may have been added after the database was built.
  This means that we need to support the case where unpack_info is not
  available. In this case, we should just do a primary key lookup.
*/
|
|
int ha_rocksdb::try_keyonly_read_from_sk(uchar* buf,
|
|
const std::shared_ptr<const Rdb_key_def>& kd,
|
|
const rocksdb::Slice& key,
|
|
const rocksdb::Slice& value,
|
|
const uint rowid_size)
|
|
{
|
|
int res= kd->unpack_record(table, buf, &key, &value, m_verify_checksums);
|
|
uint size= rowid_size;
|
|
if (res == UNPACK_INFO_MISSING)
|
|
{
|
|
// If we can't unpack, then do a PK lookup.
|
|
if (rowid_size == RDB_INVALID_KEY_LEN)
|
|
{
|
|
size= kd->get_primary_key_tuple(
|
|
table, m_pk_descr, &key, m_pk_packed_tuple);
|
|
}
|
|
return get_row_by_rowid(buf, m_pk_packed_tuple, size);
|
|
}
|
|
else if (res == UNPACK_FAILURE)
|
|
{
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
DBUG_ASSERT(res == UNPACK_SUCCESS);
|
|
return res;
|
|
}
|
|
|
|
|
|
/**
|
|
@note
|
|
The problem with this function is that SQL layer calls it, when
|
|
- the table has not been yet opened (no ::open() call done)
|
|
- this->table_share already exists, but it is in the process of being
|
|
filled, so some of fields are still NULL.
|
|
- In particular, table_share->key_info[inx].key_part[] is filled only up
|
|
to part #part. Subsequent key parts are not yet filled.
|
|
|
|
To complicate things further, SQL layer will call index_flags() with
|
|
all_parts=TRUE. Essentially, we're asked to provide flags for reading
|
|
keyparts whose datatype is not yet known.
|
|
|
|
We walk around this problem by using check_keyread_allowed(), which uses
|
|
  table_share object and is careful not to step on uninitialized data.
|
|
|
|
When we get a call with all_parts=TRUE, we try to analyze all parts but
|
|
ignore those that have key_part->field==nullptr (these are not initialized
|
|
yet).
|
|
*/
|
|
|
|
ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const
|
|
{
|
|
ulong base_flags= HA_READ_NEXT | // doesn't seem to be used
|
|
HA_READ_ORDER |
|
|
HA_READ_RANGE |
|
|
HA_READ_PREV;
|
|
|
|
if (check_keyread_allowed(inx, part, all_parts))
|
|
base_flags |= HA_KEYREAD_ONLY;
|
|
|
|
if (inx == table_share->primary_key)
|
|
{
|
|
/*
|
|
Index-only reads on primary key are the same as table scan for us. Still,
|
|
we need to explicitly "allow" them, otherwise SQL layer will miss some
|
|
plans.
|
|
*/
|
|
base_flags |= HA_KEYREAD_ONLY;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
We can Index Condition Pushdown any key except the primary. With primary
|
|
key, we get (pk, record) pair immediately, there is no place to put the
|
|
ICP check.
|
|
*/
|
|
base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
|
|
}
|
|
|
|
return base_flags;
|
|
}
|
|
|
|
|
|
/**
|
|
@brief
|
|
Read next index tuple through the secondary index.
|
|
|
|
@details
|
|
m_scan_it points at the index key-value pair that we should read the (pk,row)
|
|
pair for.
|
|
*/
|
|
int ha_rocksdb::secondary_index_read(const int keyno, uchar *buf)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
DBUG_ASSERT(table != nullptr);
|
|
|
|
stats.rows_requested++;
|
|
|
|
/* Use STATUS_NOT_FOUND when record not found or some error occurred */
|
|
table->status= STATUS_NOT_FOUND;
|
|
|
|
if (m_scan_it->Valid())
|
|
{
|
|
rocksdb::Slice key= m_scan_it->key();
|
|
|
|
    /* Check if we've run out of records of this index */
|
|
if (m_key_descr_arr[keyno]->covers_key(key))
|
|
{
|
|
int rc;
|
|
|
|
      // TODO: We could check here if we have run out of the range we're scanning
|
|
uint size= m_key_descr_arr[keyno]->get_primary_key_tuple(
|
|
table, m_pk_descr, &key, m_pk_packed_tuple);
|
|
if (size == RDB_INVALID_KEY_LEN)
|
|
{
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
|
|
m_last_rowkey.copy((const char*)m_pk_packed_tuple, size, &my_charset_bin);
|
|
|
|
if (m_keyread_only && m_lock_rows == RDB_LOCK_NONE &&
|
|
!has_hidden_pk(table))
|
|
{
|
|
rc= try_keyonly_read_from_sk(buf, m_key_descr_arr[keyno],
|
|
key, m_scan_it->value(), size);
|
|
}
|
|
else
|
|
{
|
|
rc= get_row_by_rowid(buf, m_pk_packed_tuple, size);
|
|
}
|
|
|
|
if (!rc)
|
|
{
|
|
table->status= 0;
|
|
stats.rows_read++;
|
|
stats.rows_index_next++;
|
|
update_row_stats(ROWS_READ);
|
|
}
|
|
return rc;
|
|
}
|
|
}
|
|
return HA_ERR_END_OF_FILE;
|
|
}
|
|
|
|
|
|
/*
|
|
ha_rocksdb::read_range_first overrides handler::read_range_first.
|
|
The only difference from handler::read_range_first is that
|
|
ha_rocksdb::read_range_first passes end_key to
|
|
ha_rocksdb::index_read_map_impl function.
|
|
*/
|
|
int ha_rocksdb::read_range_first(const key_range *start_key,
|
|
const key_range *end_key,
|
|
bool eq_range_arg, bool sorted)
|
|
{
|
|
int result;
|
|
DBUG_ENTER("ha_rocksdb::read_range_first");
|
|
|
|
eq_range= eq_range_arg;
|
|
set_end_range(end_key, RANGE_SCAN_ASC);
|
|
|
|
range_key_part= table->key_info[active_index].key_part;
|
|
|
|
if (!start_key) // Read first record
|
|
result= ha_index_first(table->record[0]);
|
|
else
|
|
{
|
|
if (is_using_prohibited_gap_locks(table,
|
|
is_using_full_unique_key(
|
|
active_index,
|
|
start_key->keypart_map,
|
|
start_key->flag)))
|
|
{
|
|
DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
|
|
}
|
|
|
|
MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0,
|
|
{ result= index_read_map_impl(table->record[0],
|
|
start_key->key,
|
|
start_key->keypart_map,
|
|
start_key->flag,
|
|
end_key); })
|
|
}
|
|
if (result)
|
|
DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND)
|
|
? HA_ERR_END_OF_FILE
|
|
: result);
|
|
|
|
if (compare_key(end_range) <= 0)
|
|
{
|
|
DBUG_RETURN(0);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
The last read row does not fall in the range. So request
|
|
storage engine to release row lock if possible.
|
|
*/
|
|
unlock_row();
|
|
DBUG_RETURN(HA_ERR_END_OF_FILE);
|
|
}
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
|
|
key_part_map keypart_map,
|
|
enum ha_rkey_function find_flag)
|
|
{
|
|
return index_read_map_impl(buf, key, keypart_map, find_flag, nullptr);
|
|
}
|
|
|
|
/*
  See storage/rocksdb/rocksdb-range-access.txt for a description of how MySQL
  index navigation commands are converted into RocksDB lookup commands.

  This function takes end_key as an argument, and it is set on range scans.
  MyRocks needs to decide whether the prefix bloom filter can be used or not.
  To make that decision, the length of the equality condition must be
  calculated. On equality lookups (find_flag == HA_READ_KEY_EXACT), the
  equality condition length is the same as rocksdb::Slice.size() of the start
  key. On range scans, it is the length of the prefix shared by the start_key
  and end_key rocksdb::Slice expressions (see calc_eq_cond_len()).
*/
|
|
int ha_rocksdb::index_read_map_impl(uchar *buf, const uchar *key,
|
|
key_part_map keypart_map,
|
|
enum ha_rkey_function find_flag,
|
|
const key_range *end_key)
|
|
{
|
|
int rc= 0;
|
|
DBUG_ENTER("ha_rocksdb::index_read_map");
|
|
|
|
const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[active_index];
|
|
uint actual_key_parts= kd->get_key_parts();
|
|
bool using_full_key= is_using_full_key(keypart_map, actual_key_parts);
|
|
|
|
if (!end_key)
|
|
end_key= end_range;
|
|
|
|
/* By default, we don't need the retrieved records to match the prefix */
|
|
m_sk_match_prefix= nullptr;
|
|
stats.rows_requested++;
|
|
|
|
if (active_index == table->s->primary_key &&
|
|
find_flag == HA_READ_KEY_EXACT && using_full_key)
|
|
{
|
|
/*
|
|
Equality lookup over primary key, using full tuple.
|
|
This is a special case, use DB::Get.
|
|
*/
|
|
uint size= kd->pack_index_tuple(table, m_pack_buffer, m_pk_packed_tuple,
|
|
key, keypart_map);
|
|
rc= get_row_by_rowid(buf, m_pk_packed_tuple, size);
|
|
if (!rc)
|
|
{
|
|
stats.rows_read++;
|
|
stats.rows_index_first++;
|
|
update_row_stats(ROWS_READ);
|
|
}
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
/*
|
|
Unique secondary index performs lookups without the extended key fields
|
|
*/
|
|
uint packed_size;
|
|
if (active_index != table->s->primary_key &&
|
|
table->key_info[active_index].flags & HA_NOSAME &&
|
|
find_flag == HA_READ_KEY_EXACT && using_full_key)
|
|
{
|
|
key_part_map tmp_map= (key_part_map(1) <<
|
|
table->key_info[active_index].user_defined_key_parts) - 1;
|
|
packed_size= kd->pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
|
|
key, tmp_map);
|
|
if (table->key_info[active_index].user_defined_key_parts !=
|
|
kd->get_key_parts())
|
|
using_full_key= false;
|
|
}
|
|
else
|
|
{
|
|
packed_size= kd->pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
|
|
key, keypart_map);
|
|
}
|
|
|
|
if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
|
|
(find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST))
|
|
{
|
|
/*
|
|
We are doing a point index lookup, and ICP is enabled. It is possible
|
|
that this call will be followed by ha_rocksdb->index_next_same() call.
|
|
|
|
Do what InnoDB does: save the lookup tuple now. We will need it in
|
|
index_next_same/find_icp_matching_index_rec in order to stop scanning
|
|
as soon as index record doesn't match the lookup tuple.
|
|
|
|
      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
|
|
m_sk_match_prefix= m_sk_match_prefix_buf;
|
|
m_sk_match_length= packed_size;
|
|
memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
|
|
}
|
|
|
|
int bytes_changed_by_succ= 0;
|
|
if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
|
|
find_flag == HA_READ_PREFIX_LAST ||
|
|
find_flag == HA_READ_AFTER_KEY)
|
|
{
|
|
/* See below */
|
|
bytes_changed_by_succ= kd->successor(m_sk_packed_tuple, packed_size);
|
|
}
|
|
|
|
rocksdb::Slice slice(
|
|
reinterpret_cast<const char*>(m_sk_packed_tuple), packed_size);
|
|
|
|
uint end_key_packed_size= 0;
|
|
uint eq_cond_len= calc_eq_cond_len(kd, find_flag, slice,
|
|
bytes_changed_by_succ, end_key,
|
|
&end_key_packed_size);
|
|
|
|
bool use_all_keys= false;
|
|
if (find_flag == HA_READ_KEY_EXACT &&
|
|
my_count_bits(keypart_map) == kd->get_key_parts())
|
|
use_all_keys= true;
|
|
|
|
Rdb_transaction *tx= get_or_create_tx(table->in_use);
|
|
bool is_new_snapshot= !tx->has_snapshot();
|
|
// Loop as long as we get a deadlock error AND we end up creating the
|
|
// snapshot here (i.e. it did not exist prior to this)
|
|
for (;;)
|
|
{
|
|
/*
|
|
This will open the iterator and position it at a record that's equal or
|
|
greater than the lookup tuple.
|
|
*/
|
|
setup_scan_iterator(kd, &slice, use_all_keys, is_ascending(kd, find_flag),
|
|
eq_cond_len);
|
|
|
|
    /*
      Once we are positioned by the seek above, move to the position we
      really want: see storage/rocksdb/rocksdb-range-access.txt
    */
|
|
bool move_forward;
|
|
rc = position_to_correct_key(kd, find_flag, using_full_key, key,
|
|
keypart_map, slice, &move_forward);
|
|
|
|
if (rc)
|
|
{
|
|
/* This status is returned on any error */
|
|
table->status= STATUS_NOT_FOUND;
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
m_skip_scan_it_next_call= false;
|
|
|
|
/*
|
|
Now get the data for the row into 'buf'. If we were using a primary key
|
|
then we have all the rows we need. For a secondary key we now need to
|
|
lookup the primary key.
|
|
*/
|
|
if (active_index == table->s->primary_key)
|
|
rc= read_row_from_primary_key(buf);
|
|
else
|
|
rc= read_row_from_secondary_key(buf, kd, move_forward);
|
|
|
|
if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
|
|
break; /* Exit the loop */
|
|
|
|
// release the snapshot and iterator so they will be regenerated
|
|
tx->release_snapshot();
|
|
release_scan_iterator();
|
|
}
|
|
|
|
if (rc)
|
|
{
|
|
/* the only possible error condition is record-not-found */
|
|
table->status= STATUS_NOT_FOUND;
|
|
}
|
|
else
|
|
{
|
|
table->status= 0;
|
|
stats.rows_read++;
|
|
stats.rows_index_first++;
|
|
update_row_stats(ROWS_READ);
|
|
}
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
|
|
/*
|
|
@brief
|
|
Scan the secondary index until we find an index record that satisfies ICP
|
|
|
|
@param move_forward TRUE <=> move m_scan_it forward
|
|
FALSE <=> move m_scan_it backward
|
|
@param buf Record buffer (must be the same buffer that
|
|
pushed index condition points to, in practice
|
|
it is table->record[0])
|
|
|
|
@detail
|
|
Move the current iterator m_scan_it until we get an index tuple that
|
|
satisfies the pushed Index Condition.
|
|
(if there is no pushed index condition, return right away)
|
|
|
|
@return
|
|
0 - Index tuple satisfies ICP, can do index read.
|
|
other - error code
|
|
*/
|
|
|
|
int ha_rocksdb::find_icp_matching_index_rec(bool move_forward, uchar *buf)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
|
|
if (pushed_idx_cond && pushed_idx_cond_keyno == active_index)
|
|
{
|
|
const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[active_index];
|
|
DBUG_ASSERT(kd != nullptr);
|
|
|
|
while (1)
|
|
{
|
|
if (!m_scan_it->Valid())
|
|
{
|
|
table->status= STATUS_NOT_FOUND;
|
|
return HA_ERR_END_OF_FILE;
|
|
}
|
|
rocksdb::Slice rkey= m_scan_it->key();
|
|
|
|
if (!kd->covers_key(rkey))
|
|
{
|
|
table->status= STATUS_NOT_FOUND;
|
|
return HA_ERR_END_OF_FILE;
|
|
}
|
|
|
|
if (m_sk_match_prefix)
|
|
{
|
|
rocksdb::Slice prefix((const char*)m_sk_match_prefix,
|
|
m_sk_match_length);
|
|
if (!kd->value_matches_prefix(rkey, prefix))
|
|
{
|
|
table->status= STATUS_NOT_FOUND;
|
|
return HA_ERR_END_OF_FILE;
|
|
}
|
|
}
|
|
|
|
rocksdb::Slice value= m_scan_it->value();
|
|
if (try_keyonly_read_from_sk(buf, kd, rkey, value, RDB_INVALID_KEY_LEN))
|
|
{
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
|
|
enum icp_result icp_status= check_index_cond();
|
|
if (icp_status == ICP_NO_MATCH)
|
|
{
|
|
if (move_forward)
|
|
m_scan_it->Next();
|
|
else
|
|
m_scan_it->Prev();
|
|
continue; /* Get the next (or prev) index tuple */
|
|
}
|
|
else if (icp_status == ICP_OUT_OF_RANGE)
|
|
{
|
|
/* We have walked out of range we are scanning */
|
|
table->status= STATUS_NOT_FOUND;
|
|
return HA_ERR_END_OF_FILE;
|
|
}
|
|
else /* icp_status == ICP_MATCH */
|
|
{
|
|
/* Index Condition is satisfied. We have rc==0, proceed to fetch the row. */
|
|
break;
|
|
}
|
|
      /*
        TODO: should we have this here, or does RocksDB handle this internally?
        if (my_core::thd_killed(current_thd))
        {
          rc= HA_ERR_INTERNAL_ERROR; // doesn't matter
          break;
        }
      */
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_read_last_map(uchar * buf, const uchar * key,
|
|
key_part_map keypart_map)
|
|
{
|
|
return index_read_map(buf, key, keypart_map, HA_READ_PREFIX_LAST);
|
|
}
|
|
|
|
|
|
int ha_rocksdb::check(THD* thd, HA_CHECK_OPT* check_opt)
|
|
{
|
|
uint pk= pk_index(table, m_tbl_def);
|
|
String rowkey_copy;
|
|
String sec_key_copy;
|
|
const char* table_name = table->s->table_name.str;
|
|
|
|
// Only when debugging: don't use snapshot when reading
|
|
// Rdb_transaction *tx= get_or_create_tx(table->in_use);
|
|
// tx->snapshot= nullptr;
|
|
|
|
bool save_verify_checksums= m_verify_checksums;
|
|
m_verify_checksums= true;
|
|
|
|
/* For each secondary index, check that we can get a PK value from it */
|
|
// NO_LINT_DEBUG
|
|
sql_print_information("CHECKTABLE %s: Checking table %s", table_name,
|
|
table_name);
|
|
ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true
|
|
ha_rows row_checksums= ha_rows(-1);
|
|
bool first_index= true;
|
|
|
|
for (uint keyno= 0; keyno < table->s->keys; keyno++)
|
|
{
|
|
if (keyno != pk)
|
|
{
|
|
extra(HA_EXTRA_KEYREAD);
|
|
ha_index_init(keyno, true);
|
|
ha_rows rows= 0;
|
|
ha_rows checksums= 0;
|
|
if (first_index)
|
|
row_checksums_at_start= m_row_checksums_checked;
|
|
int res;
|
|
// NO_LINT_DEBUG
|
|
sql_print_information("CHECKTABLE %s: Checking index %s", table_name,
|
|
table->key_info[keyno].name);
|
|
while (1)
|
|
{
|
|
if (!rows)
|
|
res= index_first(table->record[0]);
|
|
else
|
|
res= index_next(table->record[0]);
|
|
|
|
if (res == HA_ERR_END_OF_FILE)
|
|
break;
|
|
if (res)
|
|
{
|
|
// error
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
|
|
table_name, rows, res);
|
|
goto error;
|
|
}
|
|
rocksdb::Slice key= m_scan_it->key();
|
|
sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
|
|
rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
|
|
&my_charset_bin);
|
|
|
|
if (m_key_descr_arr[keyno]->
|
|
unpack_info_has_checksum(m_scan_it->value()))
|
|
{
|
|
checksums++;
|
|
}
|
|
|
|
if ((res= get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
|
|
rowkey_copy.length())))
|
|
{
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: .. row %lld: "
|
|
"failed to fetch row by rowid",
|
|
table_name, rows);
|
|
goto error;
|
|
}
|
|
|
|
longlong hidden_pk_id= 0;
|
|
if (has_hidden_pk(table) &&
|
|
read_hidden_pk_id_from_rowkey(&hidden_pk_id))
|
|
goto error;
|
|
|
|
/* Check if we get the same PK value */
|
|
uint packed_size= m_pk_descr->pack_record(
|
|
table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
|
|
false, hidden_pk_id);
|
|
if (packed_size != rowkey_copy.length() ||
|
|
memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size))
|
|
{
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
|
|
table_name, rows);
|
|
goto print_and_error;
|
|
}
|
|
|
|
/* Check if we get the same secondary key value */
|
|
packed_size= m_key_descr_arr[keyno]->pack_record(
|
|
table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
|
|
&m_sk_tails, false, hidden_pk_id);
|
|
if (packed_size != sec_key_copy.length() ||
|
|
memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size))
|
|
{
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: .. row %lld: "
|
|
"secondary index value mismatch",
|
|
table_name, rows);
|
|
goto print_and_error;
|
|
}
|
|
rows++;
|
|
continue;
|
|
|
|
print_and_error:
|
|
{
|
|
std::string buf;
|
|
buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(), 1000);
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: rowkey: %s", table_name,
|
|
buf.c_str());
|
|
|
|
buf= rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
|
|
1000);
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: record: %s", table_name,
|
|
buf.c_str());
|
|
|
|
buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(), 1000);
|
|
// NO_LINT_DEBUG
|
|
sql_print_error("CHECKTABLE %s: index: %s", table_name,
|
|
buf.c_str());
|
|
|
|
goto error;
|
|
}
|
|
}
|
|
// NO_LINT_DEBUG
|
|
sql_print_information("CHECKTABLE %s: ... %lld index entries checked "
|
|
"(%lld had checksums)", table_name, rows,
|
|
checksums);
|
|
|
|
if (first_index)
|
|
{
|
|
row_checksums= m_row_checksums_checked - row_checksums_at_start;
|
|
first_index= false;
|
|
}
|
|
ha_index_end();
|
|
}
|
|
}
|
|
if (row_checksums != ha_rows(-1))
|
|
{
|
|
// NO_LINT_DEBUG
|
|
sql_print_information("CHECKTABLE %s: %lld table records had checksums",
|
|
table_name, row_checksums);
|
|
}
|
|
extra(HA_EXTRA_NO_KEYREAD);
|
|
|
|
m_verify_checksums= save_verify_checksums;
|
|
/*
|
|
TODO: we should check also for PK records that are missing in the secondary
|
|
indexes.
|
|
For that, need to walk through the PK and check that every PK record has a
|
|
proper counterpart in each secondary index.
|
|
*/
|
|
return HA_ADMIN_OK;
|
|
error:
|
|
m_verify_checksums= save_verify_checksums;
|
|
ha_index_or_rnd_end();
|
|
extra(HA_EXTRA_NO_KEYREAD);
|
|
return HA_ADMIN_CORRUPT;
|
|
}
|
|
|
|
|
|
static void dbug_dump_str(FILE *out, const char *str, int len)
|
|
{
|
|
fprintf(out, "\"");
|
|
for (int i= 0; i < len; i++)
|
|
{
|
|
if (str[i] > 32)
|
|
fprintf(out, "%c", str[i]);
|
|
else
|
|
fprintf(out, "\\%d", str[i]);
|
|
}
|
|
fprintf(out, "\"");
|
|
}
|
|
|
|
|
|
/*
|
|
Debugging help: dump the whole database into a human-readable file.
|
|
Usage:
|
|
dbug_dump_database(rdb);
|
|
*/
|
|
|
|
void dbug_dump_database(rocksdb::DB *db)
|
|
{
|
|
FILE *out= fopen("/tmp/rocksdb.dump", "wt");
|
|
if (!out)
|
|
return;
|
|
|
|
rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
|
|
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
|
rocksdb::Slice key= it->key();
|
|
rocksdb::Slice val= it->value();
|
|
dbug_dump_str(out, key.data(), key.size());
|
|
fprintf(out, " -> ");
|
|
dbug_dump_str(out, val.data(), val.size());
|
|
fprintf(out, "\n");
|
|
}
|
|
|
|
delete it;
|
|
fclose(out);
|
|
}
|
|
|
|
|
|
/*
|
|
Given a rowid (i.e. packed PK) as a parameter, get the record.
|
|
|
|
@return
|
|
0 - Ok
|
|
other - HA_ERR error code.
|
|
*/
|
|
|
|
int ha_rocksdb::get_row_by_rowid(uchar *buf, const char *rowid,
|
|
const uint rowid_size)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
DBUG_ASSERT(rowid != nullptr);
|
|
DBUG_ASSERT(table != nullptr);
|
|
|
|
int rc;
|
|
DBUG_ENTER("ha_rocksdb::get_row_by_rowid");
|
|
|
|
rocksdb::Slice key_slice(rowid, rowid_size);
|
|
|
|
Rdb_transaction *tx= get_or_create_tx(table->in_use);
|
|
DBUG_ASSERT(tx != nullptr);
|
|
|
|
DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
|
|
DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid",
|
|
{
|
|
THD *thd= ha_thd();
|
|
const char act[]=
|
|
"now signal Reached "
|
|
"wait_for signal.rocksdb.get_row_by_rowid_let_running";
|
|
DBUG_ASSERT(opt_debug_sync_timeout > 0);
|
|
DBUG_ASSERT(!debug_sync_set_action(thd,
|
|
STRING_WITH_LEN(act)));
|
|
};);
|
|
|
|
bool found;
|
|
rocksdb::Status s;
|
|
|
|
if (m_lock_rows == RDB_LOCK_NONE)
|
|
{
|
|
tx->acquire_snapshot(true);
|
|
s= tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
|
|
}
|
|
else
|
|
s= tx->get_for_update(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
|
|
|
|
if (!s.IsNotFound() && !s.ok())
|
|
{
|
|
DBUG_RETURN(tx->set_status_error(table->in_use, s, m_pk_descr, m_tbl_def));
|
|
}
|
|
found= !s.IsNotFound();
|
|
|
|
table->status= STATUS_NOT_FOUND;
|
|
if (found)
|
|
{
|
|
m_last_rowkey.copy((const char*)rowid, rowid_size, &my_charset_bin);
|
|
rc= convert_record_from_storage_format(&key_slice, buf);
|
|
if (!rc)
|
|
table->status= 0;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
Note: we don't need to unlock the row. It is intentional that we keep
|
|
locks on rows that don't exist.
|
|
*/
|
|
rc= HA_ERR_KEY_NOT_FOUND;
|
|
}
|
|
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_next(uchar *buf)
|
|
{
|
|
bool moves_forward= true;
|
|
if (m_key_descr_arr[active_index]->m_is_reverse_cf)
|
|
{
|
|
moves_forward= false;
|
|
}
|
|
|
|
int rc= index_next_with_direction(buf, moves_forward);
|
|
if (rc == HA_ERR_KEY_NOT_FOUND)
|
|
rc = HA_ERR_END_OF_FILE;
|
|
return rc;
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_prev(uchar *buf)
|
|
{
|
|
bool moves_forward= false;
|
|
if (m_key_descr_arr[active_index]->m_is_reverse_cf)
|
|
{
|
|
moves_forward= true;
|
|
}
|
|
|
|
int rc= index_next_with_direction(buf, moves_forward);
|
|
if (rc == HA_ERR_KEY_NOT_FOUND)
|
|
rc = HA_ERR_END_OF_FILE;
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_next_with_direction(uchar *buf, bool move_forward)
|
|
{
|
|
int rc;
|
|
DBUG_ENTER("ha_rocksdb::index_next");
|
|
|
|
if (active_index == pk_index(table, m_tbl_def))
|
|
{
|
|
rc= rnd_next_with_direction(buf, move_forward);
|
|
}
|
|
else
|
|
{
|
|
if (m_skip_scan_it_next_call)
|
|
{
|
|
m_skip_scan_it_next_call= false;
|
|
}
|
|
else
|
|
{
|
|
if (move_forward)
|
|
m_scan_it->Next(); /* this call cannot fail */
|
|
else
|
|
m_scan_it->Prev();
|
|
}
|
|
rc= find_icp_matching_index_rec(move_forward, buf);
|
|
if (!rc)
|
|
rc= secondary_index_read(active_index, buf);
|
|
}
|
|
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_first(uchar *buf)
|
|
{
|
|
m_sk_match_prefix= nullptr;
|
|
int rc= m_key_descr_arr[active_index]->m_is_reverse_cf ?
|
|
index_last_intern(buf) : index_first_intern(buf);
|
|
if (rc == HA_ERR_KEY_NOT_FOUND)
|
|
rc = HA_ERR_END_OF_FILE;
|
|
return rc;
|
|
}
|
|
|
|
|
|
int ha_rocksdb::index_last(uchar *buf)
|
|
{
|
|
m_sk_match_prefix= nullptr;
|
|
int rc= m_key_descr_arr[active_index]->m_is_reverse_cf ?
|
|
index_first_intern(buf) : index_last_intern(buf);
|
|
if (rc == HA_ERR_KEY_NOT_FOUND)
|
|
rc = HA_ERR_END_OF_FILE;
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
Start scanning from the "first" value.
|
|
|
|
The 'first' here means "the first from start of the key space".
|
|
For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
|
|
|
|
  A picture of a forward-ordered keyspace (remember, the keys have the form
|
|
'indexnr-keyval'. Suppose the index we are at has number n)
|
|
|
|
(n-1) - ...
|
|
( n ) <--- 1. (n) doesn't exist in the db but it would be here.
|
|
( n ) - aaa <--- 2. Seek("n") will put us here on the first index
|
|
( n ) - bbb record.
|
|
( n ) - cc
|
|
|
|
So, need to do: Seek(n);
|
|
|
|
A backward-ordered keyspace:
|
|
|
|
(n+1) - bbb
|
|
(n+1) - aaa
|
|
(n+1) <--- (n+1) doesn't exist in the db but would be here.
|
|
( n ) - ccc <--- 1. We need to be here.
|
|
( n ) - bbb
|
|
( n ) - aaa
|
|
( n )
|
|
|
|
So, need to: Seek(n+1);
|
|
|
|
*/
|
|
|
|
int ha_rocksdb::index_first_intern(uchar *buf)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
|
|
uchar *key;
|
|
uint key_size;
|
|
int rc;
|
|
DBUG_ENTER("ha_rocksdb::index_first");
|
|
|
|
if (is_pk(active_index, table, m_tbl_def))
|
|
{
|
|
key= m_pk_packed_tuple;
|
|
}
|
|
else
|
|
{
|
|
key= m_sk_packed_tuple;
|
|
}
|
|
|
|
DBUG_ASSERT(key != nullptr);
|
|
|
|
const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[active_index];
|
|
if (kd->m_is_reverse_cf)
|
|
{
|
|
kd->get_supremum_key(key, &key_size);
|
|
}
|
|
else
|
|
{
|
|
kd->get_infimum_key(key, &key_size);
|
|
}
|
|
|
|
rocksdb::Slice index_key((const char*)key, key_size);
|
|
|
|
Rdb_transaction *tx= get_or_create_tx(table->in_use);
|
|
DBUG_ASSERT(tx != nullptr);
|
|
|
|
bool is_new_snapshot= !tx->has_snapshot();
|
|
// Loop as long as we get a deadlock error AND we end up creating the
|
|
// snapshot here (i.e. it did not exist prior to this)
|
|
for (;;)
|
|
{
|
|
setup_scan_iterator(kd, &index_key, false, !kd->m_is_reverse_cf,
|
|
Rdb_key_def::INDEX_NUMBER_SIZE);
|
|
m_skip_scan_it_next_call= true;
|
|
|
|
rc= index_next_with_direction(buf, true);
|
|
if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
|
|
break; // exit the loop
|
|
|
|
// release the snapshot and iterator so they will be regenerated
|
|
tx->release_snapshot();
|
|
release_scan_iterator();
|
|
}
|
|
|
|
if (!rc)
|
|
{
|
|
/*
|
|
index_next is always incremented on success, so decrement if it is
|
|
index_first instead
|
|
*/
|
|
stats.rows_index_first++;
|
|
stats.rows_index_next--;
|
|
}
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
|
|
/**
|
|
@details
|
|
Start scanning from the "last" value
|
|
|
|
The 'last' here means "the last from start of the key space".
|
|
For reverse-ordered key spaces, we will actually read the smallest value.
|
|
|
|
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose we are at a key that has number n)
|
|
|
|
(n-1)-something
|
|
( n )-aaa
|
|
( n )-bbb
|
|
( n )-ccc <----------- Need to seek to here.
|
|
(n+1) <---- Doesn't exist, but would be here.
|
|
(n+1)-smth, or no value at all
|
|
|
|
RocksDB's Iterator::Seek($val) seeks to "at $val or first value that's
|
|
greater". We can't see to "(n)-ccc" directly, because we don't know what
|
|
is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
|
|
to "(n+1)", which is the least possible value that's greater than any value
|
|
in index #n. Then we step one record back.
|
|
|
|
So, need to: it->Seek(n+1) || it->SeekToLast(); it->Prev();
|
|
|
|
A backward-ordered keyspace:
|
|
|
|
(n+1)-something
|
|
( n ) - ccc
|
|
( n ) - bbb
|
|
( n ) - aaa <---------------- (*) Need to seek here.
|
|
( n ) <--- Doesn't exist, but would be here.
|
|
(n-1)-smth, or no value at all
|
|
|
|
So, need to: it->Seek(n) || it->SeekToLast(); it->Prev();
|
|
*/
|
|
|
|
int ha_rocksdb::index_last_intern(uchar *buf)
|
|
{
|
|
DBUG_ASSERT(buf != nullptr);
|
|
|
|
uchar *key;
|
|
uint key_size;
|
|
int rc;
|
|
DBUG_ENTER("ha_rocksdb::index_last");
|
|
|
|
if (is_pk(active_index, table, m_tbl_def))
|
|
{
|
|
key= m_pk_packed_tuple;
|
|
}
|
|
else
|
|
{
|
|
key= m_sk_packed_tuple;
|
|
}
|
|
|
|
DBUG_ASSERT(key != nullptr);
|
|
|
|
const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[active_index];
|
|
if (kd->m_is_reverse_cf)
|
|
{
|
|
kd->get_infimum_key(key, &key_size);
|
|
}
|
|
else
|
|
{
|
|
kd->get_supremum_key(key, &key_size);
|
|
}
|
|
|
|
rocksdb::Slice index_key((const char*)key, key_size);
|
|
|
|
Rdb_transaction *tx= get_or_create_tx(table->in_use);
|
|
DBUG_ASSERT(tx != nullptr);
|
|
|
|
bool is_new_snapshot= !tx->has_snapshot();
|
|
// Loop as long as we get a deadlock error AND we end up creating the
|
|
// snapshot here (i.e. it did not exist prior to this)
|
|
for (;;)
|
|
{
|
|
setup_scan_iterator(kd, &index_key, false, kd->m_is_reverse_cf,
|
|
Rdb_key_def::INDEX_NUMBER_SIZE);
|
|
|
|
if (!m_scan_it->Valid())
|
|
{
|
|
/*
|
|
RocksDB keyspace has no records that follow our index. It could be, our
|
|
index is the last in the keyspace. Navigate to last record in the
|
|
keyspace.
|
|
*/
|
|
m_scan_it->SeekToLast();
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
Iterator is pointing at some record right after the end of the index.
|
|
Move one record back.
|
|
*/
|
|
m_scan_it->Prev();
|
|
}
|
|
m_skip_scan_it_next_call= false;
|
|
|
|
if (is_pk(active_index, table, m_tbl_def))
|
|
{
|
|
m_skip_scan_it_next_call= true;
|
|
rc= rnd_next_with_direction(buf, false);
|
|
}
|
|
else
|
|
{
|
|
rc= find_icp_matching_index_rec(false /*move_forward*/, buf);
|
|
if (!rc)
|
|
rc= secondary_index_read(active_index, buf);
|
|
}
|
|
|
|
if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
|
|
break; /* exit the loop */
|
|
|
|
// release the snapshot and iterator so they will be regenerated
|
|
tx->release_snapshot();
|
|
release_scan_iterator();
|
|
}
|
|
|
|
if (!rc)
|
|
{
|
|
/*
|
|
index_next is always incremented on success, so decrement if it is
|
|
index_first instead
|
|
*/
|
|
stats.rows_index_first++;
|
|
stats.rows_index_next--;
|
|
}
|
|
DBUG_RETURN(rc);
|
|
}
|
|
|
|
|
|
void ha_rocksdb::unlock_row()
|
|
{
|
|
if (m_lock_rows != RDB_LOCK_NONE)
|
|
{
|
|
Rdb_transaction *tx= get_or_create_tx(table->in_use);
|
|
tx->release_lock(m_pk_descr->get_cf(), std::string(m_last_rowkey.ptr(),
|
|
m_last_rowkey.length()));
|
|
}
|
|
}
|
|
|
|
/*
  Returns true if SingleDelete can be used.
  - Secondary indexes can always use SingleDelete.
  - If the index is the PRIMARY KEY, and all of the columns of the table
    are covered by the PRIMARY KEY, SingleDelete can be used.
*/
|
|
bool ha_rocksdb::can_use_single_delete(uint index)
|
|
{
|
|
return (index != pk_index(table, m_tbl_def) ||
|
|
(!has_hidden_pk(table) &&
|
|
table->key_info[index].ext_key_parts == table->s->fields));
|
|
}
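
/*
  Illustrative example (not part of the build): for a table defined as

    CREATE TABLE t1 (a INT, b INT, PRIMARY KEY (a, b));

  every column is part of the PRIMARY KEY, so ext_key_parts equals the field
  count and the check above allows SingleDelete for the PK as well as for any
  secondary index. Adding a column c that is not in the PK would make the PK
  fall back to the regular Delete path.
*/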
|
|
|
|
bool ha_rocksdb::skip_unique_check()
|
|
{
|
|
  /*
    We want to skip unique checks if:
    1) bulk_load is on
    2) this table is in the whitelist of tables to skip and the replication
       lag has reached a large enough value (see unique_check_lag_threshold
       and unique_check_lag_reset_threshold)
    3) the user set rocksdb_skip_unique_check
  */
|
|
return THDVAR(table->in_use, bulk_load) ||
|
|
(m_force_skip_unique_check && m_skip_unique_check) ||
|
|
THDVAR(table->in_use, skip_unique_check);
|
|
}
|
|
|
|
#ifdef MARIAROCKS_NOT_YET // MDEV-10975
|
|
void ha_rocksdb::set_force_skip_unique_check(bool skip)
|
|
{
|
|
m_force_skip_unique_check= skip;
|
|
}
|
|
#endif
|
|
|
|
bool ha_rocksdb::commit_in_the_middle()
|
|
{
|
|
return THDVAR(table->in_use, bulk_load) ||
|
|
THDVAR(table->in_use, commit_in_the_middle);
|
|
}
|
|
|
|
/*
  Executes a bulk commit if one is due.
  @retval true if bulk commit failed
  @retval false if bulk commit was skipped or succeeded
*/
|
|
bool ha_rocksdb::do_bulk_commit(Rdb_transaction *tx)
|
|
{
|
|
DBUG_ASSERT(tx != nullptr);
|
|
return commit_in_the_middle() &&
|
|
tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
|
|
tx->flush_batch();
|
|
}
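
/*
  Sketch of the intended behaviour (illustrative; the exact values are
  assumptions): with rocksdb_commit_in_the_middle=ON and
  rocksdb_bulk_load_size=1000, a statement that has accumulated 1000 or more
  writes in the current transaction triggers tx->flush_batch() here, so very
  large INSERT/LOAD statements are written out in chunks instead of as one
  huge write batch.
*/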
|
|
|
|
/*
  If the table was created without a primary key, the SQL layer represents the
  primary key number as MAX_INDEXES. Hence, this function returns true if the
  table does not contain a primary key. (In which case we generate a hidden
  'auto-incremented' pk.)
*/
|
|
bool ha_rocksdb::has_hidden_pk(const TABLE* table)
|
|
{
|
|
DBUG_ASSERT(table != nullptr);
|
|
return Rdb_key_def::table_has_hidden_pk(table);
|
|
}
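
/*
  Illustrative example (not compiled): a table created as

    CREATE TABLE t1 (a INT, KEY(a)) ENGINE=ROCKSDB;

  has no user-defined PRIMARY KEY, so has_hidden_pk() returns true and MyRocks
  stores the rows under an internally generated, auto-incremented hidden PK.
*/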
|
|
|
|
/*
|
|
Returns true if given index number is a hidden_pk.
|
|
- This is used when a table is created with no primary key.
|
|
*/
|
|
bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE* table_arg,
|
|
const Rdb_tbl_def* tbl_def_arg)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(table_arg->s != nullptr);
|
|
DBUG_ASSERT(tbl_def_arg != nullptr);
|
|
|
|
return (table_arg->s->primary_key == MAX_INDEXES &&
|
|
index == tbl_def_arg->m_key_count - 1);
|
|
}
|
|
|
|
/* Returns index of primary key */
|
|
uint ha_rocksdb::pk_index(const TABLE* table_arg,
|
|
const Rdb_tbl_def* tbl_def_arg)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(table_arg->s != nullptr);
|
|
DBUG_ASSERT(tbl_def_arg != nullptr);
|
|
|
|
return table_arg->s->primary_key == MAX_INDEXES ?
|
|
tbl_def_arg->m_key_count - 1 :
|
|
table_arg->s->primary_key;
|
|
}
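
/*
  Illustrative example (not compiled): for a table with two secondary keys and
  no user-defined PRIMARY KEY, m_key_count is 3 (the two SKs plus the hidden
  PK appended last), table_arg->s->primary_key is MAX_INDEXES, and pk_index()
  above returns m_key_count - 1 == 2, i.e. the hidden PK slot.
*/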
|
|
|
|
/* Returns true if given index number is a primary key */
|
|
bool ha_rocksdb::is_pk(const uint index, const TABLE* table_arg,
|
|
const Rdb_tbl_def *tbl_def_arg)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
DBUG_ASSERT(table_arg->s != nullptr);
|
|
DBUG_ASSERT(tbl_def_arg != nullptr);
|
|
|
|
return index == table_arg->s->primary_key ||
|
|
is_hidden_pk(index, table_arg, tbl_def_arg);
|
|
}
|
|
|
|
const char* ha_rocksdb::get_key_name(const uint index, const TABLE* table_arg,
|
|
const Rdb_tbl_def* tbl_def_arg)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
|
|
if (is_hidden_pk(index, table_arg, tbl_def_arg))
|
|
{
|
|
return HIDDEN_PK_NAME;
|
|
}
|
|
|
|
return table_arg->key_info[index].name;
|
|
}
|
|
|
|
const char* ha_rocksdb::get_key_comment(const uint index,
|
|
const TABLE* table_arg,
|
|
const Rdb_tbl_def* tbl_def_arg)
|
|
{
|
|
DBUG_ASSERT(table_arg != nullptr);
|
|
|
|
if (is_hidden_pk(index, table_arg, tbl_def_arg))
|
|
{
|
|
return nullptr;
|
|
}
|
|
|
|
return table_arg->key_info[index].comment.str;
|
|
}
|
|
|
|
int ha_rocksdb::write_row(uchar *buf)
|
|
{
|
|
DBUG_ENTER("ha_rocksdb::write_row");
|
|
DBUG_ASSERT(buf == table->record[0]);
|
|
DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
|
|
/*
|
|
Note: "buf == table->record[0]" is copied from innodb. I am not aware of
|
|
any use cases where this condition is not true.
|
|
*/
|
|
if (table->next_number_field && buf == table->record[0])
|
|
{
|
|
int err;
|
|
if ((err= update_auto_increment()))
|
|
{
|
|
DBUG_RETURN(err);
|
|
}
|
|
}
|
|
|
|
int rv= update_write_row(nullptr, buf, skip_unique_check());
|
|
|
|
if (rv == 0)
|
|
{
|
|
stats.rows_inserted++;
|
|
update_row_stats(ROWS_INSERTED);
|
|
}
|
|
DBUG_RETURN(rv);
|
|
}
|
|
|
|
|
|
/**
|
|
Constructing m_last_rowkey (MyRocks key expression) from
|
|
before_update|delete image (MySQL row expression).
|
|
m_last_rowkey is normally set during lookup phase, such as
|
|
rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
|
|
these read functions are skipped and update_rows(), delete_rows() are
|
|
called without setting m_last_rowkey. This function sets m_last_rowkey
|
|
for Read Free Replication.
|
|
*/
|
|
void ha_rocksdb::set_last_rowkey(const uchar *old_data)
|
|
{
|
|
if (old_data && use_read_free_rpl())
|
|
{
|
|
int old_pk_size= m_pk_descr->pack_record(
|
|
table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
|
|
m_last_rowkey.copy((const char*)m_pk_packed_tuple, old_pk_size,
|
|
&my_charset_bin);
|
|
}
|
|
}
|
|
|
|
int ha_rocksdb::get_pk_for_update(struct update_row_info* row_info)
|
|
{
|
|
int size;
|
|
|
|
/*
|
|
Get new row key for any insert, and any update where the pk is not hidden.
|
|
Row key for updates with hidden pk is handled below.
|
|
*/
|
|
if (!has_hidden_pk(table))
|
|
{
|
|
row_info->hidden_pk_id= 0;
|
|
|
|
row_info->new_pk_unpack_info= &m_pk_unpack_info;
|
|
|
|
size= m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
|
|
m_pk_packed_tuple,
|
|
row_info->new_pk_unpack_info,
|
|
false);
|
|
}
|
|
else if (row_info->old_data == nullptr)
|
|
{
|
|
row_info->hidden_pk_id= update_hidden_pk_val();
|
|
size= m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
If hidden primary key, rowkey for new record will always be the same as
|
|
before
|
|
*/
|
|
size= row_info->old_pk_slice.size();
|
|
memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
|
|
if (read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id))
|
|
{
|
|
return HA_ERR_INTERNAL_ERROR;
|
|
}
|
|
}
|
|
|
|
row_info->new_pk_slice= rocksdb::Slice((const char*)m_pk_packed_tuple, size);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int ha_rocksdb::check_and_lock_unique_pk(uint key_id,
|
|
const struct update_row_info& row_info,
|
|
bool* found, bool* pk_changed)
|
|
{
|
|
DBUG_ASSERT(found != nullptr);
|
|
DBUG_ASSERT(pk_changed != nullptr);
|
|
|
|
*pk_changed= false;
|
|
|
|
/*
|
|
For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
|
|
always require locking.
|
|
*/
|
|
if (row_info.old_pk_slice.size() > 0)
|
|
{
|
|
/*
|
|
If the keys are the same, then no lock is needed
|
|
*/
|
|
if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice,
|
|
row_info.old_pk_slice))
|
|
{
|
|
*found= false;
|
|
return 0;
|
|
}
|
|
|
|
*pk_changed= true;
|
|
}
|
|
|
|
/*
|
|
Perform a read to determine if a duplicate entry exists. For primary
|
|
keys, a point lookup will be sufficient.
|
|
|
|
note: we intentionally don't set options.snapshot here. We want to read
|
|
the latest committed data.
|
|
*/
|
|
|
|
/*
|
|
To prevent race conditions like below, it is necessary to
|
|
take a lock for a target row. get_for_update() holds a gap lock if
|
|
target key does not exist, so below conditions should never
|
|
happen.
|
|
|
|
1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
|
|
-> T2 Put(overwrite) -> T2 commit
|
|
2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
|
|
-> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
|
|
*/
|
|
rocksdb::Status s= row_info.tx->get_for_update(m_pk_descr->get_cf(),
|
|
row_info.new_pk_slice,
|
|
&m_retrieved_record);
|
|
if (!s.ok() && !s.IsNotFound())
|
|
{
|
|
return row_info.tx->set_status_error(table->in_use, s,
|
|
m_key_descr_arr[key_id], m_tbl_def);
|
|
}
|
|
|
|
*found= !s.IsNotFound();
|
|
return 0;
|
|
}
|
|
|
|
int ha_rocksdb::check_and_lock_sk(uint key_id,
|
|
const struct update_row_info& row_info,
|
|
bool* found) const
|
|
{
|
|
DBUG_ASSERT(found != nullptr);
|
|
*found= false;
|
|
|
|
/*
|
|
Can skip checking this key if none of the key fields have changed.
|
|
*/
|
|
if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id))
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
KEY* key_info= nullptr;
|
|
uint n_null_fields= 0;
|
|
uint user_defined_key_parts= 1;
|
|
|
|
key_info= &table->key_info[key_id];
|
|
user_defined_key_parts= key_info->user_defined_key_parts;
|
|
/*
|
|
If there are no uniqueness requirements, there's no need to obtain a
|
|
lock for this key.
|
|
*/
|
|
if (!(key_info->flags & HA_NOSAME))
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[key_id];
|
|
|
|
/*
|
|
Calculate the new key for obtaining the lock
|
|
|
|
For unique secondary indexes, the key used for locking does not
|
|
include the extended fields.
|
|
*/
|
|
int size= kd->pack_record(table, m_pack_buffer, row_info.new_data,
|
|
m_sk_packed_tuple, nullptr, false, 0,
|
|
user_defined_key_parts, &n_null_fields);
|
|
if (n_null_fields > 0)
|
|
{
|
|
    /*
      If any fields are marked as NULL this will never match another row, as
      NULL never matches anything else, including another NULL.
    */
|
|
return 0;
|
|
}
|
|
|
|
rocksdb::Slice new_slice= rocksdb::Slice((const char*)m_sk_packed_tuple,
|
|
size);
|
|
|
|
/*
|
|
For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
|
|
always require locking.
|
|
*/
|
|
if (row_info.old_data != nullptr)
|
|
{
|
|
size= kd->pack_record(table, m_pack_buffer, row_info.old_data,
|
|
m_sk_packed_tuple_old, nullptr, false,
|
|
row_info.hidden_pk_id,
|
|
user_defined_key_parts);
|
|
rocksdb::Slice old_slice= rocksdb::Slice(
|
|
(const char*)m_sk_packed_tuple_old, size);
|
|
|
|
/*
|
|
For updates, if the keys are the same, then no lock is needed
|
|
|
|
Also check to see if the key has any fields set to NULL. If it does, then
|
|
this key is unique since NULL is not equal to each other, so no lock is
|
|
needed.
|
|
*/
|
|
if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice))
|
|
{
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
Perform a read to determine if a duplicate entry exists - since this is
|
|
a secondary indexes a range scan is needed.
|
|
|
|
note: we intentionally don't set options.snapshot here. We want to read
|
|
the latest committed data.
|
|
*/
|
|
|
|
bool all_parts_used= (user_defined_key_parts == kd->get_key_parts());
|
|
|
|
/*
|
|
This iterator seems expensive since we need to allocate and free
|
|
memory for each unique index.
|
|
|
|
If this needs to be optimized, for keys without NULL fields, the
|
|
extended primary key fields can be migrated to the value portion of the
|
|
key. This enables using Get() instead of Seek() as in the primary key
|
|
case.
|
|
|
|
The bloom filter may need to be disabled for this lookup.
|
|
*/
|
|
bool total_order_seek=
|
|
!can_use_bloom_filter(ha_thd(), kd, new_slice, all_parts_used,
|
|
is_ascending(m_key_descr_arr[key_id],
|
|
HA_READ_KEY_EXACT));
|
|
bool fill_cache= !THDVAR(ha_thd(), skip_fill_cache);
|
|
|
|
/*
|
|
psergey-todo: we just need to take lock, lookups not needed:
|
|
*/
|
|
std::string dummy_value;
|
|
rocksdb::Status s= row_info.tx->get_for_update(kd->get_cf(), new_slice,
|
|
&dummy_value);
|
|
if (!s.ok() && !s.IsNotFound())
|
|
{
|
|
return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
|
|
}
|
|
|
|
rocksdb::Iterator *iter= row_info.tx->get_iterator(kd->get_cf(),
|
|
total_order_seek,
|
|
fill_cache,
|
|
true /* read current data */,
|
|
false /* acquire snapshot */);
|
|
/*
|
|
Need to scan the transaction to see if there is a duplicate key.
|
|
Also need to scan RocksDB and verify the key has not been deleted
|
|
in the transaction.
|
|
*/
|
|
iter->Seek(new_slice);
|
|
*found= !read_key_exact(kd, iter, all_parts_used, new_slice);
|
|
delete iter;
|
|
|
|
return 0;
|
|
}
|
|
|
|
int ha_rocksdb::check_uniqueness_and_lock(
|
|
const struct update_row_info& row_info,
|
|
bool* pk_changed)
|
|
{
|
|
/*
|
|
Go through each index and determine if the index has uniqueness
|
|
requirements. If it does, then try to obtain a row lock on the new values.
|
|
Once all locks have been obtained, then perform the changes needed to
|
|
update/insert the row.
|
|
*/
|
|
for (uint key_id= 0; key_id < m_tbl_def->m_key_count; key_id++)
|
|
{
|
|
bool found;
|
|
int rc;
|
|
|
|
if (is_pk(key_id, table, m_tbl_def))
|
|
{
|
|
rc= check_and_lock_unique_pk(key_id, row_info, &found, pk_changed);
|
|
}
|
|
else
|
|
{
|
|
rc= check_and_lock_sk(key_id, row_info, &found);
|
|
}
|
|
|
|
if (rc != 0)
|
|
{
|
|
return rc;
|
|
}
|
|
|
|
if (found)
|
|
{
|
|
/* There is a row with this key already, so error out. */
|
|
errkey= key_id;
|
|
m_dupp_errkey= errkey;
|
|
return HA_ERR_FOUND_DUPP_KEY;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
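
/*
  Example (illustrative): assuming a table with a user-defined PRIMARY KEY
  plus one UNIQUE secondary key, an INSERT that reaches update_write_row()
  drives the two helpers above roughly as follows:

    check_uniqueness_and_lock(row_info, &pk_changed)
      key_id=0 -> check_and_lock_unique_pk()  // locks the new PK value
      key_id=1 -> check_and_lock_sk()         // locks the new unique SK value

  If either helper reports *found == true, HA_ERR_FOUND_DUPP_KEY is returned
  before any write is buffered into the transaction.
*/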

int ha_rocksdb::bulk_load_key(Rdb_transaction* tx,
                              const std::shared_ptr<const Rdb_key_def>& kd,
                              const rocksdb::Slice& key,
                              const rocksdb::Slice& value)
{
  rocksdb::ColumnFamilyHandle* cf= kd->get_cf();
  DBUG_ASSERT(cf != nullptr);

  if (m_sst_info == nullptr)
  {
    m_sst_info=
      std::make_shared<Rdb_sst_info>(rdb, m_table_handler->m_table_name,
                                     kd->get_name(), cf, rocksdb_db_options);
    tx->start_bulk_load(this);
  }

  DBUG_ASSERT(m_sst_info != nullptr);

  int rc= m_sst_info->put(key, value);
  if (rc != 0)
  {
    my_printf_error(ER_UNKNOWN_ERROR,
                    "Failed to add a key to sst file writer(%s)", MYF(0),
                    m_sst_info->error_message().c_str());
    rc= HA_ERR_INTERNAL_ERROR;
  }

  return rc;
}

int ha_rocksdb::finalize_bulk_load()
{
  int rc= 0;

  if (m_sst_info != nullptr)
  {
    rc= m_sst_info->commit();
    if (rc != 0)
    {
      my_printf_error(ER_UNKNOWN_ERROR,
                      "Failed to commit bulk loaded sst file to the "
                      "data store (%s)",
                      MYF(0), m_sst_info->error_message().c_str());
      rc= HA_ERR_INTERNAL_ERROR;
    }

    m_sst_info= nullptr;

    Rdb_transaction *tx= get_or_create_tx(ha_thd());
    if (tx != nullptr)
    {
      tx->end_bulk_load(this);
    }
  }

  return rc;
}
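
/*
  Example (illustrative): the expected calling sequence for the bulk load
  path, assuming rocksdb_enable_bulk_load_api=ON and the session variable
  bulk_load=ON:

    update_pk() -> bulk_load_key(tx, kd, key, value)  // once per row; the
                                                      // first call creates
                                                      // m_sst_info and calls
                                                      // tx->start_bulk_load()
    ...
    finalize_bulk_load()                              // commits the SST file
                                                      // and calls
                                                      // tx->end_bulk_load()
*/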

int ha_rocksdb::update_pk(const std::shared_ptr<const Rdb_key_def>& kd,
                          const struct update_row_info& row_info,
                          bool pk_changed)
{
  uint key_id= kd->get_keyno();
  bool hidden_pk= is_hidden_pk(key_id, table, m_tbl_def);
  if (!hidden_pk && pk_changed)
  {
    /*
      The old key needs to be deleted.
    */
    rocksdb::Status s= delete_or_singledelete(key_id, row_info.tx, kd->get_cf(),
                                              row_info.old_pk_slice);
    if (!s.ok())
    {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def);
    }
  }

  if (table->next_number_field)
  {
    update_auto_incr_val();
  }

  rocksdb::Slice value_slice;
  convert_record_to_storage_format(row_info.new_pk_slice,
                                   row_info.new_pk_unpack_info,
                                   &value_slice);

  int rc= 0;
  auto cf= m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk)
  {
    /*
      Write the primary key directly to an SST file using an SstFileWriter.
    */
    rc= bulk_load_key(row_info.tx, kd, row_info.new_pk_slice,
                      value_slice);
  }
  else if (row_info.skip_unique_check)
  {
    /*
      It is the responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_blind_write_batch()->Put(cf, row_info.new_pk_slice,
                                              value_slice);
  }
  else if (row_info.tx->m_ddl_transaction)
  {
    /*
      DDL statement must check for unique key conflicts. For example:
      ALTER TABLE tbl DROP PRIMARY KEY, ADD PRIMARY KEY(non_unique_column)
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  }
  else
  {
    auto s= row_info.tx->put(cf, row_info.new_pk_slice, value_slice);
    if (!s.ok())
    {
      if (s.IsBusy())
      {
        errkey= table->s->primary_key;
        m_dupp_errkey= errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      }
      else
      {
        rc = row_info.tx->set_status_error(table->in_use, s, m_pk_descr,
                                           m_tbl_def);
      }
    }
  }

  return rc;
}

int ha_rocksdb::update_sk(const TABLE* table_arg,
                          const std::shared_ptr<const Rdb_key_def>& kd,
                          const struct update_row_info& row_info)
{
  int new_packed_size;
  int old_packed_size;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  uint key_id = kd->get_keyno();
  /*
    Can skip updating this key if none of the key fields have changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id))
  {
    return 0;
  }

  bool store_checksums= should_store_checksums();

  new_packed_size= kd->pack_record(table_arg, m_pack_buffer, row_info.new_data,
                                   m_sk_packed_tuple, &m_sk_tails,
                                   store_checksums,
                                   row_info.hidden_pk_id);

  if (row_info.old_data != nullptr)
  {
    // The old value
    old_packed_size= kd->pack_record(table_arg, m_pack_buffer,
                                     row_info.old_data,
                                     m_sk_packed_tuple_old, &m_sk_tails_old,
                                     store_checksums,
                                     row_info.hidden_pk_id);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple,
               old_packed_size) == 0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0)
    {
      return 0;
    }

    /*
      Deleting entries from the secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
      statement)
    */
    old_key_slice= rocksdb::Slice(
        reinterpret_cast<const char*>(m_sk_packed_tuple_old),
        old_packed_size);

    row_info.tx->get_indexed_write_batch()->SingleDelete(kd->get_cf(),
                                                         old_key_slice);
  }

  /*
    We're writing a new entry for the secondary key. We can skip locking; we
    should write to
    - WriteBatchWithIndex normally (so that the transaction sees the new row)
    - non-indexed WriteBatch, when we don't need to see the new row:
      = when doing a DDL operation and writing to a non-unique index, or
      = when doing a bulk load
  */
  rocksdb::WriteBatchBase *write_batch;
  if ((row_info.tx->m_ddl_transaction &&
       !(table_arg->key_info[key_id].flags & HA_NOSAME)) ||
      row_info.skip_unique_check)
  {
    write_batch= row_info.tx->get_blind_write_batch();
  }
  else
  {
    write_batch= row_info.tx->get_indexed_write_batch();
  }

  new_key_slice= rocksdb::Slice(
      reinterpret_cast<const char*>(m_sk_packed_tuple), new_packed_size);
  new_value_slice= rocksdb::Slice(
      reinterpret_cast<const char*>(m_sk_tails.ptr()),
      m_sk_tails.get_current_pos());

  write_batch->Put(kd->get_cf(), new_key_slice, new_value_slice);

  return 0;
}

int ha_rocksdb::update_indexes(const struct update_row_info& row_info,
                               bool pk_changed)
{
  int rc;

  for (uint key_id= 0; key_id < m_tbl_def->m_key_count; key_id++)
  {
    const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[key_id];
    if (is_pk(key_id, table, m_tbl_def))
    {
      rc= update_pk(kd, row_info, pk_changed);
    }
    else
    {
      rc= update_sk(table, kd, row_info);
    }

    if (rc != 0)
    {
      return rc;
    }
  }

  return 0;
}

int ha_rocksdb::update_write_row(const uchar *old_data,
                                 const uchar *new_data,
                                 const bool skip_unique_check)
{
  bool pk_changed= false;
  struct update_row_info row_info;

  row_info.old_data= old_data;
  row_info.new_data= new_data;
  row_info.skip_unique_check= skip_unique_check;
  row_info.new_pk_unpack_info= nullptr;

  DBUG_ENTER("ha_rocksdb::write_row_common");

  set_last_rowkey(old_data);

  row_info.tx= get_or_create_tx(table->in_use);

  if (old_data != nullptr)
  {
    row_info.old_pk_slice= rocksdb::Slice(m_last_rowkey.ptr(),
                                          m_last_rowkey.length());

    /* Determine which indexes need updating. */
    calc_updated_indexes();
  }

  /*
    Get the new row key into row_info.new_pk_slice
  */
  int rc= get_pk_for_update(&row_info);
  if (rc != 0)
  {
    DBUG_RETURN(rc);
  }

  if (!skip_unique_check)
  {
    /*
      Check to see if we are going to have failures because of unique
      keys. Also lock the appropriate key values.
    */
    rc= check_uniqueness_and_lock(row_info, &pk_changed);
    if (rc != 0)
    {
      DBUG_RETURN(rc);
    }
  }

  DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");

  /*
    At this point, all locks have been obtained, and all checks for duplicate
    keys have been performed. No further errors can be allowed to occur from
    here because updates to the transaction will be made and those updates
    cannot be easily removed without rolling back the entire transaction.
  */
  rc= update_indexes(row_info, pk_changed);
  if (rc != 0)
  {
    DBUG_RETURN(rc);
  }

  if (do_bulk_commit(row_info.tx))
  {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  DBUG_RETURN(0);
}


/*
  Open a cursor and position it at the passed record.
*/

void ha_rocksdb::setup_scan_iterator(
    const std::shared_ptr<const Rdb_key_def>& kd,
    rocksdb::Slice *slice,
    const bool use_all_keys,
    const bool is_ascending,
    const uint eq_cond_len)
{
  DBUG_ASSERT(kd != nullptr);
  DBUG_ASSERT(slice != nullptr);
  DBUG_ASSERT(slice->size() >= eq_cond_len);

  Rdb_transaction *tx= get_or_create_tx(table->in_use);

  bool skip_bloom= true;

  rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
  if (can_use_bloom_filter(ha_thd(), kd, eq_cond, use_all_keys, is_ascending))
  {
    skip_bloom= false;
  }

  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query but the bloom filter cannot always be used.
    Consider the following example, where id2 is VARCHAR(30) and
    PRIMARY KEY (id1, id2):
      select count(*) from t2
      WHERE id1=100 and id2 IN ('00000000000000000000', '100');
    In this case, setup_scan_iterator() is called twice, the first time for
    (id1, id2)=(100, '00000000000000000000') and the second time for
    (100, '100'). If the prefix bloom filter length is 24 bytes, the prefix
    bloom filter can be used for the first condition but not for the second
    one. If the bloom filter condition changes, currently it is necessary to
    destroy and re-create the Iterator.
  */
  if (m_scan_it_skips_bloom != skip_bloom)
  {
    release_scan_iterator();
  }

  /*
    SQL layer can call rnd_init() multiple times in a row.
    In that case, re-use the iterator, but re-position it at the table start.
  */
  if (!m_scan_it)
  {
    bool fill_cache= !THDVAR(ha_thd(), skip_fill_cache);
    m_scan_it= tx->get_iterator(kd->get_cf(), skip_bloom, fill_cache);
    m_scan_it_skips_bloom= skip_bloom;
  }
  /*
    Seek() will "Position at the first key in the source that is at or past
    target". The operation cannot fail.
  */
  m_scan_it->Seek(*slice);
}


void ha_rocksdb::setup_iterator_for_rnd_scan()
{
  uint key_size;

  if (m_pk_descr->m_is_reverse_cf)
    m_pk_descr->get_supremum_key(m_pk_packed_tuple, &key_size);
  else
    m_pk_descr->get_infimum_key(m_pk_packed_tuple, &key_size);

  rocksdb::Slice table_key((const char*)m_pk_packed_tuple, key_size);

  setup_scan_iterator(m_pk_descr, &table_key);
  m_skip_scan_it_next_call= true;
}


int ha_rocksdb::rnd_init(bool scan)
{
  DBUG_ENTER("ha_rocksdb::rnd_init");

  Rdb_transaction *tx= get_or_create_tx(table->in_use);
  setup_read_decoders();

  if (scan)
  {
    m_rnd_scan_is_new_snapshot= !tx->has_snapshot();
    setup_iterator_for_rnd_scan();
  }
  else
  {
    /* We don't need any preparations for rnd_pos() calls. */
  }

  // If m_lock_rows is on then we will be doing a get_for_update when accessing
  // the index, so don't acquire the snapshot right away. Otherwise acquire
  // the snapshot immediately.
  tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);

  DBUG_RETURN(0);
}


int ha_rocksdb::rnd_next(uchar *buf)
{
  int rc;
  for (;;)
  {
    rc= rnd_next_with_direction(buf, true);
    if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot)
      break; /* exit the loop */

    // release the snapshot and iterator and then regenerate them
    Rdb_transaction *tx= get_or_create_tx(table->in_use);
    tx->release_snapshot();
    release_scan_iterator();
    setup_iterator_for_rnd_scan();
  }

  m_rnd_scan_is_new_snapshot= false;

  if (rc == HA_ERR_KEY_NOT_FOUND)
    rc = HA_ERR_END_OF_FILE;
  return rc;
}


/*
  See also secondary_index_read().
*/
int ha_rocksdb::rnd_next_with_direction(uchar *buf, bool move_forward)
{
  int rc;
  DBUG_ENTER("ha_rocksdb::rnd_next");

  table->status= STATUS_NOT_FOUND;
  stats.rows_requested++;

  if (!m_scan_it || !m_scan_it->Valid())
  {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  if (m_skip_scan_it_next_call)
  {
    m_skip_scan_it_next_call= false;
  }
  else
  {
    if (move_forward)
      m_scan_it->Next(); /* this call cannot fail */
    else
      m_scan_it->Prev(); /* this call cannot fail */
  }

  if (m_scan_it->Valid())
  {
    /* check if we're out of this table */
    rocksdb::Slice key= m_scan_it->key();
    if (!m_pk_descr->covers_key(key))
    {
      rc= HA_ERR_END_OF_FILE;
    }
    else
    {
      if (m_lock_rows != RDB_LOCK_NONE)
      {
        /*
          Lock the row we've just read.

          Now we call get_for_update which will 1) Take a lock and 2) Will fail
          if the row was deleted since the snapshot was taken.
        */
        Rdb_transaction *tx= get_or_create_tx(table->in_use);
        DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");
        rocksdb::Status s= tx->get_for_update(m_pk_descr->get_cf(), key,
                                              &m_retrieved_record);
        if (!s.ok())
        {
          DBUG_RETURN(tx->set_status_error(table->in_use, s, m_pk_descr,
                                           m_tbl_def));
        }

        // If we called get_for_update() use the value from that call, not from
        // the iterator, as it may be stale since we don't have a snapshot
        // when m_lock_rows is not RDB_LOCK_NONE.
        m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
        rc= convert_record_from_storage_format(&key, buf);
      }
      else
      {
        // Use the value from the iterator
        rocksdb::Slice value= m_scan_it->value();
        m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
        rc= convert_record_from_storage_format(&key, &value, buf);
      }

      table->status= 0;
    }
  }
  else
    rc= HA_ERR_END_OF_FILE;

  if (!rc) {
    stats.rows_read++;
    stats.rows_index_next++;
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}


int ha_rocksdb::rnd_end()
{
  DBUG_ENTER("ha_rocksdb::rnd_end");

  release_scan_iterator();

  DBUG_RETURN(0);
}

int ha_rocksdb::index_init(uint idx, bool sorted)
{
  Rdb_transaction *tx= get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  setup_read_decoders();

  // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
  // when accessing the index, so don't acquire the snapshot right away.
  // Otherwise acquire the snapshot immediately.
  tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);

  active_index= idx;

  return 0;
}

int ha_rocksdb::index_end()
{
  release_scan_iterator();

  active_index= MAX_KEY;
  in_range_check_pushed_down= FALSE;
  return 0;
}


int ha_rocksdb::delete_all_rows(ha_rows* nrows)
{
  DBUG_ENTER("ha_rocksdb::delete_all_rows");
  /*
    We will have to delete records one-by-one anyway. Let the SQL layer do
    it.
  */
  DBUG_RETURN(HA_ERR_WRONG_COMMAND);
}

int ha_rocksdb::truncate()
{
  m_tbl_def->m_auto_incr_val= 1;
  m_tbl_def->m_hidden_pk_val= 1;
  remove_rows(m_tbl_def);
  return 0;
}


/*
  Delete the row we've last read. The row is also passed as a parameter.

  @detail
    The caller guarantees table buf points to the row that was just read.
    The row is either table->record[0] or table->record[1].
    (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)")
*/
int ha_rocksdb::delete_row(const uchar *buf)
{
  DBUG_ENTER("ha_rocksdb::delete_row");

  set_last_rowkey(buf);

  rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
  Rdb_transaction *tx= get_or_create_tx(table->in_use);

  uint index= pk_index(table, m_tbl_def);
  rocksdb::Status s= delete_or_singledelete(index, tx,
                                            m_pk_descr->get_cf(),
                                            key_slice);
  if (!s.ok())
  {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, m_pk_descr, m_tbl_def));
  }

  longlong hidden_pk_id= 0;
  if (m_tbl_def->m_key_count > 1 &&
      has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);

  // Delete the record for every secondary index
  for (uint i= 0; i < m_tbl_def->m_key_count; i++)
  {
    if (!is_pk(i, table, m_tbl_def))
    {
      int packed_size;
      const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[i];
      packed_size= kd->pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
                                   nullptr, false, hidden_pk_id);
      rocksdb::Slice secondary_key_slice(
          reinterpret_cast<const char*>(m_sk_packed_tuple), packed_size);
      /* Deleting on secondary key doesn't need any locks: */
      tx->get_indexed_write_batch()->SingleDelete(kd->get_cf(),
                                                  secondary_key_slice);
    }
  }

  if (do_bulk_commit(tx))
  {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }
  stats.rows_deleted++;
  update_row_stats(ROWS_DELETED);

  DBUG_RETURN(0);
}


rocksdb::Status
ha_rocksdb::delete_or_singledelete(uint index,
                                   Rdb_transaction *tx,
                                   rocksdb::ColumnFamilyHandle* column_family,
                                   const rocksdb::Slice& key)
{
  if (can_use_single_delete(index))
    return tx->single_delete(column_family, key);
  return tx->delete_key(column_family, key);
}


void ha_rocksdb::update_stats(void)
{
  DBUG_ENTER("ha_rocksdb::update_stats");

  stats.records= 0;
  stats.index_file_length= 0ul;
  stats.data_file_length= 0ul;
  stats.mean_rec_length= 0;
  for (uint i= 0; i < m_tbl_def->m_key_count; i++)
  {
    if (is_pk(i, table, m_tbl_def))
    {
      stats.data_file_length= m_pk_descr->m_stats.m_actual_disk_size;
      stats.records = m_pk_descr->m_stats.m_rows;
    }
    else
    {
      stats.index_file_length+= m_key_descr_arr[i]->m_stats.m_actual_disk_size;
    }
  }
  DBUG_VOID_RETURN;
}

int ha_rocksdb::info(uint flag)
{
  DBUG_ENTER("ha_rocksdb::info");

  if (!table)
    return 1;

  if (flag & HA_STATUS_VARIABLE)
  {
    /*
      Test only to simulate corrupted stats
    */
    DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
                    m_pk_descr->m_stats.m_actual_disk_size =
                      -m_pk_descr->m_stats.m_actual_disk_size;);

    update_stats();

    /*
      If any stats are negative due to bad cached stats, re-run analyze table
      and re-retrieve the stats.
    */
    if (static_cast<longlong>(stats.data_file_length) < 0 ||
        static_cast<longlong>(stats.index_file_length) < 0 ||
        static_cast<longlong>(stats.records) < 0)
    {
      if (analyze(nullptr, nullptr)) {
        DBUG_RETURN(1);
      }

      update_stats();
    }

    if (stats.records == 0)
    {
      // most likely, the table is in the memtable;
      // try to deduce its size from GetApproximateSizes
      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2];
      auto r= get_range(pk_index(table, m_tbl_def), buf);
      uint64_t sz= 0;
      rdb->GetApproximateSizes(
        m_pk_descr->get_cf(),
        &r, 1,
        &sz, true);
      stats.records = sz/ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
      stats.data_file_length = sz;

      if (rocksdb_debug_optimizer_n_rows > 0)
        stats.records = rocksdb_debug_optimizer_n_rows;
    }

    if (stats.records != 0)
      stats.mean_rec_length = stats.data_file_length / stats.records;
  }
  if (flag & HA_STATUS_CONST)
  {
    ref_length= m_pk_descr->max_storage_fmt_length();

    // TODO: Needs to be reimplemented after having real index statistics
    for (uint i= 0; i < m_tbl_def->m_key_count; i++)
    {
      if (is_hidden_pk(i, table, m_tbl_def))
      {
        continue;
      }
      KEY* k= &table->key_info[i];
      for (uint j = 0; j < k->ext_key_parts; j++)
      {
        const Rdb_index_stats& k_stats= m_key_descr_arr[i]->m_stats;
        uint x = k_stats.m_distinct_keys_per_prefix.size() > j &&
                 k_stats.m_distinct_keys_per_prefix[j] > 0 ?
                 k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j] :
                 0;
        if (x > stats.records)
          x = stats.records;
        if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
            rocksdb_debug_optimizer_n_rows > 0)
        {
          // Fake cardinality implementation. For example, an (idx1, idx2, idx3)
          // index will have rec_per_key for (idx1)=4, (idx1,2)=2, and
          // (idx1,2,3)=1.
          // rec_per_key for the whole index is 1, and multiplied by 2^n if
          // n suffix columns of the index are not used.
          x = 1 << (k->ext_key_parts-j-1);
        }
        k->rec_per_key[j]= x;
      }
    }
  }

  if (flag & HA_STATUS_ERRKEY)
  {
    /*
      Currently we support only primary keys so we know which key had a
      uniqueness violation.
    */
    errkey= m_dupp_errkey;
    dup_ref= m_pk_tuple; // TODO(?): this should store packed PK.
  }

  if (flag & HA_STATUS_AUTO)
  {
    stats.auto_increment_value= m_tbl_def->m_auto_incr_val;
  }

  DBUG_RETURN(0);
}

void ha_rocksdb::position(const uchar *record)
{
  DBUG_ENTER("ha_rocksdb::position");

  longlong hidden_pk_id= 0;
  if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
    DBUG_ASSERT(false); // should never reach here

  /*
    Get packed primary key value from the record.

    (Note: m_last_rowkey has the packed PK of the last-read row, which allows
    handling most cases, but there is an exception to this: when the slave
    applies RBR events, it fills the record and then calls position();
    rnd_pos())

    Also note that we *can* rely on PK column values being available. This is
    because table_flags() includes the HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
    bit. When that is true, table->prepare_for_position() adds PK columns into
    the read set (this may potentially disable index-only access when a PK
    column cannot be restored from its mem-comparable form in the secondary
    indexes).
  */
  uint packed_size= m_pk_descr->pack_record(table, m_pack_buffer, record,
                                            ref, nullptr, false, hidden_pk_id);

  /*
    It could be that the mem-comparable form of the PK occupies less than
    ref_length bytes. Fill the remainder with zeros.
  */
  if (ref_length > packed_size)
    memset(ref + packed_size, 0, ref_length - packed_size);

  DBUG_VOID_RETURN;
}


int ha_rocksdb::rnd_pos(uchar *buf, uchar *pos)
{
  int rc;
  size_t len;
  DBUG_ENTER("ha_rocksdb::rnd_pos");

  stats.rows_requested++;
  len= m_pk_descr->key_length(table, rocksdb::Slice((const char*)pos,
                                                    ref_length));
  if (len == size_t(-1))
  {
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR); /* Data corruption? */
  }

  rc= get_row_by_rowid(buf, pos, len);

  if (!rc)
  {
    stats.rows_read++;
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}


/*
  @brief
  Calculate (if needed) the bitmap of indexes that are modified by the
  current query

  @detail
    The calculation is done by checking index definitions against the
    table->write_set column bitmap.
*/

void ha_rocksdb::calc_updated_indexes()
{
  if (!m_update_scope_is_valid)
  {
    m_update_scope_is_valid= true;
    m_update_scope.clear_all();

    for (uint keynr= 0; keynr < table->s->keys; keynr++)
    {
      const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[keynr];
      /* Walk over all key parts, including the "extended key" suffix */
      uint key_parts= kd->get_key_parts();
      for (uint kp= 0; kp < key_parts; kp++)
      {
        if (has_hidden_pk(table) && kp + 1 == key_parts)
          break;

        Field *field= kd->get_table_field_for_part_no(table, kp);
        if (bitmap_is_set(table->write_set, field->field_index))
        {
          m_update_scope.set_bit(keynr);
          break;
        }
      }
    }
  }
}
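
/*
  Example (illustrative): for a table defined as
    CREATE TABLE t (id INT PRIMARY KEY, a INT, b INT, KEY k_a(a), KEY k_b(b))
  the statement
    UPDATE t SET a=a+1 WHERE id=5
  puts only column 'a' in table->write_set, so calc_updated_indexes() sets the
  m_update_scope bit only for k_a; update_sk() and check_and_lock_sk() can
  then skip k_b entirely.
*/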


int ha_rocksdb::update_row(const uchar *old_data, uchar *new_data)
{
  DBUG_ENTER("ha_rocksdb::update_row");
  DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
  /*
    old_data points to the record we're updating. It is the same as the record
    we've just read (for multi-table UPDATE, too, because the SQL layer will
    make an rnd_pos() call to re-read the record before calling update_row())
  */
  DBUG_ASSERT(new_data == table->record[0]);

  int rv;
  rv= update_write_row(old_data, new_data, false);

  if (rv == 0)
  {
    stats.rows_updated++;
    update_row_stats(ROWS_UPDATED);
  }
  DBUG_RETURN(rv);
}


/* The following function was copied from ha_blackhole::store_lock: */
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *thd,
                                       THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type)
{
  DBUG_ENTER("ha_rocksdb::store_lock");

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE)
  {
    m_lock_rows = RDB_LOCK_WRITE;
  }
  else if (lock_type == TL_READ_WITH_SHARED_LOCKS)
  {
    m_lock_rows = RDB_LOCK_READ;
  }
  else
  {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows))
    {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type))
      {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE &&
              sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE)))
        {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK)
  {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd))
    {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables)
    {
      lock_type = TL_READ;
    }

    m_db_lock.type= lock_type;
  }
  *to++= &m_db_lock;
  DBUG_RETURN(to);
}

void ha_rocksdb::read_thd_vars(THD *thd)
{
  m_store_checksums= THDVAR(thd, store_checksums);
  m_verify_checksums= THDVAR(thd, verify_checksums);
  m_checksums_pct= THDVAR(thd, checksums_pct);
}

int ha_rocksdb::external_lock(THD *thd, int lock_type)
{
  int res= 0;
  DBUG_ENTER("ha_rocksdb::external_lock");

  int binlog_format= my_core::thd_binlog_format(thd);
  bool unsafe_for_binlog= THDVAR(ha_thd(), unsafe_for_binlog);
  if (lock_type == F_WRLCK && !thd->rli_slave &&
      !unsafe_for_binlog &&
      binlog_format != BINLOG_FORMAT_ROW &&
      binlog_format != BINLOG_FORMAT_UNSPEC &&
      my_core::thd_binlog_filter_ok(thd))
  {
    my_printf_error(ER_UNKNOWN_ERROR,
                    "Can't execute updates on master with binlog_format != ROW.",
                    MYF(0));
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  if (lock_type == F_UNLCK)
  {
    Rdb_transaction* tx= get_or_create_tx(thd);

    tx->io_perf_end_and_record(&m_io_perf);
    if (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
    {
      /*
        Do like InnoDB: when we get here, it's time to commit a
        single-statement transaction.

        If the statement involved multiple tables, this code will be executed
        for each of them, but that's ok because non-first tx->commit() calls
        will be no-ops.
      */
      res= tx->commit_or_rollback();
    }
  }
  else
  {
    if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
        my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ)
    {
      my_printf_error(ER_UNKNOWN_ERROR,
                      "MyRocks supports only READ COMMITTED and "
                      "REPEATABLE READ isolation levels. "
                      "Please change from current isolation "
                      "level %s",
                      MYF(0),
                      tx_isolation_names[my_core::thd_tx_isolation(thd)]);
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
    }
    /*
      It's nice to do the following on start of every statement. The problem
      is, handler->start_stmt() is not called for INSERTs.
      So, we put this code here.
    */
    Rdb_transaction* tx= get_or_create_tx(thd);
    read_thd_vars(thd);

    m_update_scope_is_valid= false;

    if (skip_unique_check())
    {
      if ((thd->lex->sql_command == SQLCOM_INSERT ||
           thd->lex->sql_command == SQLCOM_LOAD ||
           thd->lex->sql_command == SQLCOM_REPLACE) &&
          (thd->lex->duplicates == DUP_REPLACE ||
           thd->lex->duplicates == DUP_UPDATE))
      {
        my_printf_error(ER_UNKNOWN_ERROR,
                        "When unique checking is disabled in MyRocks, INSERT, "
                        "UPDATE, LOAD statements with clauses that update or "
                        "replace the key (i.e. "
                        "INSERT ON DUPLICATE KEY UPDATE, REPLACE) are not "
                        "allowed. Query: %s",
                        MYF(0), thd->query());
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    }

    if (lock_type == F_WRLCK)
    {
      if (tx->is_tx_read_only())
      {
        my_printf_error(ER_UNKNOWN_ERROR,
                        "Can't execute updates when you started a transaction "
                        "with START TRANSACTION WITH CONSISTENT [ROCKSDB] "
                        "SNAPSHOT.",
                        MYF(0));
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }

      /*
        SQL layer signals us to take a write lock. It does so when starting a
        DML statement. We should put locks on the rows we're reading.

        Note: sometimes, external_lock() can be called without a prior
        ::store_lock call. That's why we need to set lock_* members here, too.
      */
      m_lock_rows= RDB_LOCK_WRITE;

      if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
          thd->lex->sql_command == SQLCOM_DROP_INDEX ||
          thd->lex->sql_command == SQLCOM_ALTER_TABLE)
      {
        tx->m_ddl_transaction= true;
      }
    }
    rocksdb_register_tx(rocksdb_hton, thd, tx);
    tx->io_perf_start(&m_io_perf);
  }

  DBUG_RETURN(res);
}


/**
  @note
  A quote from ha_innobase::start_stmt():
  <quote>
  MySQL calls this function at the start of each SQL statement inside LOCK
  TABLES. Inside LOCK TABLES the ::external_lock method does not work to
  mark SQL statement borders.
  </quote>
*/

int ha_rocksdb::start_stmt(THD *thd, thr_lock_type lock_type)
{
  Rdb_transaction* tx= get_or_create_tx(thd);
  read_thd_vars(thd);
  rocksdb_register_tx(ht, thd, tx);
  tx->io_perf_start(&m_io_perf);
  return 0;
}

rocksdb::Range get_range(
    uint32_t i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2], int offset1,
    int offset2)
{
  uchar* buf_begin= buf;
  uchar* buf_end= buf + Rdb_key_def::INDEX_NUMBER_SIZE;
  rdb_netbuf_store_index(buf_begin, i + offset1);
  rdb_netbuf_store_index(buf_end, i + offset2);

  return rocksdb::Range(
    rocksdb::Slice((const char*) buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
    rocksdb::Slice((const char*) buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
}

static rocksdb::Range get_range(
    const std::shared_ptr<const Rdb_key_def>& kd,
    uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2],
    int offset1, int offset2)
{
  return get_range(kd->get_index_number(), buf, offset1, offset2);
}

rocksdb::Range get_range(const std::shared_ptr<const Rdb_key_def>& kd,
                         uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2])
{
  if (kd->m_is_reverse_cf)
  {
    return myrocks::get_range(kd, buf, 1, 0);
  }
  else
  {
    return myrocks::get_range(kd, buf, 0, 1);
  }
}
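
/*
  Example (illustrative, assuming the usual fixed-size big-endian encoding of
  index numbers): for a key definition with index number 5 in a forward
  column family, the helpers above produce the range
    [ 0x00000005, 0x00000006 )
  i.e. exactly the keys belonging to that index. For a reverse column family
  the offsets are swapped so the range still matches the CF's storage order.
*/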

rocksdb::Range ha_rocksdb::get_range(
    int i, uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2]) const
{
  return myrocks::get_range(m_key_descr_arr[i], buf);
}


/*
  Drop index thread's main logic
*/

void Rdb_drop_index_thread::run()
{
  mysql_mutex_lock(&m_signal_mutex);

  for (;;) {
    // The stop flag might be set by the shutdown command after
    // drop_index_thread releases the signal mutex (i.e. while executing an
    // expensive Seek()). To prevent drop_index_thread from entering a long
    // cond_timedwait, we need to check the stop flag here, with the signal
    // mutex held.
    if (m_stop) {
      break;
    }

    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    ts.tv_sec += dict_manager.is_drop_index_empty()
      ? 24*60*60 // no filtering
      : 60;      // filtering

    auto ret __attribute__((__unused__)) = mysql_cond_timedwait(
        &m_signal_cond, &m_signal_mutex, &ts);
    if (m_stop) {
      break;
    }
    // make sure no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    mysql_mutex_unlock(&m_signal_mutex);

    std::vector<GL_INDEX_ID> indices;
    dict_manager.get_ongoing_drop_indexes(&indices);
    if (!indices.empty()) {
      std::unordered_set<GL_INDEX_ID> finished;
      rocksdb::ReadOptions read_opts;
      read_opts.total_order_seek = true; // disable bloom filter

      for (auto d : indices) {
        uint32 cf_flags= 0;
        if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags))
        {
          sql_print_error("RocksDB: Failed to get column family flags "
                          "from cf id %u. MyRocks data dictionary may "
                          "get corrupted.", d.cf_id);
          abort_with_stack_traces();
        }
        rocksdb::ColumnFamilyHandle* cfh= cf_manager.get_cf(d.cf_id);
        DBUG_ASSERT(cfh);
        bool is_reverse_cf= cf_flags & Rdb_key_def::REVERSE_CF_FLAG;

        bool index_removed= false;
        uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE]= {0};
        rdb_netbuf_store_uint32(key_buf, d.index_id);
        rocksdb::Slice key = rocksdb::Slice((char*)key_buf, sizeof(key_buf));
        uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2];
        rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf?1:0,
                                         is_reverse_cf?0:1);
        rocksdb::CompactRangeOptions compact_range_options;
        compact_range_options.bottommost_level_compaction =
          rocksdb::BottommostLevelCompaction::kForce;
        compact_range_options.exclusive_manual_compaction = false;
        rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
                                                    &range.start, &range.limit);
        if (!status.ok())
        {
          if (status.IsShutdownInProgress())
          {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        status = rdb->CompactRange(
            compact_range_options, cfh, &range.start, &range.limit);
        if (!status.ok())
        {
          if (status.IsShutdownInProgress())
          {
            break;
          }
          rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
        }
        std::unique_ptr<rocksdb::Iterator> it(
            rdb->NewIterator(read_opts, cfh));
        it->Seek(key);
        if (is_reverse_cf)
        {
          if (!it->Valid())
          {
            it->SeekToLast();
          }
          else
          {
            it->Prev();
          }
        }
        if (!it->Valid())
        {
          index_removed= true;
        }
        else
        {
          if (memcmp(it->key().data(), key_buf, Rdb_key_def::INDEX_NUMBER_SIZE))
          {
            // Key does not have the same prefix
            index_removed= true;
          }
        }
        if (index_removed)
        {
          finished.insert(d);
        }
      }

      if (!finished.empty()) {
        dict_manager.finish_drop_indexes(finished);
      }
    }
    mysql_mutex_lock(&m_signal_mutex);
  }

  mysql_mutex_unlock(&m_signal_mutex);
}


Rdb_tbl_def* ha_rocksdb::get_table_if_exists(const char* tablename)
{
  DBUG_ASSERT(tablename != nullptr);

  std::string str;
  if (rdb_normalize_tablename(tablename, &str))
  {
    // We were not passed a table name?
    DBUG_ASSERT(0);
    return nullptr;
  }

  return ddl_manager.find(str);
}

/*
  Note: the following function is called when the table is not open. That is,
  this->table==nullptr, pk_key_descr==nullptr, etc.

  tablename points to a string in the form "./dbname/tablename".
*/

int ha_rocksdb::delete_table(const char *tablename)
{
  DBUG_ENTER("ha_rocksdb::delete_table");
  std::unique_ptr<rocksdb::WriteBatch> wb= dict_manager.begin();
  rocksdb::WriteBatch *batch= wb.get();

  /* Find the table in the hash */
  Rdb_tbl_def *tbl = get_table_if_exists(tablename);
  if (!tbl)
    DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
  dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);

  /*
    Remove the table entry in the data dictionary (this will also remove it
    from the persistent data dictionary).
  */
  ddl_manager.remove(tbl, batch, true);
  if (dict_manager.commit(batch))
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);

  rdb_drop_idx_thread.signal();

  DBUG_RETURN(0);
}


void ha_rocksdb::remove_rows(Rdb_tbl_def *tbl)
{
  rocksdb::WriteOptions wo= rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  opts.total_order_seek= true;
  rocksdb::Iterator* it= rdb->NewIterator(opts);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  /*
    Remove all records in each index.
    (This is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on the rocksdb side)
  */
  for (uint i= 0; i < tbl->m_key_count ; i++)
  {
    const std::shared_ptr<const Rdb_key_def>& kd= tbl->m_key_descr_arr[i];
    kd->get_infimum_key(reinterpret_cast<uchar*>(key_buf), &key_len);

    rocksdb::Slice table_key(key_buf, key_len);
    it->Seek(table_key);
    while (it->Valid())
    {
      rocksdb::Slice key= it->key();
      if (!kd->covers_key(key))
      {
        break;
      }

      if (can_use_single_delete(i))
        rdb->SingleDelete(wo, key);
      else
        rdb->Delete(wo, key);
      it->Next();
    }
  }
  delete it;
}


int ha_rocksdb::rename_table(const char *from, const char *to)
{
  DBUG_ASSERT(from != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool write_err= false;
  std::string from_str;
  std::string to_str;
  std::string from_db;
  std::string to_db;

  if (rdb_normalize_tablename(from, &from_str) ||
      rdb_split_normalized_tablename(from_str, &from_db))
  {
    return HA_ERR_INTERNAL_ERROR;
  }

  if (rdb_normalize_tablename(to, &to_str) ||
      rdb_split_normalized_tablename(to_str, &to_db))
  {
    return HA_ERR_INTERNAL_ERROR;
  }

  // If the user changed the database part of the name then validate that the
  // 'to' database exists.
  if (from_db != to_db && !rdb_database_exists(to_db))
  {
    return HA_ERR_INTERNAL_ERROR;
  }

  /* Check if any index has a per-index column family */
  {
    /*
      We can't check this directly, because
      1. the object that referred to the old table has been close()d.
      2. this ha_rocksdb object has no connection to any table at all, it has
         been just created with (TABLE*)nullptr.

      So, we have to find the old table in the ddl_manager (it's there because
      it has been opened by this statement), and check is_auto_cf there.
    */
    Rdb_tbl_def *tdef;
    if (!(tdef= ddl_manager.find(from_str)))
      return HA_ERR_INTERNAL_ERROR;

    for (uint i= 0; i < tdef->m_key_count; i++)
    {
      DBUG_ASSERT(tdef->m_key_descr_arr != nullptr);

      if (tdef->m_key_descr_arr[i]->m_is_auto_cf)
      {
        my_error(ER_NOT_SUPPORTED_YET, MYF(0),
                 "ALTER TABLE on table with per-index CF");
        return HA_ERR_INTERNAL_ERROR;
      }
    }
  }

  std::unique_ptr<rocksdb::WriteBatch> wb= dict_manager.begin();
  rocksdb::WriteBatch *batch= wb.get();
  dict_manager.lock();
  write_err= ddl_manager.rename(from_str, to_str, batch)
             || dict_manager.commit(batch);
  dict_manager.unlock();

  return write_err ? HA_ERR_INTERNAL_ERROR : 0;
}

/**
  check_if_incompatible_data() is called if ALTER TABLE can't detect otherwise
  whether the new and old definitions are compatible.

  @details If there are no other explicit signs, like a changed number of
  fields, this function will be called by compare_tables()
  (sql/sql_tables.cc) to decide whether we should rewrite the whole table or
  only the .frm file.
*/

bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *info,
                                            uint table_changes)
{
  DBUG_ENTER("ha_rocksdb::check_if_incompatible_data");
  // this function is needed only for online alter-table
  DBUG_RETURN(COMPATIBLE_DATA_NO);
}


int ha_rocksdb::extra(enum ha_extra_function operation)
{
  DBUG_ENTER("ha_rocksdb::extra");

  switch (operation) {
  case HA_EXTRA_KEYREAD:
    m_keyread_only= true;
    break;
  case HA_EXTRA_NO_KEYREAD:
    m_keyread_only= false;
    break;
  case HA_EXTRA_FLUSH:
    /*
      If the table has blobs, then they are part of m_retrieved_record.
      This call invalidates them.
    */
    m_retrieved_record.clear();
    break;
  default:
    break;
  }

  DBUG_RETURN(0);
}

/*
  Given a starting key and an ending key, estimate the number of rows that
  will exist between the two keys.
*/
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *min_key,
                                     key_range *max_key)
{
  DBUG_ENTER("ha_rocksdb::records_in_range");
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  if (table->force_index) {
    ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[inx];
  DBUG_ASSERT(kd);

  uint size1=0;
  if (min_key) {
    size1 = kd->pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                 min_key->key, min_key->keypart_map);
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY)
    {
      kd->successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd->get_infimum_key(m_sk_packed_tuple, &size1);
  }

  uint size2=0;
  if (max_key) {
    size2 = kd->pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                 max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY)
    {
      kd->successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with 0xFF bytes to make sure it is above the lower key
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old+size2, 0xff, size1-size2);
      size2 = size1;
    }
  } else {
    kd->get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  rocksdb::Slice slice1((const char*) m_sk_packed_tuple, size1);
  rocksdb::Slice slice2((const char*) m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match
  if (slice1.compare(slice2) >= 0)
  {
    DBUG_RETURN(0);
  }

  rocksdb::Range r(
    kd->m_is_reverse_cf ? slice2 : slice1,
    kd->m_is_reverse_cf ? slice1 : slice2
  );

  uint64_t sz=0;
  auto disk_size = kd->m_stats.m_actual_disk_size;
  if (disk_size == 0)
    disk_size = kd->m_stats.m_data_size;
  auto rows = kd->m_stats.m_rows;
  if (rows == 0 || disk_size == 0)
  {
    rows= 1;
    disk_size= ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }
  // Get statistics, including from Memtables
  rdb->GetApproximateSizes(
    kd->get_cf(),
    &r, 1,
    &sz, true);
  ret = rows*sz/disk_size;
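  /*
    For example (illustrative numbers): if the index statistics say
    rows=1,000,000 and disk_size=1GB, and GetApproximateSizes() reports
    sz=10MB for the requested range, the estimate is
    1,000,000 * 10MB / 1GB, i.e. about 10,000 rows.
  */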

  /*
    GetApproximateSizes() gives estimates, so ret might exceed stats.records.
    MySQL then decides to use a full index scan rather than a range scan,
    which is not efficient in most cases.
    To prevent this, we make the estimated record count slightly smaller than
    stats.records.
  */
  if (ret >= stats.records)
  {
    ret = stats.records * 0.99;
  }

  if (rocksdb_debug_optimizer_n_rows > 0)
  {
    ret= rocksdb_debug_optimizer_n_rows;
  }
  else if (ret == 0)
  {
    ret = 1;
  }

  DBUG_RETURN(ret);
}


void ha_rocksdb::update_create_info(HA_CREATE_INFO *create_info)
{
  if (!(create_info->used_fields & HA_CREATE_USED_AUTO))
  {
    create_info->auto_increment_value= m_tbl_def->m_auto_incr_val;
  }
}

/**
  @brief
  Doing manual compaction on OPTIMIZE TABLE in RocksDB.
  Compaction itself is executed by a background thread in RocksDB, but
  CompactRange() waits until compaction completes, so this function
  may take a long time.
  Since the RocksDB dataset is allocated per index id, OPTIMIZE TABLE
  triggers manual compaction for all indexes of the table.
  @details
  The compaction range is from the beginning of the index id to
  the first row of the next index id. When using a reverse order
  column family, the first row of the next index id should be
  the last row of the previous index id.
*/
int ha_rocksdb::optimize(THD *thd, HA_CHECK_OPT* check_opt)
{
  int rc= 0;
  for (uint i= 0; i < table->s->keys; i++)
  {
    uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE*2];
    auto range = get_range(i, buf);
    if (!rdb->CompactRange(rocksdb::CompactRangeOptions(),
                           m_key_descr_arr[i]->get_cf(),
                           &range.start, &range.limit).ok())
    {
      rc= 1;
      break;
    }
  }
  return rc;
}

int ha_rocksdb::analyze(THD* thd, HA_CHECK_OPT* check_opt)
{
  DBUG_ENTER("ha_rocksdb::analyze");

  if (!table)
    DBUG_RETURN(1);

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle*, std::vector<rocksdb::Range>>
    ranges;
  std::unordered_set<GL_INDEX_ID> ids_to_check;
  std::vector<uchar> buf(table->s->keys * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);
  for (uint i = 0; i < table->s->keys; i++)
  {
    auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
    const std::shared_ptr<const Rdb_key_def>& kd= m_key_descr_arr[i];
    ranges[kd->get_cf()].push_back(get_range(i, bufp));
    ids_to_check.insert(kd->get_gl_index_id());
  }

  // for analyze statements, force a memtable flush to get accurate cardinality
  Rdb_cf_manager& cf_manager= rdb_get_cf_manager();
  if (thd != nullptr && THDVAR(thd, flush_memtable_on_analyze) &&
      !rocksdb_pause_background_work)
  {
    for (auto it : ids_to_check)
    {
      rdb->Flush(rocksdb::FlushOptions(), cf_manager.get_cf(it.cf_id));
    }
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (auto it : ranges)
  {
    auto old_size __attribute__((__unused__)) = props.size();
    auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok())
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  int num_sst= 0;
  // group stats per index id
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  for (auto it : ids_to_check)
  {
    // Initialize the stats to 0. If there are no files that contain
    // this gl_index_id, then 0 should be stored for the cached stats.
    stats[it] = Rdb_index_stats(it);
  }
  for (auto it : props)
  {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (auto it1 : sst_stats)
    {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of the table's entries (and so we can't update statistics based on
        that).
      */
      if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end())
        continue;

      const std::shared_ptr<const Rdb_key_def>& kd=
        ddl_manager.find(it1.m_gl_index_id);
      stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length());
    }
    num_sst++;
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(0);
}

void ha_rocksdb::get_auto_increment(ulonglong offset, ulonglong increment,
                                    ulonglong nb_desired_values,
                                    ulonglong *first_value,
                                    ulonglong *nb_reserved_values)
{
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  *first_value= m_tbl_def->m_auto_incr_val++;
  *nb_reserved_values= 1;
}
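
/*
  Example (illustrative): because m_auto_incr_val is atomic, two sessions
  inserting concurrently each obtain a distinct value from the post-increment
  above (say 42 and 43) without any table-level mutex. A value handed out to a
  statement that later rolls back is not reused, so gaps in the sequence are
  possible - the usual auto-increment trade-off.
*/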
|
|
|
|
|
|
#ifndef DBUG_OFF
|
|
|
|
/* Debugger help function */
|
|
static char dbug_item_print_buf[512];
|
|
|
|
const char *dbug_print_item(Item *item)
|
|
{
|
|
char *buf= dbug_item_print_buf;
|
|
String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
|
|
str.length(0);
|
|
if (!item)
|
|
return "(Item*)nullptr";
|
|
item->print(&str ,QT_ORDINARY);
|
|
if (str.c_ptr() == buf)
|
|
return buf;
|
|
else
|
|
return "Couldn't fit into buffer";
|
|
}
|
|
|
|
#endif /*DBUG_OFF*/
|
|
|
|
|
|
/**
|
|
SQL layer calls this function to push an index condition.
|
|
|
|
@details
|
|
The condition is for index keyno (only one condition can be pushed at a
|
|
time).
|
|
The caller guarantees that condition refers only to index fields; besides
|
|
that, fields must have
|
|
|
|
$field->part_of_key.set_bit(keyno)
|
|
|
|
which means that
|
|
|
|
(handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
|
|
|
|
which means that field value can be restored from the index tuple.
|
|
|
|
@return
|
|
Part of condition we couldn't check (always nullptr).
|
|
*/
|
|
|
|
class Item* ha_rocksdb::idx_cond_push(uint keyno, class Item* idx_cond)
|
|
{
|
|
DBUG_ASSERT(keyno != MAX_KEY);
|
|
DBUG_ASSERT(idx_cond != nullptr);
|
|
|
|
pushed_idx_cond = idx_cond;
|
|
pushed_idx_cond_keyno = keyno;
|
|
in_range_check_pushed_down = TRUE;
|
|
|
|
/* We will check the whole condition */
|
|
return nullptr;
|
|
}
|
|
|
|
|
|
/*
|
|
@brief
|
|
Check the index condition.
|
|
|
|
@detail
|
|
Check the index condition. (The caller has unpacked all needed index
|
|
columns into table->record[0])
|
|
|
|
@return
|
|
ICP_NO_MATCH - Condition not satisfied (caller should continue
|
|
scanning)
|
|
OUT_OF_RANGE - We've left the range we're scanning (caller should
|
|
stop scanning and return HA_ERR_END_OF_FILE)
|
|
|
|
ICP_MATCH - Condition is satisfied (caller should fetch the record
|
|
and return it)
|
|
*/
|
|
|
|
enum icp_result ha_rocksdb::check_index_cond()
|
|
{
|
|
DBUG_ASSERT(pushed_idx_cond);
|
|
DBUG_ASSERT(pushed_idx_cond_keyno != MAX_KEY);
|
|
|
|
if (end_range && compare_key_icp(end_range) > 0)
|
|
{
|
|
/* caller should return HA_ERR_END_OF_FILE already */
|
|
return ICP_OUT_OF_RANGE;
|
|
}
|
|
|
|
return pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
|
|
}
|
|
|
|
|
|
/**
|
|
Checking if an index is used for ascending scan or not
|
|
|
|
@detail
|
|
Currently RocksDB does not support bloom filter for
|
|
prefix lookup + descending scan, but supports bloom filter for
|
|
prefix lookup + ascending scan. This function returns true if
|
|
the scan pattern is absolutely ascending.
|
|
@param kd
|
|
@param find_flag
|
|
*/
|
|
bool ha_rocksdb::is_ascending(const std::shared_ptr<const Rdb_key_def>& kd,
|
|
enum ha_rkey_function find_flag) const
|
|
{
|
|
DBUG_ASSERT(kd != nullptr);
|
|
|
|
bool is_ascending;
|
|
switch (find_flag) {
|
|
case HA_READ_KEY_EXACT:
|
|
{
|
|
is_ascending = !kd->m_is_reverse_cf;
|
|
break;
|
|
}
|
|
case HA_READ_PREFIX:
|
|
{
|
|
is_ascending= true;
|
|
break;
|
|
}
|
|
case HA_READ_KEY_OR_NEXT:
|
|
case HA_READ_AFTER_KEY:
|
|
{
|
|
is_ascending = !kd->m_is_reverse_cf;
|
|
break;
|
|
}
|
|
case HA_READ_KEY_OR_PREV:
|
|
case HA_READ_BEFORE_KEY:
|
|
case HA_READ_PREFIX_LAST:
|
|
case HA_READ_PREFIX_LAST_OR_PREV:
|
|
{
|
|
is_ascending = kd->m_is_reverse_cf;
|
|
break;
|
|
}
|
|
default:
|
|
is_ascending= false;
|
|
}
|
|
return is_ascending;
|
|
}
|
|
|
|
/*
|
|
Checks if inplace alter is supported for a given operation.
|
|
*/
|
|
|
|
my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
|
|
TABLE* /* altered_table */,
|
|
my_core::Alter_inplace_info *ha_alter_info)
|
|
{
|
|
DBUG_ENTER("check_if_supported_alter");
|
|
|
|
if (ha_alter_info->handler_flags &
|
|
~(my_core::Alter_inplace_info::DROP_INDEX |
|
|
my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
|
|
my_core::Alter_inplace_info::ADD_INDEX))
|
|
{
|
|
DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
|
|
}
|
|
|
|
DBUG_RETURN(my_core::HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
|
|
}

/**
  Allows the storage engine to update internal structures with concurrent
  writes blocked. If check_if_supported_inplace_alter() returns
  HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE or
  HA_ALTER_INPLACE_SHARED_AFTER_PREPARE, this function is called with an
  exclusive lock, otherwise the same level of locking as for
  inplace_alter_table() will be used.

  @note Storage engines are responsible for reporting any errors by
  calling my_error()/print_error()

  @note If this function reports an error, commit_inplace_alter_table()
  will be called with commit= false.

  @note For partitioning, failing to prepare one partition means that
  commit_inplace_alter_table() will be called to roll back changes for
  all partitions. This means that commit_inplace_alter_table() might be
  called without prepare_inplace_alter_table() having been called first
  for a given partition.

  @param altered_table TABLE object for new version of table.
  @param ha_alter_info Structure describing changes to be done
                       by ALTER TABLE and holding data used
                       during in-place alter.

  @retval true  Error
  @retval false Success
*/
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *altered_table,
    my_core::Alter_inplace_info *ha_alter_info)
{
  DBUG_ENTER("prepare_inplace_alter_table");
  uint n_dropped_keys= ha_alter_info->index_drop_count;
  uint n_added_keys= ha_alter_info->index_add_count;
  uint old_n_keys= m_tbl_def->m_key_count;
  uint new_n_keys= altered_table->s->keys;
  if (has_hidden_pk(altered_table))
  {
    new_n_keys += 1;
  }

  std::shared_ptr<Rdb_key_def>* old_key_descr= m_tbl_def->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def>* new_key_descr=
      new std::shared_ptr<Rdb_key_def>[new_n_keys];

  Rdb_tbl_def* new_tdef= new Rdb_tbl_def(m_tbl_def->full_tablename());
  new_tdef->m_key_descr_arr= new_key_descr;
  new_tdef->m_key_count= new_n_keys;
  new_tdef->m_auto_incr_val=
      m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
  new_tdef->m_hidden_pk_val=
      m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

  if (ha_alter_info->handler_flags &
      (my_core::Alter_inplace_info::DROP_INDEX |
       my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
       my_core::Alter_inplace_info::ADD_INDEX)
      && create_key_defs(altered_table, new_tdef, table, m_tbl_def))
  {
    /* Delete the new key descriptors */
    delete[] new_key_descr;

    /*
      Explicitly mark as nullptr so we don't accidentally remove entries
      from data dictionary on cleanup (or cause double delete[]).
    */
    new_tdef->m_key_descr_arr= nullptr;
    delete new_tdef;
    DBUG_RETURN(1);
  }

  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;

  uint i;
  uint j;
  const KEY* key;

  /* Determine which (if any) key definition(s) need to be dropped */
  for (i = 0; i < ha_alter_info->index_drop_count; i++)
  {
    const KEY* key = ha_alter_info->index_drop_buffer[i];
    for (j = 0; j < old_n_keys; j++)
    {
      if (!old_key_descr[j]->m_name.compare(key->name))
      {
        dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
        break;
      }
    }
  }

  DBUG_ASSERT(dropped_index_ids.size() == ha_alter_info->index_drop_count);

  /* Determine which (if any) key definition(s) need to be added */
  for (i = 0; i < ha_alter_info->index_add_count; i++)
  {
    key = &ha_alter_info->key_info_buffer[
        ha_alter_info->index_add_buffer[i]];
    for (j = 0; j < new_n_keys; j++)
    {
      if (!new_key_descr[j]->m_name.compare(key->name))
      {
        added_indexes.insert(new_key_descr[j]);
        break;
      }
    }
  }

  DBUG_ASSERT(added_indexes.size() == ha_alter_info->index_add_count);

  ha_alter_info->handler_ctx= new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys);

  DBUG_RETURN(0);
}
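/*
  Worked example (informal): for "ALTER TABLE t DROP INDEX i1, ADD INDEX i2(b)"
  the loops above match i1 by name against the existing key descriptors and
  record its GL_INDEX_ID in dropped_index_ids, while i2 is matched against the
  freshly created descriptors and its shared_ptr is recorded in added_indexes.
  Both sets travel to inplace_alter_table() and commit_inplace_alter_table()
  inside the Rdb_inplace_alter_ctx allocated here.
*/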

/**
  Alter the table structure in-place with operations specified using
  HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
  during this operation depends on the return value from
  check_if_supported_inplace_alter().

  @note Storage engines are responsible for reporting any errors by
  calling my_error()/print_error()

  @note If this function reports an error, commit_inplace_alter_table()
  will be called with commit= false.

  @param altered_table TABLE object for new version of table.
  @param ha_alter_info Structure describing changes to be done
                       by ALTER TABLE and holding data used
                       during in-place alter.

  @retval true  Error
  @retval false Success
*/
bool ha_rocksdb::inplace_alter_table(TABLE* altered_table,
                                     my_core::Alter_inplace_info *ha_alter_info)
{
  /* Do nothing for now */
  DBUG_ENTER("inplace_alter_table");
  DBUG_ASSERT(ha_alter_info->handler_ctx);

  Rdb_inplace_alter_ctx* ctx=
    static_cast<Rdb_inplace_alter_ctx*> (ha_alter_info->handler_ctx);

  if (ha_alter_info->handler_flags & my_core::Alter_inplace_info::ADD_INDEX)
  {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();
    if (alloc_key_buffers(altered_table, ctx->m_new_tdef))
    {
      DBUG_RETURN(1);
    }

    /* Populate all new secondary keys by scanning the primary key */
    if (inplace_populate_sk(altered_table, ctx->m_added_indexes))
    {
      DBUG_RETURN(1);
    }
  }
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", DBUG_RETURN(1););

  DBUG_RETURN(0);
}

/**
  Scan the Primary Key index entries and populate the new secondary keys.
*/
int ha_rocksdb::inplace_populate_sk(const TABLE* new_table_arg,
    const std::unordered_set<std::shared_ptr<Rdb_key_def>>& indexes)
{
  DBUG_ENTER("ha_rocksdb::inplace_populate_sk");
  std::unique_ptr<rocksdb::WriteBatch> wb= dict_manager.begin();
  rocksdb::WriteBatch *batch= wb.get();

  /* Update the data dictionary */
  std::unordered_set<GL_INDEX_ID> create_index_ids;
  for (auto& index : indexes)
  {
    create_index_ids.insert(index->get_gl_index_id());
  }
  dict_manager.add_create_index(create_index_ids, batch);
  dict_manager.commit(batch);

  const bool hidden_pk_exists = has_hidden_pk(table);

  int res= 0;
  Rdb_transaction *tx;
  ulonglong rdb_merge_buf_size= THDVAR(ha_thd(), merge_buf_size);
  ulonglong rdb_merge_combine_read_size= THDVAR(ha_thd(),
                                                merge_combine_read_size);

  for (auto& index : indexes)
  {
    const rocksdb::Comparator* index_comp= index->get_cf()->GetComparator();
    Rdb_index_merge rdb_merge(rdb_merge_buf_size, rdb_merge_combine_read_size,
                              index_comp);

    if ((res= rdb_merge.init()))
    {
      DBUG_RETURN(res);
    }

    /*
      Note: We pass in the currently existing table + tbl_def object here,
      as the pk index position may have changed in the case of hidden primary
      keys.
    */
    uint pk= pk_index(table, m_tbl_def);
    ha_index_init(pk, true);

    /* Scan each record in the primary key in order */
    for (res = index_first(table->record[0]); res == 0;
         res = index_next(table->record[0]))
    {
      longlong hidden_pk_id= 0;
      if (hidden_pk_exists &&
          read_hidden_pk_id_from_rowkey(&hidden_pk_id))
      {
        // NO_LINT_DEBUG
        sql_print_error("Error retrieving hidden pk id.");
        ha_index_end();
        DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }

      /* Create new secondary index entry */
      int new_packed_size= index->pack_record(new_table_arg, m_pack_buffer,
                                              table->record[0],
                                              m_sk_packed_tuple, &m_sk_tails,
                                              should_store_checksums(),
                                              hidden_pk_id);

      rocksdb::Slice key= rocksdb::Slice(
          reinterpret_cast<const char*>(m_sk_packed_tuple), new_packed_size);
      rocksdb::Slice val= rocksdb::Slice(
          reinterpret_cast<const char*>(m_sk_tails.ptr()),
          m_sk_tails.get_current_pos());

      /*
        Add record to offset tree in preparation for writing out to
        disk in sorted chunks.
      */
      if ((res= rdb_merge.add(key, val)))
      {
        ha_index_end();
        DBUG_RETURN(res);
      }
    }

    if (res != HA_ERR_END_OF_FILE)
    {
      // NO_LINT_DEBUG
      sql_print_error("Error retrieving index entry from primary key.");
      ha_index_end();
      DBUG_RETURN(res);
    }

    tx= get_or_create_tx(table->in_use);
    ha_index_end();

    /*
      Perform an n-way merge of n sorted buffers on disk, then write all
      results to RocksDB via the SSTFileWriter API.
    */
    rocksdb::Slice merge_key;
    rocksdb::Slice merge_val;
    while ((res= rdb_merge.next(&merge_key, &merge_val)) == 0)
    {
      /*
        Insert key and slice to SST via SSTFileWriter API.
      */
      if ((res= bulk_load_key(tx, index, merge_key, merge_val)))
      {
        break;
      }
    }

    /*
      Here, res == -1 means that we are finished, while > 0 means an error
      occurred.
    */
    if (res > 0)
    {
      // NO_LINT_DEBUG
      sql_print_error("Error while bulk loading keys in external merge sort.");
      DBUG_RETURN(res);
    }

    if ((res= tx->finish_bulk_load()))
    {
      // NO_LINT_DEBUG
      sql_print_error("Error finishing bulk load.");
      DBUG_RETURN(res);
    }
  }

  /*
    Explicitly tell jemalloc to clean up any unused dirty pages at this point.
    See https://reviews.facebook.net/D63723 for more details.
  */
  purge_all_jemalloc_arenas();

  DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
  DBUG_RETURN(res);
}
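/*
  A minimal sketch of the flow above (informal, error handling elided):

    for each new index:
      rdb_merge.init();                       // per-index sort buffers
      for each PK record (index_first/index_next):
        pack_record() -> (sk_key, sk_value);  // build secondary-key entry
        rdb_merge.add(sk_key, sk_value);      // buffered, spilled sorted to disk
      while (rdb_merge.next(&k, &v) == 0)     // n-way merge of sorted chunks
        bulk_load_key(tx, index, k, v);       // appended to an SST file
      tx->finish_bulk_load();                 // finalize/ingest the SST file
*/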

/**
  Commit or rollback the changes made during prepare_inplace_alter_table()
  and inplace_alter_table() inside the storage engine.
  Note that in case of rollback the allowed level of concurrency during
  this operation will be the same as for inplace_alter_table() and thus
  might be higher than during prepare_inplace_alter_table(). (For example,
  concurrent writes were blocked during prepare, but might not be during
  rollback).

  @note Storage engines are responsible for reporting any errors by
  calling my_error()/print_error()

  @note If this function with commit= true reports an error, it will be called
  again with commit= false.

  @note In case of partitioning, this function might be called for rollback
  without prepare_inplace_alter_table() having been called first.
  Also, partitioned tables set ha_alter_info->group_commit_ctx to a
  NULL-terminated array of the partitions' handlers; if all of them are
  committed as one, then group_commit_ctx should be set to NULL to indicate
  to the partitioning handler that all partition handlers are committed.
  @see prepare_inplace_alter_table().

  @param altered_table TABLE object for new version of table.
  @param ha_alter_info Structure describing changes to be done
                       by ALTER TABLE and holding data used
                       during in-place alter.
  @param commit        True => Commit, False => Rollback.

  @retval true  Error
  @retval false Success
*/
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *altered_table,
    my_core::Alter_inplace_info *ha_alter_info,
    bool commit)
{
  Rdb_inplace_alter_ctx* ctx0=
    static_cast<Rdb_inplace_alter_ctx*> (ha_alter_info->handler_ctx);

  DBUG_ENTER("commit_inplace_alter_table");

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition. A successful commit call only executes once
    for all partitions.
  */
  if (!commit)
  {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0)
    {
      DBUG_RETURN(0);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr)
    {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++)
      {
        ctx0->m_new_key_descr[i]= nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    DBUG_RETURN(0);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx** ctx_array;
  inplace_alter_handler_ctx* ctx_single[2];

  if (ha_alter_info->group_commit_ctx)
  {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition",
                    DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  }
  else
  {
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  ha_alter_info->group_commit_ctx = nullptr;

  if (ha_alter_info->handler_flags &
      (my_core::Alter_inplace_info::DROP_INDEX |
       my_core::Alter_inplace_info::DROP_UNIQUE_INDEX |
       my_core::Alter_inplace_info::ADD_INDEX))
  {
    std::unique_ptr<rocksdb::WriteBatch> wb= dict_manager.begin();
    rocksdb::WriteBatch *batch= wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    m_tbl_def= ctx0->m_new_tdef;
    m_key_descr_arr= m_tbl_def->m_key_descr_arr;
    m_pk_descr= m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++)
    {
      Rdb_inplace_alter_ctx* ctx= static_cast<Rdb_inplace_alter_ctx*> (*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (auto& index : ctx->m_added_indexes)
      {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch))
      {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }
    }

    if (dict_manager.commit(batch))
    {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(create_index_ids,
                                          Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
    rdb_drop_idx_thread.signal();
  }

  DBUG_RETURN(0);
}

#define SHOW_FNAME(name) rocksdb_show_##name

#define DEF_SHOW_FUNC(name, key)                                          \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR *var, char *buff)   \
  {                                                                       \
    rocksdb_status_counters.name =                                        \
      rocksdb_stats->getTickerCount(rocksdb::key);                        \
    var->type = SHOW_LONGLONG;                                            \
    var->value = (char *)&rocksdb_status_counters.name;                   \
    return 0;                                                             \
  }

#define DEF_STATUS_VAR(name) \
  {"rocksdb_" #name, (char*) &SHOW_FNAME(name), SHOW_FUNC}

#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  {"rocksdb_" name, (char*) ptr, option}

#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  {name, reinterpret_cast<char*>(ptr), option}
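/*
  For reference, DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS) used below
  expands (roughly) to:

    static int rocksdb_show_block_cache_miss(MYSQL_THD thd, SHOW_VAR *var,
                                             char *buff)
    {
      rocksdb_status_counters.block_cache_miss =
        rocksdb_stats->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
      var->type = SHOW_LONGLONG;
      var->value = (char *)&rocksdb_status_counters.block_cache_miss;
      return 0;
    }

  and the matching DEF_STATUS_VAR(block_cache_miss) entry registers that
  function under the "rocksdb_block_cache_miss" status variable name.
*/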

struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t bloom_filter_useful;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t l0_slowdown_micros;
  uint64_t memtable_compaction_micros;
  uint64_t l0_num_files_stall_micros;
  uint64_t rate_limit_delay_millis;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t sequence_number;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};

static rocksdb_status_counters_t rocksdb_status_counters;

DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(l0_slowdown_micros, STALL_L0_SLOWDOWN_MICROS)
DEF_SHOW_FUNC(memtable_compaction_micros, STALL_MEMTABLE_COMPACTION_MICROS)
DEF_SHOW_FUNC(l0_num_files_stall_micros, STALL_L0_NUM_FILES_MICROS)
DEF_SHOW_FUNC(rate_limit_delay_millis, RATE_LIMIT_DELAY_MILLIS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(sequence_number, SEQUENCE_NUMBER)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)

static void myrocks_update_status() {
  export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
  export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
  export_stats.rows_read = global_stats.rows[ROWS_READ];
  export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];

  export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
  export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
  export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
  export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
}

static SHOW_VAR myrocks_status_variables[]= {
  DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                      SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                      SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                      SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("system_rows_deleted", &export_stats.system_rows_deleted,
                      SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("system_rows_inserted",
                      &export_stats.system_rows_inserted, SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                      SHOW_LONGLONG),
  DEF_STATUS_VAR_FUNC("system_rows_updated", &export_stats.system_rows_updated,
                      SHOW_LONGLONG),

  {NullS, NullS, SHOW_LONG}
};

static void show_myrocks_vars(THD* thd, SHOW_VAR* var, char* buff) {
  myrocks_update_status();
  var->type = SHOW_ARRAY;
  var->value = reinterpret_cast<char*>(&myrocks_status_variables);
}
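/*
  Example (illustrative): because show_myrocks_vars() exposes the array above
  as SHOW_ARRAY under the "rocksdb" entry in rocksdb_status_vars, a statement
  such as "SHOW GLOBAL STATUS LIKE 'rocksdb_rows%'" is expected to return one
  row per DEF_STATUS_VAR_FUNC entry (e.g. rocksdb_rows_inserted,
  rocksdb_rows_read), each backed by the counters refreshed in
  myrocks_update_status().
*/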

static SHOW_VAR rocksdb_status_vars[]= {
  DEF_STATUS_VAR(block_cache_miss),
  DEF_STATUS_VAR(block_cache_hit),
  DEF_STATUS_VAR(block_cache_add),
  DEF_STATUS_VAR(block_cache_index_miss),
  DEF_STATUS_VAR(block_cache_index_hit),
  DEF_STATUS_VAR(block_cache_filter_miss),
  DEF_STATUS_VAR(block_cache_filter_hit),
  DEF_STATUS_VAR(block_cache_data_miss),
  DEF_STATUS_VAR(block_cache_data_hit),
  DEF_STATUS_VAR(bloom_filter_useful),
  DEF_STATUS_VAR(memtable_hit),
  DEF_STATUS_VAR(memtable_miss),
  DEF_STATUS_VAR(compaction_key_drop_new),
  DEF_STATUS_VAR(compaction_key_drop_obsolete),
  DEF_STATUS_VAR(compaction_key_drop_user),
  DEF_STATUS_VAR(number_keys_written),
  DEF_STATUS_VAR(number_keys_read),
  DEF_STATUS_VAR(number_keys_updated),
  DEF_STATUS_VAR(bytes_written),
  DEF_STATUS_VAR(bytes_read),
  DEF_STATUS_VAR(no_file_closes),
  DEF_STATUS_VAR(no_file_opens),
  DEF_STATUS_VAR(no_file_errors),
  DEF_STATUS_VAR(l0_slowdown_micros),
  DEF_STATUS_VAR(memtable_compaction_micros),
  DEF_STATUS_VAR(l0_num_files_stall_micros),
  DEF_STATUS_VAR(rate_limit_delay_millis),
  DEF_STATUS_VAR(num_iterators),
  DEF_STATUS_VAR(number_multiget_get),
  DEF_STATUS_VAR(number_multiget_keys_read),
  DEF_STATUS_VAR(number_multiget_bytes_read),
  DEF_STATUS_VAR(number_deletes_filtered),
  DEF_STATUS_VAR(number_merge_failures),
  DEF_STATUS_VAR(sequence_number),
  DEF_STATUS_VAR(bloom_filter_prefix_checked),
  DEF_STATUS_VAR(bloom_filter_prefix_useful),
  DEF_STATUS_VAR(number_reseeks_iteration),
  DEF_STATUS_VAR(getupdatessince_calls),
  DEF_STATUS_VAR(block_cachecompressed_miss),
  DEF_STATUS_VAR(block_cachecompressed_hit),
  DEF_STATUS_VAR(wal_synced),
  DEF_STATUS_VAR(wal_bytes),
  DEF_STATUS_VAR(write_self),
  DEF_STATUS_VAR(write_other),
  DEF_STATUS_VAR(write_timedout),
  DEF_STATUS_VAR(write_wal),
  DEF_STATUS_VAR(flush_write_bytes),
  DEF_STATUS_VAR(compact_read_bytes),
  DEF_STATUS_VAR(compact_write_bytes),
  DEF_STATUS_VAR(number_superversion_acquires),
  DEF_STATUS_VAR(number_superversion_releases),
  DEF_STATUS_VAR(number_superversion_cleanups),
  DEF_STATUS_VAR(number_block_not_compressed),
  DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                     &rocksdb_snapshot_conflict_errors,
                     SHOW_LONGLONG),
  DEF_STATUS_VAR_PTR("number_stat_computes", &rocksdb_number_stat_computes,
                     SHOW_LONGLONG),
  DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                     SHOW_LONGLONG),
  DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                     SHOW_LONGLONG),
  DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                     &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
  DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                     SHOW_LONGLONG),
  DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                     SHOW_LONGLONG),
  {"rocksdb", reinterpret_cast<char*>(&show_myrocks_vars), SHOW_FUNC},
  {NullS, NullS, SHOW_LONG}
};


/*
  Background thread's main logic
*/

void Rdb_background_thread::run()
{
  timespec ts_next_sync;
  clock_gettime(CLOCK_REALTIME, &ts_next_sync);
  ts_next_sync.tv_sec++;

  for (;;)
  {
    // Wait for 1 second or until we receive a condition to stop the thread
    mysql_mutex_lock(&m_signal_mutex);
    auto ret __attribute__((__unused__)) = mysql_cond_timedwait(
        &m_signal_cond, &m_signal_mutex, &ts_next_sync);
    // Make sure that no program error is returned
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    bool local_stop= m_stop;
    bool local_save_stats= m_save_stats;
    reset();
    mysql_mutex_unlock(&m_signal_mutex);

    if (local_stop)
    {
      break;
    }

    if (local_save_stats)
    {
      ddl_manager.persist_stats();
    }

    // Flush the WAL if need be, but don't do it more frequently
    // than once per second
    timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    if (ts.tv_sec - ts_next_sync.tv_sec >= 1)
    {
      if (rdb && rocksdb_background_sync)
      {
        DBUG_ASSERT(!rocksdb_db_options.allow_mmap_writes);
        rocksdb::Status s= rdb->SyncWAL();
        if (!s.ok())
          rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
      ts_next_sync.tv_sec= ts.tv_sec + 1;
    }
  }

  // Save remaining stats which might've been left unsaved
  ddl_manager.persist_stats();
}
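/*
  Behaviour summary (informal): the loop blocks on m_signal_cond with roughly a
  one-second deadline, so it reacts promptly to signal() (thread stop or
  request_save_stats()) and otherwise wakes periodically to persist index
  statistics and, when rocksdb_background_sync is enabled, to flush the WAL --
  throttled, per the comment above, to about once per second.
*/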


/**
  Deciding if it is possible to use the bloom filter or not.

  @detail
  Even if a bloom filter exists, it is not always possible
  to use it. If the bloom filter is used when it should not be,
  a false negative may happen -- fewer rows than expected may be returned.
  It is the user's responsibility to use the bloom filter correctly.

  If the bloom filter does not exist, the return value does not matter because
  RocksDB does not use the bloom filter internally.

  @param kd
  @param eq_cond      Equal condition part of the key. This always includes
                      the system index id (4 bytes).
  @param use_all_keys True if all key parts are set with equal conditions.
                      This is aware of extended keys.
*/
bool can_use_bloom_filter(THD *thd,
                          const std::shared_ptr<const Rdb_key_def>& kd,
                          const rocksdb::Slice &eq_cond,
                          const bool use_all_keys,
                          bool is_ascending)
{
  bool can_use= false;

  if (THDVAR(thd, skip_bloom_filter_on_read))
  {
    return can_use;
  }

  rocksdb::Options opt = rdb->GetOptions(kd->get_cf());
  if (opt.prefix_extractor)
  {
    /*
      This is an optimized use case for CappedPrefixTransform.
      If the eq_cond length >= prefix extractor length and if
      all keys are used for equal lookup, it is
      always possible to use the bloom filter.

      The prefix bloom filter can't be used on a descending scan with
      prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
      a RocksDB limitation. On an ascending (or non-sorting) scan,
      keys longer than the capped prefix length will be truncated down
      to the capped length and the resulting key is added to the bloom filter.

      Keys shorter than the capped prefix length will be added to
      the bloom filter. When keys are looked up, key conditions
      longer than the capped length can be used; shorter key conditions
      require all parts of the key to be available
      for the short key match.
    */
    if (use_all_keys && opt.prefix_extractor->InRange(eq_cond))
      can_use= true;
    else if (!is_ascending)
      can_use= false;
    else if (opt.prefix_extractor->SameResultWhenAppended(eq_cond))
      can_use= true;
    else
      can_use= false;
  }
  else
  {
    /*
      If the prefix extractor is not defined, all key parts have to be
      used by eq_cond.
    */
    if (use_all_keys)
      can_use= true;
    else
      can_use= false;
  }

  return can_use;
}
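/*
  Illustrative examples (assumption: an index on (id1, id2) and a capped
  prefix extractor covering id1):

    WHERE id1 = 1 AND id2 = 3        -- all key parts set with equal
                                        conditions: bloom filter usable
    WHERE id1 = 1 ORDER BY id2 ASC   -- ascending prefix scan: usable when the
                                        equal-condition prefix gives the same
                                        extractor result when appended to
    WHERE id1 = 1 ORDER BY id2 DESC  -- descending prefix scan: not usable,
                                        per the RocksDB limitation noted above
*/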

/* For modules that need access to the global data structures */
rocksdb::DB *rdb_get_rocksdb_db()
{
  return rdb;
}

Rdb_cf_manager& rdb_get_cf_manager()
{
  return cf_manager;
}

rocksdb::BlockBasedTableOptions& rdb_get_table_options()
{
  return rocksdb_tbl_options;
}


int rdb_get_table_perf_counters(const char *tablename,
                                Rdb_perf_counters *counters)
{
  DBUG_ASSERT(counters != nullptr);
  DBUG_ASSERT(tablename != nullptr);

  Rdb_table_handler *table_handler;
  table_handler= rdb_open_tables.get_table_handler(tablename);
  if (table_handler == nullptr)
  {
    return HA_ERR_INTERNAL_ERROR;
  }

  counters->load(table_handler->m_table_perf_context);

  rdb_open_tables.release_table_handler(table_handler);
  return 0;
}


void rdb_handle_io_error(rocksdb::Status status, RDB_IO_ERROR_TYPE err_type)
{
  if (status.IsIOError())
  {
    switch (err_type) {
    case RDB_IO_ERROR_TX_COMMIT:
    case RDB_IO_ERROR_DICT_COMMIT:
    {
      sql_print_error("RocksDB: Failed to write to WAL - status %d, %s",
                      status.code(), status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on WAL write error.");
      abort_with_stack_traces();
      break;
    }
    case RDB_IO_ERROR_BG_THREAD:
    {
      sql_print_warning("RocksDB: BG Thread failed to write to RocksDB "
                        "- status %d, %s", status.code(),
                        status.ToString().c_str());
      break;
    }
    default:
      DBUG_ASSERT(0);
      break;
    }
  }
  else if (status.IsCorruption())
  {
    /* NO_LINT_DEBUG */
    sql_print_error("RocksDB: Data Corruption detected! %d, %s",
                    status.code(), status.ToString().c_str());
    /* NO_LINT_DEBUG */
    sql_print_error("RocksDB: Aborting because of data corruption.");
    abort_with_stack_traces();
  }
  else if (!status.ok())
  {
    switch (err_type) {
    case RDB_IO_ERROR_DICT_COMMIT:
    {
      sql_print_error("RocksDB: Failed to write to WAL (dictionary) - "
                      "status %d, %s",
                      status.code(), status.ToString().c_str());
      sql_print_error("RocksDB: Aborting on WAL write error.");
      abort_with_stack_traces();
      break;
    }
    default:
      sql_print_warning("RocksDB: Failed to write to RocksDB "
                        "- status %d, %s", status.code(),
                        status.ToString().c_str());
      break;
    }
  }
}

Rdb_dict_manager *rdb_get_dict_manager(void)
{
  return &dict_manager;
}

Rdb_ddl_manager *rdb_get_ddl_manager(void)
{
  return &ddl_manager;
}

Rdb_binlog_manager *rdb_get_binlog_manager(void)
{
  return &binlog_manager;
}


void
rocksdb_set_compaction_options(
    my_core::THD* thd __attribute__((__unused__)),
    my_core::st_mysql_sys_var* var __attribute__((__unused__)),
    void* var_ptr,
    const void* save)
{
  if (var_ptr && save) {
    *(uint64_t*)var_ptr = *(const uint64_t*) save;
  }
  Rdb_compact_params params = {
    (uint64_t)rocksdb_compaction_sequential_deletes,
    (uint64_t)rocksdb_compaction_sequential_deletes_window,
    (uint64_t)rocksdb_compaction_sequential_deletes_file_size
  };
  if (properties_collector_factory) {
    properties_collector_factory->SetCompactionParams(params);
  }
}

void rocksdb_set_table_stats_sampling_pct(
    my_core::THD* thd __attribute__((__unused__)),
    my_core::st_mysql_sys_var* var __attribute__((__unused__)),
    void* var_ptr __attribute__((__unused__)),
    const void* save)
{
  mysql_mutex_lock(&rdb_sysvars_mutex);

  uint32_t new_val= *static_cast<const uint32_t*>(save);

  if (new_val != rocksdb_table_stats_sampling_pct) {
    rocksdb_table_stats_sampling_pct = new_val;

    if (properties_collector_factory) {
      properties_collector_factory->SetTableStatsSamplingPct(
        rocksdb_table_stats_sampling_pct);
    }
  }

  mysql_mutex_unlock(&rdb_sysvars_mutex);
}

/*
  This function allows setting the rate limiter's bytes per second value,
  but only if the rate limiter is turned on, which has to be done at startup.
  If the rate is already 0 (turned off) or we are changing it to 0 (trying
  to turn it off), this function will push a warning to the client and do
  nothing.
  This is similar to the code in innodb_doublewrite_update (found in
  storage/innobase/handler/ha_innodb.cc).
*/
void
rocksdb_set_rate_limiter_bytes_per_sec(
    my_core::THD* thd,
    my_core::st_mysql_sys_var* var __attribute__((__unused__)),
    void* var_ptr __attribute__((__unused__)),
    const void* save)
{
  uint64_t new_val= *static_cast<const uint64_t*>(save);
  if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0)
  {
    /*
      If a rate_limiter was not enabled at startup we can't change it, nor
      can we disable it if one was created at startup.
    */
    push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                        ER_WRONG_ARGUMENTS,
                        "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
                        "be dynamically changed to or from 0. Do a clean "
                        "shutdown if you want to change it from or to 0.");
  }
  else if (new_val != rocksdb_rate_limiter_bytes_per_sec)
  {
    /* Apply the new value to the rate limiter and store it locally */
    DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
    rocksdb_rate_limiter_bytes_per_sec= new_val;
    rocksdb_rate_limiter->SetBytesPerSecond(new_val);
  }
}
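/*
  Example (illustrative): with the server started with a non-zero
  --rocksdb-rate-limiter-bytes-per-sec, e.g. 104857600 (100 MB/s), an online
  "SET GLOBAL rocksdb_rate_limiter_bytes_per_sec = 52428800" is applied to the
  existing rate limiter, whereas setting it to 0 (or setting any value when
  the server was started with 0) only raises the ER_WRONG_ARGUMENTS warning
  above and leaves the limiter untouched.
*/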

void rdb_set_collation_exception_list(const char *exception_list)
{
  DBUG_ASSERT(rdb_collation_exceptions != nullptr);

  if (!rdb_collation_exceptions->set_patterns(exception_list))
  {
    my_core::warn_about_bad_patterns(rdb_collation_exceptions,
                                     "strict_collation_exceptions");
  }
}

void
rocksdb_set_collation_exception_list(THD* thd,
                                     struct st_mysql_sys_var* var,
                                     void* var_ptr,
                                     const void* save)
{
  const char* val = *static_cast<const char*const*>(save);

  rdb_set_collation_exception_list(val);

  *static_cast<const char**>(var_ptr) = val;
}

void
rocksdb_set_bulk_load(THD* thd,
                      struct st_mysql_sys_var* var __attribute__((__unused__)),
                      void* var_ptr,
                      const void* save)
{
  Rdb_transaction*& tx= get_tx_from_thd(thd);

  if (tx != nullptr)
  {
    int rc= tx->finish_bulk_load();
    if (rc != 0)
    {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Error %d finalizing last SST file while "
                      "setting bulk loading variable", rc);
      abort_with_stack_traces();
    }
  }

  *static_cast<bool*>(var_ptr) = *static_cast<const bool*>(save);
}
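/*
  Typical session (illustrative):

    SET rocksdb_bulk_load = 1;
    -- load large, PK-ordered batches of rows
    SET rocksdb_bulk_load = 0;   -- invokes this update function, which calls
                                 -- finish_bulk_load() to finalize and ingest
                                 -- the last SST file for the transaction
*/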

void rdb_queue_save_stats_request()
{
  rdb_bg_thread.request_save_stats();
}

#ifdef MARIAROCKS_NOT_YET // MDEV-10976
void ha_rocksdb::rpl_before_delete_rows()
{
  m_in_rpl_delete_rows= true;
}

void ha_rocksdb::rpl_after_delete_rows()
{
  m_in_rpl_delete_rows= false;
}

void ha_rocksdb::rpl_before_update_rows()
{
  m_in_rpl_update_rows= true;
}

void ha_rocksdb::rpl_after_update_rows()
{
  m_in_rpl_update_rows= false;
}

/**
  @brief
  Whether Read Free Replication can be used or not. Returning true means
  Read Free Replication can be used. Read Free Replication can be used
  on UPDATE or DELETE row events, and the table must have a user-defined
  primary key.
*/
bool ha_rocksdb::use_read_free_rpl()
{
  return ((m_in_rpl_delete_rows || m_in_rpl_update_rows) &&
          !has_hidden_pk(table) && m_use_read_free_rpl);
}
#endif // MARIAROCKS_NOT_YET

double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows)
{
  if (index != table->s->primary_key) {
    /* Non covering index range scan */
    return(handler::read_time(index, ranges, rows));
  }
  return (rows/20.0) + 1;
}
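/*
  Worked example (informal): for a primary-key range scan estimated at 200
  rows, the formula above yields 200/20.0 + 1 = 11.0, i.e. roughly one cost
  unit per 20 rows plus a constant. This makes PK range scans look cheaper to
  the optimizer than the generic handler::read_time() estimate used for
  secondary indexes.
*/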

} // namespace myrocks

/*
  Register the storage engine plugin outside of myrocks namespace
  so that mysql_declare_plugin does not get confused when it does
  its name generation.
*/

struct st_mysql_storage_engine rocksdb_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };

mysql_declare_plugin(rocksdb_se)
{
  MYSQL_STORAGE_ENGINE_PLUGIN,            /* Plugin Type */
  &rocksdb_storage_engine,                /* Plugin Descriptor */
  "ROCKSDB",                              /* Plugin Name */
  "Monty Program Ab",                     /* Plugin Author */
  "RocksDB storage engine",               /* Plugin Description */
  PLUGIN_LICENSE_GPL,                     /* Plugin Licence */
  myrocks::rocksdb_init_func,             /* Plugin Entry Point */
  myrocks::rocksdb_done_func,             /* Plugin Deinitializer */
  0x0001,                                 /* version number (0.1) */
  myrocks::rocksdb_status_vars,           /* status variables */
  myrocks::rocksdb_system_variables,      /* system variables */
  nullptr,                                /* config options */
  0,                                      /* flags */
},
myrocks::rdb_i_s_cfstats,
myrocks::rdb_i_s_dbstats,
myrocks::rdb_i_s_perf_context,
myrocks::rdb_i_s_perf_context_global,
myrocks::rdb_i_s_cfoptions,
myrocks::rdb_i_s_global_info,
myrocks::rdb_i_s_ddl,
myrocks::rdb_i_s_index_file_map
mysql_declare_plugin_end;