mariadb/storage/innobase/dict/dict0mem.cc

/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2013, 2017, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
*****************************************************************************/
/******************************************************************//**
@file dict/dict0mem.cc
Data dictionary memory object creation
Created 1/8/1996 Heikki Tuuri
***********************************************************************/
#include "ha_prototypes.h"
#include <mysql_com.h>
#include "dict0mem.h"
#include "rem0rec.h"
#include "data0type.h"
#include "mach0data.h"
#include "dict0dict.h"
#include "fts0priv.h"
#include "ut0crc32.h"
#include "lock0lock.h"
#include "sync0sync.h"
#include <iostream>
#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
creating a table or index object */
/** System databases */
static const char* innobase_system_databases[] = {
"mysql/",
"information_schema/",
"performance_schema/",
NullS
};
/** The start of the table basename suffix for partitioned tables */
const char table_name_t::part_suffix[4]
#ifdef _WIN32
= "#p#";
#else
= "#P#";
#endif
/** An integer randomly initialized at startup, used to make a temporary
table name as unique as possible. */
static ib_uint32_t dict_temp_file_num;
/** Display an identifier.
@param[in,out] s output stream
@param[in] id_name SQL identifier (other than table name)
@return the output stream */
std::ostream&
operator<<(
std::ostream& s,
const id_name_t& id_name)
{
const char q = '`';
const char* c = id_name;
s << q;
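/* Escape any embedded quote characters by doubling them. */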
for (; *c != 0; c++) {
if (*c == q) {
s << *c;
}
s << *c;
}
s << q;
return(s);
}
/** Display a table name.
@param[in,out] s output stream
@param[in] table_name table name
@return the output stream */
std::ostream&
operator<<(
std::ostream& s,
const table_name_t& table_name)
{
return(s << ut_get_name(NULL, table_name.m_name));
}
/**********************************************************************//**
Creates a table memory object.
@return own: table object */
dict_table_t*
dict_mem_table_create(
/*==================*/
const char* name, /*!< in: table name */
ulint space, /*!< in: space where the clustered index of
the table is placed */
ulint n_cols, /*!< in: total number of columns including
virtual and non-virtual columns */
ulint n_v_cols,/*!< in: number of virtual columns */
ulint flags, /*!< in: table flags */
ulint flags2) /*!< in: table flags2 */
{
dict_table_t* table;
mem_heap_t* heap;
ut_ad(name);
ut_a(dict_tf2_is_valid(flags, flags2));
ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK));
heap = mem_heap_create(DICT_HEAP_SIZE);
table = static_cast<dict_table_t*>(
mem_heap_zalloc(heap, sizeof(*table)));
lock_table_lock_list_init(&table->locks);
UT_LIST_INIT(table->indexes, &dict_index_t::indexes);
table->heap = heap;
ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
table->flags = (unsigned int) flags;
table->flags2 = (unsigned int) flags2;
table->name.m_name = mem_strdup(name);
table->is_system_db = dict_mem_table_is_system(table->name.m_name);
table->space = (unsigned int) space;
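/* n_t_cols counts every column, including the system columns
(DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR) and the virtual columns;
n_cols excludes the virtual columns. */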
table->n_t_cols = unsigned(n_cols + DATA_N_SYS_COLS);
table->n_v_cols = (unsigned int) (n_v_cols);
table->n_cols = table->n_t_cols - table->n_v_cols;
table->cols = static_cast<dict_col_t*>(
mem_heap_alloc(heap, table->n_cols * sizeof(dict_col_t)));
table->v_cols = static_cast<dict_v_col_t*>(
mem_heap_alloc(heap, n_v_cols * sizeof(*table->v_cols)));
/* true means that the stats latch will be enabled:
dict_table_stats_lock() will not be a no-op. */
dict_table_stats_latch_create(table, true);
table->autoinc_lock = static_cast<ib_lock_t*>(
mem_heap_alloc(heap, lock_get_size()));
/* lazy creation of table autoinc latch */
dict_table_autoinc_create_lazy(table);
/* If the table has an FTS index or we are in the process
of building one, create the table->fts */
if (dict_table_has_fts_index(table)
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
table->fts = fts_create(table);
table->fts->cache = fts_cache_create(table);
} else {
table->fts = NULL;
}
new(&table->foreign_set) dict_foreign_set();
new(&table->referenced_set) dict_foreign_set();
return(table);
}
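/* A minimal usage sketch (illustrative only; the table name, column
types and flags below are assumptions, not taken from any caller):

	dict_table_t*	table = dict_mem_table_create(
		"test/t1", 0, 2, 0, 0, 0);

	dict_mem_table_add_col(table, table->heap, "id",
			       DATA_INT, DATA_NOT_NULL, 4);
	dict_mem_table_add_col(table, table->heap, "val",
			       DATA_VARCHAR, DATA_ENGLISH, 100);

	...

	dict_mem_table_free(table);
*/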
/****************************************************************//**
Free a table memory object. */
void
dict_mem_table_free(
/*================*/
dict_table_t* table) /*!< in: table */
{
ut_ad(table);
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
ut_d(table->cached = FALSE);
if (dict_table_has_fts_index(table)
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
if (table->fts) {
fts_optimize_remove_table(table);
fts_free(table);
}
}
dict_table_autoinc_destroy(table);
dict_mem_table_free_foreign_vcol_set(table);
dict_table_stats_latch_destroy(table);
table->foreign_set.~dict_foreign_set();
table->referenced_set.~dict_foreign_set();
ut_free(table->name.m_name);
table->name.m_name = NULL;
/* Clean up virtual index info structures that are registered
with virtual columns */
for (ulint i = 0; i < table->n_v_def; i++) {
dict_v_col_t* vcol
= dict_table_get_nth_v_col(table, i);
UT_DELETE(vcol->v_indexes);
}
if (table->s_cols != NULL) {
UT_DELETE(table->s_cols);
}
mem_heap_free(table->heap);
}
/****************************************************************//**
Append 'name' to 'col_names'. @see dict_table_t::col_names
@return new column names array */
static
const char*
dict_add_col_name(
/*==============*/
const char* col_names, /*!< in: existing column names, or
NULL */
ulint cols, /*!< in: number of existing columns */
const char* name, /*!< in: new column name */
mem_heap_t* heap) /*!< in: heap */
{
ulint old_len;
ulint new_len;
ulint total_len;
char* res;
ut_ad(!cols == !col_names);
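/* The column names are stored as a single buffer of
NUL-terminated strings, one name directly after another. */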
/* Find out length of existing array. */
if (col_names) {
const char* s = col_names;
ulint i;
for (i = 0; i < cols; i++) {
s += strlen(s) + 1;
}
old_len = s - col_names;
} else {
old_len = 0;
}
new_len = strlen(name) + 1;
total_len = old_len + new_len;
res = static_cast<char*>(mem_heap_alloc(heap, total_len));
if (old_len > 0) {
memcpy(res, col_names, old_len);
}
memcpy(res + old_len, name, new_len);
return(res);
}
/**********************************************************************//**
Adds a column definition to a table. */
void
dict_mem_table_add_col(
/*===================*/
dict_table_t* table, /*!< in: table */
mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
const char* name, /*!< in: column name, or NULL */
ulint mtype, /*!< in: main datatype */
ulint prtype, /*!< in: precise type */
ulint len) /*!< in: precision */
{
dict_col_t* col;
ulint i;
ut_ad(table);
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
ut_ad(!heap == !name);
ut_ad(!(prtype & DATA_VIRTUAL));
i = table->n_def++;
table->n_t_def++;
if (name) {
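/* When the last column is being added, allocate the complete
name array from the persistent table->heap instead of the
caller's temporary heap. */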
if (table->n_def == table->n_cols) {
heap = table->heap;
}
if (i && !table->col_names) {
/* All preceding column names are empty. */
char* s = static_cast<char*>(
mem_heap_zalloc(heap, table->n_def));
table->col_names = s;
}
table->col_names = dict_add_col_name(table->col_names,
i, name, heap);
}
col = dict_table_get_nth_col(table, i);
dict_mem_fill_column_struct(col, i, mtype, prtype, len);
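/* Remember the positions of the system versioning
ROW START and ROW END columns. */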
switch (prtype & DATA_VERSIONED) {
case DATA_VERS_START:
ut_ad(!table->vers_start);
table->vers_start = i;
break;
case DATA_VERS_END:
ut_ad(!table->vers_end);
table->vers_end = i;
}
}
/** Adds a virtual column definition to a table.
@param[in,out] table table
@param[in,out] heap temporary memory heap, or NULL. It is
used to store the name while not all columns
have been added yet; when all columns have been
added, the whole name array is copied to memory
allocated from table->heap
@param[in] name column name
@param[in] mtype main datatype
@param[in] prtype precise type
@param[in] len length
@param[in] pos position in a table
@param[in] num_base number of base columns
@return the virtual column definition */
dict_v_col_t*
dict_mem_table_add_v_col(
dict_table_t* table,
mem_heap_t* heap,
const char* name,
ulint mtype,
ulint prtype,
ulint len,
ulint pos,
ulint num_base)
{
dict_v_col_t* v_col;
ulint i;
ut_ad(table);
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
ut_ad(!heap == !name);
ut_ad(prtype & DATA_VIRTUAL);
i = table->n_v_def++;
table->n_t_def++;
if (name != NULL) {
if (table->n_v_def == table->n_v_cols) {
heap = table->heap;
}
if (i && !table->v_col_names) {
/* All preceding column names are empty. */
char* s = static_cast<char*>(
mem_heap_zalloc(heap, table->n_v_def));
table->v_col_names = s;
}
table->v_col_names = dict_add_col_name(table->v_col_names,
i, name, heap);
}
v_col = &table->v_cols[i];
dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len);
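/* v_pos is the position among the virtual columns only;
the column struct filled above records pos, the position
within the whole table. */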
v_col->v_pos = i;
if (num_base != 0) {
v_col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
table->heap, num_base * sizeof(
*v_col->base_col)));
} else {
v_col->base_col = NULL;
}
v_col->num_base = num_base;
/* Initialize the index list for virtual columns */
v_col->v_indexes = UT_NEW_NOKEY(dict_v_idx_list());
return(v_col);
}
/** Adds a stored column definition to a table.
@param[in] table table
@param[in] num_base number of base columns. */
void
dict_mem_table_add_s_col(
dict_table_t* table,
ulint num_base)
{
ulint i = table->n_def - 1;
dict_col_t* col = dict_table_get_nth_col(table, i);
dict_s_col_t s_col;
ut_ad(col != NULL);
if (table->s_cols == NULL) {
table->s_cols = UT_NEW_NOKEY(dict_s_col_list());
}
s_col.m_col = col;
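/* The stored column position also counts the virtual
columns defined so far. */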
s_col.s_pos = i + table->n_v_def;
if (num_base != 0) {
s_col.base_col = static_cast<dict_col_t**>(mem_heap_zalloc(
table->heap, num_base * sizeof(dict_col_t*)));
} else {
s_col.base_col = NULL;
}
s_col.num_base = num_base;
table->s_cols->push_back(s_col);
}
/**********************************************************************//**
Renames a column of a table in the data dictionary cache. */
static MY_ATTRIBUTE((nonnull))
void
dict_mem_table_col_rename_low(
/*==========================*/
dict_table_t* table, /*!< in/out: table */
unsigned i, /*!< in: column offset corresponding to s */
const char* to, /*!< in: new column name */
const char* s, /*!< in: pointer to table->col_names */
bool is_virtual)
/*!< in: if this is a virtual column */
{
char* t_col_names = const_cast<char*>(
is_virtual ? table->v_col_names : table->col_names);
ulint n_col = is_virtual ? table->n_v_def : table->n_def;
size_t from_len = strlen(s), to_len = strlen(to);
ut_ad(i < table->n_def || is_virtual);
ut_ad(i < table->n_v_def || !is_virtual);
ut_ad(from_len <= NAME_LEN);
ut_ad(to_len <= NAME_LEN);
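/* Keep a copy of the old name: s points into table->col_names,
which is modified below, while the old name is still needed
for adjusting the foreign key constraints. */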
char from[NAME_LEN];
strncpy(from, s, NAME_LEN);
if (from_len == to_len) {
/* The easy case: simply replace the column name in
table->col_names. */
strcpy(const_cast<char*>(s), to);
} else {
/* We need to adjust all affected index->field
pointers, as in dict_index_add_col(). First, copy
table->col_names. */
ulint prefix_len = s - t_col_names;
for (; i < n_col; i++) {
s += strlen(s) + 1;
}
ulint full_len = s - t_col_names;
char* col_names;
if (to_len > from_len) {
col_names = static_cast<char*>(
mem_heap_alloc(
table->heap,
full_len + to_len - from_len));
memcpy(col_names, t_col_names, prefix_len);
} else {
col_names = const_cast<char*>(t_col_names);
}
memcpy(col_names + prefix_len, to, to_len);
memmove(col_names + prefix_len + to_len,
t_col_names + (prefix_len + from_len),
full_len - (prefix_len + from_len));
/* Replace the field names in every index. */
for (dict_index_t* index = dict_table_get_first_index(table);
index != NULL;
index = dict_table_get_next_index(index)) {
ulint n_fields = dict_index_get_n_fields(index);
for (ulint i = 0; i < n_fields; i++) {
dict_field_t* field
= dict_index_get_nth_field(
index, i);
/* Skip the field if its virtual/non-virtual
type does not match is_virtual. */
if ((!is_virtual) !=
(!dict_col_is_virtual(field->col))) {
continue;
}
ulint name_ofs
= field->name - t_col_names;
if (name_ofs <= prefix_len) {
field->name = col_names + name_ofs;
} else {
ut_a(name_ofs < full_len);
field->name = col_names
+ name_ofs + to_len - from_len;
}
}
}
if (is_virtual) {
table->v_col_names = col_names;
} else {
table->col_names = col_names;
}
}
/* Virtual columns are not allowed in foreign keys. */
if (is_virtual) {
return;
}
dict_foreign_t* foreign;
/* Replace the field names in every foreign key constraint. */
for (dict_foreign_set::iterator it = table->foreign_set.begin();
it != table->foreign_set.end();
++it) {
foreign = *it;
if (foreign->foreign_index == NULL) {
/* We may get here when foreign_key_checks is set to 0
and we then try to rename a column and modify the
corresponding foreign key constraint. The index
would have been dropped; we have to find an equivalent
one. */
for (unsigned f = 0; f < foreign->n_fields; f++) {
if (strcmp(foreign->foreign_col_names[f], from)
== 0) {
char** rc = const_cast<char**>(
foreign->foreign_col_names
+ f);
if (to_len <= strlen(*rc)) {
memcpy(*rc, to, to_len + 1);
} else {
*rc = static_cast<char*>(
mem_heap_dup(
foreign->heap,
to,
to_len + 1));
}
}
}
dict_index_t* new_index = dict_foreign_find_index(
foreign->foreign_table, NULL,
foreign->foreign_col_names,
foreign->n_fields, NULL, true, false,
NULL, NULL, NULL);
/* There must be an equivalent index in this case. */
ut_ad(new_index != NULL);
foreign->foreign_index = new_index;
} else {
for (unsigned f = 0; f < foreign->n_fields; f++) {
/* These can point straight to
table->col_names, because the foreign key
constraints will be freed at the same time
as the table object. */
foreign->foreign_col_names[f]
= dict_index_get_nth_field(
foreign->foreign_index,
f)->name;
}
}
}
for (dict_foreign_set::iterator it = table->referenced_set.begin();
it != table->referenced_set.end();
++it) {
foreign = *it;
ut_ad(foreign->referenced_index != NULL);
for (unsigned f = 0; f < foreign->n_fields; f++) {
/* foreign->referenced_col_names[] need to be
copies, because the constraint may become
orphan when foreign_key_checks=0 and the
parent table is dropped. */
const char* col_name = dict_index_get_nth_field(
foreign->referenced_index, f)->name;
if (strcmp(foreign->referenced_col_names[f],
col_name)) {
char** rc = const_cast<char**>(
foreign->referenced_col_names + f);
size_t col_name_len_1 = strlen(col_name) + 1;
if (col_name_len_1 <= strlen(*rc) + 1) {
memcpy(*rc, col_name, col_name_len_1);
} else {
*rc = static_cast<char*>(
mem_heap_dup(
foreign->heap,
col_name,
col_name_len_1));
}
}
}
}
}
/**********************************************************************//**
Renames a column of a table in the data dictionary cache. */
void
dict_mem_table_col_rename(
/*======================*/
dict_table_t* table, /*!< in/out: table */
ulint nth_col,/*!< in: column index */
const char* from, /*!< in: old column name */
const char* to, /*!< in: new column name */
bool is_virtual)
/*!< in: if this is a virtual column */
{
const char* s = is_virtual ? table->v_col_names : table->col_names;
ut_ad((!is_virtual && nth_col < table->n_def)
|| (is_virtual && nth_col < table->n_v_def));
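/* Advance past the NUL-terminated names of the preceding
columns to locate the name of column nth_col. */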
for (ulint i = 0; i < nth_col; i++) {
size_t len = strlen(s);
ut_ad(len > 0);
s += len + 1;
}
/* This could fail if the data dictionaries are out of sync.
Proceed with the renaming anyway. */
ut_ad(!strcmp(from, s));
dict_mem_table_col_rename_low(table, static_cast<unsigned>(nth_col),
to, s, is_virtual);
}
/**********************************************************************//**
This function populates a dict_col_t memory structure with
supplied information. */
void
dict_mem_fill_column_struct(
/*========================*/
dict_col_t* column, /*!< out: column struct to be
filled */
ulint col_pos, /*!< in: column position */
ulint mtype, /*!< in: main data type */
ulint prtype, /*!< in: precise type */
ulint col_len) /*!< in: column length */
{
ulint mbminlen;
ulint mbmaxlen;
column->ind = (unsigned int) col_pos;
column->ord_part = 0;
column->max_prefix = 0;
column->mtype = (unsigned int) mtype;
column->prtype = (unsigned int) prtype;
column->len = (unsigned int) col_len;
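/* def_val is only set for instantly added columns;
UNIV_SQL_DEFAULT means that the column was not added
by instant ADD COLUMN. */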
column->def_val.data = NULL;
column->def_val.len = UNIV_SQL_DEFAULT;
dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);
dict_col_set_mbminmaxlen(column, mbminlen, mbmaxlen);
}
/**********************************************************************//**
Creates an index memory object.
@return own: index object */
dict_index_t*
dict_mem_index_create(
/*==================*/
const char* table_name, /*!< in: table name */
const char* index_name, /*!< in: index name */
ulint space, /*!< in: space where the index tree is
placed, ignored if the index is of
the clustered type */
ulint type, /*!< in: DICT_UNIQUE,
DICT_CLUSTERED, ... ORed */
ulint n_fields) /*!< in: number of fields */
{
dict_index_t* index;
mem_heap_t* heap;
ut_ad(table_name && index_name);
heap = mem_heap_create(DICT_HEAP_SIZE);
index = static_cast<dict_index_t*>(
mem_heap_zalloc(heap, sizeof(*index)));
dict_mem_fill_index_struct(index, heap, table_name, index_name,
space, type, n_fields);
dict_index_zip_pad_mutex_create_lazy(index);
if (type & DICT_SPATIAL) {
mutex_create(LATCH_ID_RTR_SSN_MUTEX, &index->rtr_ssn.mutex);
index->rtr_track = static_cast<rtr_info_track_t*>(
mem_heap_alloc(
heap,
sizeof(*index->rtr_track)));
mutex_create(LATCH_ID_RTR_ACTIVE_MUTEX,
&index->rtr_track->rtr_active_mutex);
index->rtr_track->rtr_active = UT_NEW_NOKEY(rtr_info_active());
}
return(index);
}
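/* A minimal usage sketch (illustrative only; the names, type and
field below are assumptions, not taken from any caller):

	dict_index_t*	index = dict_mem_index_create(
		"test/t1", "PRIMARY", 0,
		DICT_CLUSTERED | DICT_UNIQUE, 1);

	dict_mem_index_add_field(index, "id", 0);

	...

	dict_mem_index_free(index);
*/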
/**********************************************************************//**
Creates and initializes a foreign constraint memory object.
@return own: foreign constraint struct */
dict_foreign_t*
dict_mem_foreign_create(void)
/*=========================*/
{
dict_foreign_t* foreign;
mem_heap_t* heap;
DBUG_ENTER("dict_mem_foreign_create");
heap = mem_heap_create(100);
foreign = static_cast<dict_foreign_t*>(
mem_heap_zalloc(heap, sizeof(dict_foreign_t)));
foreign->heap = heap;
foreign->v_cols = NULL;
DBUG_PRINT("dict_mem_foreign_create", ("heap: %p", heap));
DBUG_RETURN(foreign);
}
/**********************************************************************//**
Sets the foreign_table_name_lookup pointer based on the value of
lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
will point to foreign_table_name. If 2, then another string is
allocated from foreign->heap and set to lower case. */
void
dict_mem_foreign_table_name_lookup_set(
/*===================================*/
dict_foreign_t* foreign, /*!< in/out: foreign struct */
ibool do_alloc) /*!< in: is an alloc needed */
{
if (innobase_get_lower_case_table_names() == 2) {
if (do_alloc) {
ulint len;
len = strlen(foreign->foreign_table_name) + 1;
foreign->foreign_table_name_lookup =
static_cast<char*>(
mem_heap_alloc(foreign->heap, len));
}
strcpy(foreign->foreign_table_name_lookup,
foreign->foreign_table_name);
innobase_casedn_str(foreign->foreign_table_name_lookup);
} else {
foreign->foreign_table_name_lookup
= foreign->foreign_table_name;
}
}
/**********************************************************************//**
Sets the referenced_table_name_lookup pointer based on the value of
lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
will point to referenced_table_name. If 2, then another string is
allocated from foreign->heap and set to lower case. */
void
dict_mem_referenced_table_name_lookup_set(
/*======================================*/
dict_foreign_t* foreign, /*!< in/out: foreign struct */
ibool do_alloc) /*!< in: is an alloc needed */
{
if (innobase_get_lower_case_table_names() == 2) {
if (do_alloc) {
ulint len;
len = strlen(foreign->referenced_table_name) + 1;
foreign->referenced_table_name_lookup =
static_cast<char*>(
mem_heap_alloc(foreign->heap, len));
}
strcpy(foreign->referenced_table_name_lookup,
foreign->referenced_table_name);
innobase_casedn_str(foreign->referenced_table_name_lookup);
} else {
foreign->referenced_table_name_lookup
= foreign->referenced_table_name;
}
}
/** Fill the virtual column set with virtual column information
present in the given virtual index.
@param[in] index virtual index
@param[out] v_cols virtual column set. */
static
void
dict_mem_fill_vcol_has_index(
const dict_index_t* index,
dict_vcol_set** v_cols)
{
for (ulint i = 0; i < index->table->n_v_cols; i++) {
dict_v_col_t* v_col = dict_table_get_nth_v_col(
index->table, i);
if (!v_col->m_col.ord_part) {
continue;
}
dict_v_idx_list::iterator it;
for (it = v_col->v_indexes->begin();
it != v_col->v_indexes->end(); ++it) {
dict_v_idx_t v_idx = *it;
if (v_idx.index != index) {
continue;
}
if (*v_cols == NULL) {
*v_cols = UT_NEW_NOKEY(dict_vcol_set());
}
(*v_cols)->insert(v_col);
}
}
}
/** Fill the virtual column set with the virtual column of the index
if the index contains given column name.
@param[in] col_name column name
@param[in] table innodb table object
@param[out] v_cols set of virtual column information. */
static
void
dict_mem_fill_vcol_from_v_indexes(
const char* col_name,
const dict_table_t* table,
dict_vcol_set** v_cols)
{
/* A virtual column cannot be part of the PRIMARY KEY,
so start with the first secondary index. */
for (dict_index_t* index = dict_table_get_next_index(
dict_table_get_first_index(table));
index;
index = dict_table_get_next_index(index)) {
/* Skip the index if it has a newly added
virtual column, because its field name is NULL.
The virtual column set will be refreshed later,
when the table is reloaded. */
if (!dict_index_has_virtual(index)
|| index->has_new_v_col) {
continue;
}
for (ulint i = 0; i < index->n_fields; i++) {
dict_field_t* field =
dict_index_get_nth_field(index, i);
if (strcmp(field->name, col_name) == 0) {
dict_mem_fill_vcol_has_index(
index, v_cols);
}
}
}
}
/** Fill the virtual column set with virtual columns which have base columns
as the given col_name
@param[in] col_name column name
@param[in] table table object
@param[out] v_cols set of virtual columns. */
static
void
dict_mem_fill_vcol_set_for_base_col(
const char* col_name,
const dict_table_t* table,
dict_vcol_set** v_cols)
{
for (ulint i = 0; i < table->n_v_cols; i++) {
dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i);
if (!v_col->m_col.ord_part) {
continue;
}
for (ulint j = 0; j < v_col->num_base; j++) {
if (strcmp(col_name, dict_table_get_col_name(
table,
v_col->base_col[j]->ind)) == 0) {
if (*v_cols == NULL) {
*v_cols = UT_NEW_NOKEY(dict_vcol_set());
}
(*v_cols)->insert(v_col);
}
}
}
}
/** Fill the set of dependent virtual columns.
A virtual column is dependent if
1) the FK is defined on a base column of the virtual column, or
2) the FK is defined on a column that is part of a virtual index.
@param[in,out] foreign foreign key information. */
void
dict_mem_foreign_fill_vcol_set(
dict_foreign_t* foreign)
{
ulint type = foreign->type;
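/* If the constraint has no referential action flags
(ON DELETE / ON UPDATE), no cascading operations can occur
and the virtual column set is not needed. */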
if (type == 0) {
return;
}
for (ulint i = 0; i < foreign->n_fields; i++) {
/** FK can be present on base columns
of virtual columns. */
dict_mem_fill_vcol_set_for_base_col(
foreign->foreign_col_names[i],
foreign->foreign_table,
&foreign->v_cols);
/** FK can be present on the columns
which are part of a virtual index. */
dict_mem_fill_vcol_from_v_indexes(
foreign->foreign_col_names[i],
foreign->foreign_table,
&foreign->v_cols);
}
}
/** Fill the virtual column set in each foreign key constraint present in the table.
@param[in,out] table innodb table object. */
void
dict_mem_table_fill_foreign_vcol_set(
dict_table_t* table)
{
dict_foreign_set fk_set = table->foreign_set;
dict_foreign_t* foreign;
dict_foreign_set::iterator it;
for (it = fk_set.begin(); it != fk_set.end(); ++it) {
foreign = *it;
dict_mem_foreign_fill_vcol_set(foreign);
}
}
/** Free the vcol_set from all foreign key constraints on the table.
@param[in,out] table innodb table object. */
void
dict_mem_table_free_foreign_vcol_set(
dict_table_t* table)
{
dict_foreign_set fk_set = table->foreign_set;
dict_foreign_t* foreign;
dict_foreign_set::iterator it;
for (it = fk_set.begin(); it != fk_set.end(); ++it) {
foreign = *it;
if (foreign->v_cols != NULL) {
UT_DELETE(foreign->v_cols);
foreign->v_cols = NULL;
}
}
}
/**********************************************************************//**
Adds a field definition to an index. NOTE: does not take a copy
of the column name if the field is a column. The memory occupied
by the column name may be released only after publishing the index. */
void
dict_mem_index_add_field(
/*=====================*/
dict_index_t* index, /*!< in: index */
const char* name, /*!< in: column name */
ulint prefix_len) /*!< in: 0 or the column prefix length
in a MySQL index like
INDEX (textcol(25)) */
{
dict_field_t* field;
ut_ad(index);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
index->n_def++;
field = dict_index_get_nth_field(index, index->n_def - 1);
field->name = name;
field->prefix_len = (unsigned int) prefix_len;
}
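
/* Illustrative sketch (not compiled): adding the fields of an index
declared in SQL as INDEX(textcol(25), intcol).  The index object and the
lifetime of the column name strings are assumed to be managed by the
caller, as required by the note above; "textcol" and "intcol" are made-up
names.

	dict_mem_index_add_field(index, "textcol", 25);
	dict_mem_index_add_field(index, "intcol", 0);

A prefix_len of 0 means that the whole column value is indexed. */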
/**********************************************************************//**
Frees an index memory object. */
void
dict_mem_index_free(
/*================*/
dict_index_t* index) /*!< in: index */
{
ut_ad(index);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
dict_index_zip_pad_mutex_destroy(index);
if (dict_index_is_spatial(index)) {
rtr_info_active::iterator it;
rtr_info_t* rtr_info;
for (it = index->rtr_track->rtr_active->begin();
it != index->rtr_track->rtr_active->end(); ++it) {
rtr_info = *it;
rtr_info->index = NULL;
}
mutex_destroy(&index->rtr_ssn.mutex);
mutex_destroy(&index->rtr_track->rtr_active_mutex);
UT_DELETE(index->rtr_track->rtr_active);
}
mem_heap_free(index->heap);
}
/** Create a temporary tablename like "#sql-ibtid-inc" where
tid = the Table ID
inc = a randomly initialized number that is incremented for each file
The table ID is a 64 bit integer, can use up to 20 digits, and is
initialized at bootstrap. The second number is 32 bits, can use up to 10
digits, and is initialized at startup to a randomly distributed number.
It is hoped that the combination of these two numbers will provide a
reasonably unique temporary file name.
@param[in] heap A memory heap
@param[in] dbtab Table name in the form database/table name
@param[in] id Table id
@return A unique temporary tablename suitable for InnoDB use */
char*
dict_mem_create_temporary_tablename(
mem_heap_t* heap,
const char* dbtab,
table_id_t id)
{
size_t size;
char* name;
const char* dbend = strchr(dbtab, '/');
ut_ad(dbend);
size_t dblen = dbend - dbtab + 1;
/* Increment a randomly initialized number for each temp file. */
my_atomic_add32((int32*) &dict_temp_file_num, 1);
size = dblen + (sizeof(TEMP_FILE_PREFIX) + 3 + 20 + 1 + 10);
name = static_cast<char*>(mem_heap_alloc(heap, size));
memcpy(name, dbtab, dblen);
snprintf(name + dblen, size - dblen,
TEMP_FILE_PREFIX_INNODB UINT64PF "-" UINT32PF,
id, dict_temp_file_num);
return(name);
}
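
/* Illustrative usage sketch (not compiled; the table id and the counter
value below are made up):

	mem_heap_t*	heap = mem_heap_create(100);
	char*		tmp_name = dict_mem_create_temporary_tablename(
		heap, "test/t1", 1234);
	(tmp_name now looks like "test/#sql-ib1234-2954632047")
	mem_heap_free(heap);

The returned name remains valid only as long as the heap is alive. */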
/** Initialize dict memory variables */
void
dict_mem_init(void)
{
/* Initialize a randomly distributed temporary file number */
ib_uint32_t now = static_cast<ib_uint32_t>(ut_time());
const byte* buf = reinterpret_cast<const byte*>(&now);
dict_temp_file_num = ut_crc32(buf, sizeof(now));
DBUG_PRINT("dict_mem_init",
("Starting Temporary file number is " UINT32PF,
dict_temp_file_num));
}
/** Validate the search order in the foreign key set.
@param[in] fk_set the foreign key set to be validated
@return true if search order is fine in the set, false otherwise. */
bool
dict_foreign_set_validate(
const dict_foreign_set& fk_set)
{
dict_foreign_not_exists not_exists(fk_set);
dict_foreign_set::const_iterator it = std::find_if(
fk_set.begin(), fk_set.end(), not_exists);
if (it == fk_set.end()) {
return(true);
}
dict_foreign_t* foreign = *it;
std::cerr << "Foreign key lookup failed: " << *foreign;
std::cerr << fk_set;
ut_ad(0);
return(false);
}
/** Validate the search order in the foreign key sets of the table
(foreign_set and referenced_set).
@param[in] table table whose foreign key sets are to be validated
@return true if foreign key sets are fine, false otherwise. */
bool
dict_foreign_set_validate(
const dict_table_t& table)
{
return(dict_foreign_set_validate(table.foreign_set)
&& dict_foreign_set_validate(table.referenced_set));
}
std::ostream&
operator<< (std::ostream& out, const dict_foreign_t& foreign)
{
out << "[dict_foreign_t: id='" << foreign.id << "'";
if (foreign.foreign_table_name != NULL) {
out << ",for: '" << foreign.foreign_table_name << "'";
}
out << "]";
return(out);
}
std::ostream&
operator<< (std::ostream& out, const dict_foreign_set& fk_set)
{
out << "[dict_foreign_set:";
std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out));
out << "]" << std::endl;
return(out);
}
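
/* For example, streaming a set that contains a single constraint with
id "fk_1" on the table "test/child" produces output of the form
[dict_foreign_set:[dict_foreign_t: id='fk_1',for: 'test/child']]
followed by a newline (the names here are made up). */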
/****************************************************************//**
Determines if a table belongs to a system database
@return true if the table belongs to a system database, false otherwise */
bool
dict_mem_table_is_system(
/*================*/
char *name) /*!< in: table name */
{
ut_ad(name);
	/* The table name has the format database/table;
	some system tables lack the '/' and are of the form SYS_* */
if (strchr(name, '/')) {
size_t table_len = strlen(name);
const char *system_db;
int i = 0;
while ((system_db = innobase_system_databases[i++])
&& (system_db != NullS)) {
size_t len = strlen(system_db);
if (table_len > len && !strncmp(name, system_db, len)) {
return true;
}
}
return false;
} else {
return true;
}
}
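
/* Illustrative examples (the exact list of system databases comes from
innobase_system_databases): a name without '/', such as "SYS_FOREIGN",
is treated as a system table; "mysql/innodb_table_stats" matches the
"mysql" entry and is also reported as a system table, whereas an
ordinary user table such as "test/t1" is not. */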
/** Adjust clustered index metadata for instant ADD COLUMN.
@param[in]	instant	clustered index definition after instant ADD COLUMN */
inline void dict_index_t::instant_add_field(const dict_index_t& instant)
{
DBUG_ASSERT(is_clust());
DBUG_ASSERT(instant.is_clust());
DBUG_ASSERT(!instant.is_instant());
DBUG_ASSERT(n_def == n_fields);
DBUG_ASSERT(instant.n_def == instant.n_fields);
DBUG_ASSERT(type == instant.type);
DBUG_ASSERT(trx_id_offset == instant.trx_id_offset);
DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols);
DBUG_ASSERT(n_uniq == instant.n_uniq);
DBUG_ASSERT(instant.n_fields > n_fields);
DBUG_ASSERT(instant.n_def > n_def);
DBUG_ASSERT(instant.n_nullable >= n_nullable);
DBUG_ASSERT(instant.n_core_fields >= n_core_fields);
DBUG_ASSERT(instant.n_core_null_bytes >= n_core_null_bytes);
n_fields = instant.n_fields;
n_def = instant.n_def;
n_nullable = instant.n_nullable;
fields = static_cast<dict_field_t*>(
mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields));
ut_d(unsigned n_null = 0);
for (unsigned i = 0; i < n_fields; i++) {
DBUG_ASSERT(fields[i].same(instant.fields[i]));
const dict_col_t* icol = instant.fields[i].col;
DBUG_ASSERT(!icol->is_virtual());
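		/* Map the field to the column object of this index's
		table.  The column ordinal icol - instant.table->cols
		is the same in both definitions, because
		instant_add_column() has already installed the
		enlarged column array in this->table. */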
dict_col_t* col = fields[i].col = &table->cols[
icol - instant.table->cols];
fields[i].name = col->name(*table);
ut_d(n_null += col->is_nullable());
}
ut_ad(n_null == n_nullable);
}
/** Adjust metadata for instant ADD COLUMN.
@param[in] table table definition after instant ADD COLUMN */
void dict_table_t::instant_add_column(const dict_table_t& table)
{
DBUG_ASSERT(!table.cached);
DBUG_ASSERT(table.n_def == table.n_cols);
DBUG_ASSERT(table.n_t_def == table.n_t_cols);
DBUG_ASSERT(n_def == n_cols);
DBUG_ASSERT(n_t_def == n_t_cols);
DBUG_ASSERT(table.n_cols > n_cols);
ut_ad(mutex_own(&dict_sys->mutex));
const char* end = table.col_names;
for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1;
col_names = static_cast<char*>(mem_heap_dup(heap, table.col_names,
end - table.col_names));
const dict_col_t* const old_cols = cols;
const dict_col_t* const old_cols_end = cols + n_cols;
cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols,
table.n_cols
* sizeof *cols));
/* Preserve the default values of previously instantly
added columns. */
for (unsigned i = n_cols - DATA_N_SYS_COLS; i--; ) {
cols[i].def_val = old_cols[i].def_val;
}
/* Copy the new default values to this->heap. */
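	/* The loop index i counts columns of the new definition
	starting from the old n_cols; because the DATA_N_SYS_COLS
	system columns sit at the end of the array, the instantly
	added user column for i is cols[i - DATA_N_SYS_COLS]. */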
for (unsigned i = n_cols; i < table.n_cols; i++) {
dict_col_t& c = cols[i - DATA_N_SYS_COLS];
DBUG_ASSERT(c.is_instant());
if (c.def_val.len == 0) {
c.def_val.data = field_ref_zero;
} else if (const void*& d = c.def_val.data) {
d = mem_heap_dup(heap, d, c.def_val.len);
} else {
DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL);
}
}
const unsigned old_n_cols = n_cols;
const unsigned n_add = table.n_cols - n_cols;
n_t_def += n_add;
n_t_cols += n_add;
n_cols = table.n_cols;
n_def = n_cols;
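	/* The base column pointers of the virtual columns still point
	into the old cols[] array; rebase them to the newly
	allocated array. */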
for (unsigned i = n_v_def; i--; ) {
const dict_v_col_t& v = v_cols[i];
for (ulint n = v.num_base; n--; ) {
dict_col_t*& base = v.base_col[n];
if (!base->is_virtual()) {
DBUG_ASSERT(base >= old_cols);
size_t n = size_t(base - old_cols);
DBUG_ASSERT(n + DATA_N_SYS_COLS < old_n_cols);
base = &cols[n];
}
}
}
dict_index_t* index = dict_table_get_first_index(this);
index->instant_add_field(*dict_table_get_first_index(&table));
while ((index = dict_table_get_next_index(index)) != NULL) {
for (unsigned i = 0; i < index->n_fields; i++) {
dict_field_t& field = index->fields[i];
if (field.col < old_cols
|| field.col >= old_cols_end) {
DBUG_ASSERT(field.col->is_virtual());
} else {
/* Secondary indexes may contain user
columns and DB_ROW_ID (if there is
GEN_CLUST_INDEX instead of PRIMARY KEY),
but not DB_TRX_ID,DB_ROLL_PTR. */
DBUG_ASSERT(field.col >= old_cols);
size_t n = size_t(field.col - old_cols);
DBUG_ASSERT(n + DATA_N_SYS_COLS <= old_n_cols);
if (n + DATA_N_SYS_COLS >= old_n_cols) {
/* Replace DB_ROW_ID */
n += n_add;
}
field.col = &cols[n];
DBUG_ASSERT(!field.col->is_virtual());
field.name = field.col->name(*this);
}
}
}
}
/** Roll back instant_add_column().
@param[in] old_n_cols original n_cols
@param[in] old_cols original cols
@param[in] old_col_names original col_names */
void
dict_table_t::rollback_instant(
unsigned old_n_cols,
dict_col_t* old_cols,
const char* old_col_names)
{
ut_ad(mutex_own(&dict_sys->mutex));
dict_index_t* index = indexes.start;
/* index->is_instant() does not necessarily hold here, because
the table may have been emptied */
DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS);
DBUG_ASSERT(n_cols >= old_n_cols);
DBUG_ASSERT(n_cols == n_def);
DBUG_ASSERT(index->n_def == index->n_fields);
const unsigned n_remove = n_cols - old_n_cols;
for (unsigned i = index->n_fields - n_remove; i < index->n_fields;
i++) {
index->n_nullable -= index->fields[i].col->is_nullable();
}
index->n_fields -= n_remove;
index->n_def = index->n_fields;
if (index->n_core_fields > index->n_fields) {
index->n_core_fields = index->n_fields;
index->n_core_null_bytes = UT_BITS_IN_BYTES(index->n_nullable);
}
const dict_col_t* const new_cols = cols;
const dict_col_t* const new_cols_end = cols + n_cols;
cols = old_cols;
col_names = old_col_names;
n_cols = old_n_cols;
n_def = old_n_cols;
n_t_def -= n_remove;
n_t_cols -= n_remove;
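	/* Point the base columns of the virtual columns back into the
	restored original cols[] array. */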
for (unsigned i = n_v_def; i--; ) {
const dict_v_col_t& v = v_cols[i];
for (ulint n = v.num_base; n--; ) {
dict_col_t*& base = v.base_col[n];
if (!base->is_virtual()) {
base = &cols[base - new_cols];
}
}
}
do {
for (unsigned i = 0; i < index->n_fields; i++) {
dict_field_t& field = index->fields[i];
if (field.col < new_cols
|| field.col >= new_cols_end) {
DBUG_ASSERT(field.col->is_virtual());
} else {
DBUG_ASSERT(field.col >= new_cols);
size_t n = size_t(field.col - new_cols);
2017-10-06 07:00:05 +03:00
				DBUG_ASSERT(n <= n_cols);
				if (n + DATA_N_SYS_COLS >= n_cols) {
					n -= n_remove;
				}
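				/* Re-point the field at the corresponding
				column of this table and refresh its name. */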
				field.col = &cols[n];
				DBUG_ASSERT(!field.col->is_virtual());
				field.name = field.col->name(*this);
			}
		}
	} while ((index = dict_table_get_next_index(index)) != NULL);
}
/** Trim the instantly added columns when an insert into SYS_COLUMNS
is rolled back during ALTER TABLE or recovery.
@param[in]	n	number of surviving non-system columns */
void dict_table_t::rollback_instant(unsigned n)
{
	ut_ad(mutex_own(&dict_sys->mutex));
	dict_index_t* index = indexes.start;
	DBUG_ASSERT(index->is_instant());
	DBUG_ASSERT(index->n_def == index->n_fields);
	DBUG_ASSERT(n_cols == n_def);
	DBUG_ASSERT(n >= index->n_uniq);
	DBUG_ASSERT(n_cols > n + DATA_N_SYS_COLS);
	const unsigned n_remove = n_cols - n - DATA_N_SYS_COLS;
	char* names = const_cast<char*>(dict_table_get_col_name(this, n));
	const char* sys = names;
	for (unsigned i = n_remove; i--; ) {
		sys += strlen(sys) + 1;
	}
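	/* 'names' points to the name of the first instantly added column;
	'sys' points to the names of the system columns that follow the
	doomed column names. */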
	static const char system[] = "DB_ROW_ID\0DB_TRX_ID\0DB_ROLL_PTR";
	DBUG_ASSERT(!memcmp(sys, system, sizeof system));
	for (unsigned i = index->n_fields - n_remove; i < index->n_fields;
	     i++) {
		index->n_nullable -= index->fields[i].col->is_nullable();
	}
	index->n_fields -= n_remove;
	index->n_def = index->n_fields;
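	/* Shift the system columns and their names down, over the
	removed instantly added columns. */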
	memmove(names, sys, sizeof system);
	memmove(cols + n, cols + n_cols - DATA_N_SYS_COLS,
		DATA_N_SYS_COLS * sizeof *cols);
	n_cols -= n_remove;
	n_def = n_cols;
	n_t_cols -= n_remove;
	n_t_def -= n_remove;
	for (unsigned i = DATA_N_SYS_COLS; i--; ) {
		cols[n_cols - i].ind--;
	}
	if (dict_index_is_auto_gen_clust(index)) {
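		/* The clustered index is GEN_CLUST_INDEX(DB_ROW_ID):
		re-attach its fields to the relocated system columns. */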
		DBUG_ASSERT(index->n_uniq == 1);
		dict_field_t* field = index->fields;
		field->name = sys;
		field->col = dict_table_get_sys_col(this, DATA_ROW_ID);
		field++;
		field->name = sys + sizeof "DB_ROW_ID";
		field->col = dict_table_get_sys_col(this, DATA_TRX_ID);
		field++;
		field->name = sys + sizeof "DB_ROW_ID\0DB_TRX_ID";
		field->col = dict_table_get_sys_col(this, DATA_ROLL_PTR);
		/* Replace the DB_ROW_ID column in secondary indexes. */
		while ((index = dict_table_get_next_index(index)) != NULL) {
			field = &index->fields[index->n_fields - 1];
			DBUG_ASSERT(field->col->mtype == DATA_SYS);
			DBUG_ASSERT(field->col->prtype
				    == DATA_NOT_NULL + DATA_TRX_ID);
			field->col--;
			field->name = sys;
		}
		return;
	}
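	/* The clustered index is the PRIMARY KEY: DB_TRX_ID and
	DB_ROLL_PTR immediately follow the unique key fields. */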
	dict_field_t* field = &index->fields[index->n_uniq];
	field->name = sys + sizeof "DB_ROW_ID";
	field->col = dict_table_get_sys_col(this, DATA_TRX_ID);
	field++;
	field->name = sys + sizeof "DB_ROW_ID\0DB_TRX_ID";
	field->col = dict_table_get_sys_col(this, DATA_ROLL_PTR);
}
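/* Worked example (hypothetical schema, for illustration only):
after CREATE TABLE t(a INT PRIMARY KEY, b INT) and an instant
ALTER TABLE t ADD COLUMN c INT, the column array is
	cols[] = {a, b, c, DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR}	(n_cols=6).
If the SYS_COLUMNS insert for 'c' is rolled back, rollback_instant(2)
computes n_remove = 6 - 2 - DATA_N_SYS_COLS = 1, shifts the three
system columns and their names down over 'c', and leaves
	cols[] = {a, b, DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR}	(n_cols=5),
with the clustered index trimmed back to its original fields. */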
/** Check if record in clustered index is historical row.
@param[in]	rec	clustered row
@param[in]	offsets	offsets
@return true if row is historical */
bool
dict_index_t::vers_history_row(
	const rec_t*	rec,
	const ulint*	offsets)
{
	ut_a(is_clust());
	ulint len;
	dict_col_t& col = table->cols[table->vers_end];
	ut_ad(col.vers_sys_end());
	ulint nfield = dict_col_get_clust_pos(&col, this);
	const byte* data = rec_get_nth_field(rec, offsets, nfield, &len);
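	/* A current row stores the maximum value in the row_end column;
	any other value identifies a history row. */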
	if (col.mtype == DATA_FIXBINARY) {
		ut_ad(len == sizeof timestamp_max_bytes);
		return 0 != memcmp(data, timestamp_max_bytes, len);
	} else {
		ut_ad(col.mtype == DATA_INT);
		ut_ad(len == sizeof trx_id_max_bytes);
		return 0 != memcmp(data, trx_id_max_bytes, len);
	}
	ut_ad(0);
	return false;
}
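/* Usage sketch (hypothetical caller): with a latched clustered index
record, the offsets are computed first and the result then tested, e.g.

	mem_heap_t*	heap = NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets = offsets_;
	rec_offs_init(offsets_);
	offsets = rec_get_offsets(rec, clust_index, offsets, true,
				  ULINT_UNDEFINED, &heap);
	const bool	historical
		= clust_index->vers_history_row(rec, offsets);
	if (heap) {
		mem_heap_free(heap);
	}
*/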
/** Check if record in secondary index is historical row.
@param[in]	rec	record in a secondary index
@param[out]	history_row	true if row is historical
@return true on error */
bool
dict_index_t::vers_history_row(
	const rec_t*	rec,
	bool&		history_row)
{
	ut_ad(!is_clust());
	bool error = false;
	mem_heap_t* heap = NULL;
	dict_index_t* clust_index = NULL;
	ulint offsets_[REC_OFFS_NORMAL_SIZE];
	ulint* offsets = offsets_;
	rec_offs_init(offsets_);
	mtr_t mtr;
	mtr.start();
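	/* The history flag is stored only in the clustered index
	(the row_end column): look up the matching clustered record. */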
	rec_t* clust_rec =
		row_get_clust_rec(BTR_SEARCH_LEAF, rec, this, &clust_index,
				  &mtr);
	if (clust_rec) {
		offsets = rec_get_offsets(clust_rec, clust_index, offsets,
					  true, ULINT_UNDEFINED, &heap);
		history_row = clust_index->vers_history_row(clust_rec,
							    offsets);
	} else {
		ib::error() << "foreign constraints: secondary index is out of "
			"sync";
		ut_ad(!"secondary index is out of sync");
		error = true;
	}
	mtr.commit();
	if (heap) {
		mem_heap_free(heap);
	}
	return(error);
}
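/* Usage sketch (hypothetical caller): the return value reports a lookup
failure, while the history status is returned through the out parameter:

	bool	history_row = false;
	if (sec_index->vers_history_row(rec, history_row)) {
		// the secondary index was out of sync; handle the error
	} else if (history_row) {
		// rec corresponds to a history (non-current) row version
	}
*/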