2014-02-26 19:11:54 +01:00
|
|
|
/*****************************************************************************
|
|
|
|
|
2017-10-25 21:35:33 +03:00
|
|
|
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
|
2014-02-26 19:11:54 +01:00
|
|
|
Copyright (c) 2012, Facebook Inc.
|
MDEV-15563: Instant ROW_FORMAT=REDUNDANT column extension
This was developed by Aleksey Midenkov based on my design.
In the original InnoDB storage format (that was retroactively named
ROW_FORMAT=REDUNDANT in MySQL 5.0.3), the length of each index field
is stored explicitly.
Because of this, we can and now will allow instant conversion from
VARCHAR to CHAR or VARBINARY to BINARY of equal or greater size,
as well as instant conversion of TINYINT to SMALLINT to MEDIUMINT
to INT to BIGINT (while not changing between signed and unsigned).
Theoretically, we could allow changing from an unsigned integer to
a bigger unsigned integer, as well as changing CHAR to VARCHAR, but
that would require additional metadata and conversions whenever
reading old records.
Field_str::is_equal(), Field_varstring::is_equal(), Field_num::is_equal():
Return the new result IS_EQUAL_PACK_LENGTH_EXT if the table advertises
HA_EXTENDED_TYPES_CONVERSION capability and we are considering the
above-mentioned conversions.
ALTER_COLUMN_EQUAL_PACK_LENGTH_EXT: A new ALTER TABLE flag, similar
to ALTER_COLUMN_EQUAL_PACK_LENGTH but requiring conversions when
reading the data. The Field::is_equal() result IS_EQUAL_PACK_LENGTH_EXT
will map to this flag.
dtype_get_fixed_size_low(): For BINARY, CHAR and integer columns
in ROW_FORMAT=REDUNDANT, return 0 (variable length) from now on.
dtype_get_sql_null_size(): Keep returning the current size for
BINARY, CHAR and integer columns, so that in ROW_FORMAT=REDUNDANT
it will remain possible to update in place between NULL and NOT NULL
values.
btr_index_rec_validate(): Relax a CHECK TABLE length check for
ROW_FORMAT=REDUNDANT tables.
btr_cur_instant_init_low(): No longer trust fixed_len
for ROW_FORMAT=REDUNDANT tables.
We cannot rely on fixed_len anymore because the record can have shorter
length from before instant extension. Note that importing such tablespace
into earlier MariaDB versions produces ER_TABLE_SCHEMA_MISMATCH when
using a .cfg file.
2019-02-13 17:39:05 +02:00
|
|
|
Copyright (c) 2013, 2019, MariaDB Corporation.
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
|
|
Foundation; version 2 of the License.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
2019-05-11 18:08:32 +03:00
|
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
/**************************************************//**
|
|
|
|
@file include/dict0mem.h
|
|
|
|
Data dictionary memory object creation
|
|
|
|
|
|
|
|
Created 1/8/1996 Heikki Tuuri
|
|
|
|
*******************************************************/
|
|
|
|
|
|
|
|
#ifndef dict0mem_h
|
|
|
|
#define dict0mem_h
|
|
|
|
|
|
|
|
#include "data0type.h"
|
|
|
|
#include "mem0mem.h"
|
|
|
|
#include "row0types.h"
|
|
|
|
#include "rem0types.h"
|
|
|
|
#include "btr0types.h"
|
2016-12-30 15:04:10 +02:00
|
|
|
#include "lock0types.h"
|
|
|
|
#include "que0types.h"
|
|
|
|
#include "sync0rw.h"
|
2014-02-26 19:11:54 +01:00
|
|
|
#include "ut0mem.h"
|
|
|
|
#include "ut0rnd.h"
|
|
|
|
#include "ut0byte.h"
|
|
|
|
#include "hash0hash.h"
|
|
|
|
#include "trx0types.h"
|
|
|
|
#include "fts0fts.h"
|
2016-08-12 11:17:45 +03:00
|
|
|
#include "buf0buf.h"
|
|
|
|
#include "gis0type.h"
|
2014-06-09 18:16:00 +02:00
|
|
|
#include "os0once.h"
|
2015-08-16 09:53:27 +03:00
|
|
|
#include "fil0fil.h"
|
|
|
|
#include "fil0crypt.h"
|
2018-11-16 19:36:04 +02:00
|
|
|
#include <sql_const.h>
|
2014-09-11 10:13:35 +02:00
|
|
|
#include <set>
|
|
|
|
#include <algorithm>
|
2014-11-18 17:41:12 +01:00
|
|
|
#include <iterator>
|
2015-03-03 12:39:42 +01:00
|
|
|
#include <ostream>
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/* Forward declaration. */
|
|
|
|
struct ib_rbt_t;
|
|
|
|
|
|
|
|
/** Type flags of an index: OR'ing of the flags is allowed to define a
|
|
|
|
combination of types */
|
|
|
|
/* @{ */
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_CLUSTERED 1 /*!< clustered index; for other than
|
|
|
|
auto-generated clustered indexes,
|
|
|
|
also DICT_UNIQUE will be set */
|
2014-02-26 19:11:54 +01:00
|
|
|
#define DICT_UNIQUE 2 /*!< unique index */
|
|
|
|
#define DICT_IBUF 8 /*!< insert buffer tree */
|
|
|
|
#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag
|
|
|
|
in SYS_INDEXES.TYPE */
|
|
|
|
#define DICT_FTS 32 /* FTS index; can't be combined with the
|
|
|
|
other flags */
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_SPATIAL 64 /* SPATIAL index; can't be combined with the
|
|
|
|
other flags */
|
|
|
|
#define DICT_VIRTUAL 128 /* Index on Virtual column */
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_IT_BITS 8 /*!< number of bits used for
|
2014-02-26 19:11:54 +01:00
|
|
|
SYS_INDEXES.TYPE */
|
|
|
|
/* @} */
|
|
|
|
|
|
|
|
#if 0 /* not implemented, retained for history */
|
|
|
|
/** Types for a table object */
|
|
|
|
#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */
|
|
|
|
#define DICT_TABLE_CLUSTER_MEMBER 2
|
|
|
|
#define DICT_TABLE_CLUSTER 3 /* this means that the table is
|
|
|
|
really a cluster definition */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Table and tablespace flags are generally not used for the Antelope file
|
|
|
|
format except for the low order bit, which is used differently depending on
|
|
|
|
where the flags are stored.
|
|
|
|
|
|
|
|
==================== Low order flags bit =========================
|
|
|
|
| REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
|
|
|
|
SYS_TABLES.TYPE | 1 | 1 | 1
|
|
|
|
dict_table_t::flags | 0 | 1 | 1
|
|
|
|
FSP_SPACE_FLAGS | 0 | 0 | 1
|
|
|
|
fil_space_t::flags | 0 | 0 | 1
|
|
|
|
|
|
|
|
Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
|
|
|
|
and the tablespace flags field was always 0. In the 5.1 plugin, these fields
|
|
|
|
were repurposed to identify compressed and dynamic row formats.
|
|
|
|
|
|
|
|
The following types and constants describe the flags found in dict_table_t
|
|
|
|
and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS
|
|
|
|
are described in fsp0fsp.h. */
|
|
|
|
|
|
|
|
/* @{ */
|
|
|
|
/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
|
|
|
|
#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */
|
|
|
|
/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
|
2018-04-28 15:49:09 +03:00
|
|
|
#define DICT_TF_COMPACT 1U /*!< Compact row format. */
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
|
|
|
|
the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
|
|
|
|
#define DICT_N_COLS_COMPACT 0x80000000UL
|
|
|
|
|
|
|
|
/** Width of the COMPACT flag */
|
|
|
|
#define DICT_TF_WIDTH_COMPACT 1
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Width of the ZIP_SSIZE flag */
|
|
|
|
#define DICT_TF_WIDTH_ZIP_SSIZE 4
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2017-06-01 13:03:55 +03:00
|
|
|
/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and
|
|
|
|
ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes
|
|
|
|
in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
|
|
|
|
store the whole blob or text field off-page atomically.
|
2014-02-26 19:11:54 +01:00
|
|
|
Secondary indexes are created from this external data using row_ext_t
|
|
|
|
to cache the BLOB prefixes. */
|
|
|
|
#define DICT_TF_WIDTH_ATOMIC_BLOBS 1
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** If a table is created with the MYSQL option DATA DIRECTORY and
|
|
|
|
innodb-file-per-table, an older engine will not be able to find that table.
|
|
|
|
This flag prevents older engines from attempting to open the table and
|
|
|
|
allows InnoDB to update_create_info() accordingly. */
|
|
|
|
#define DICT_TF_WIDTH_DATA_DIR 1
|
|
|
|
|
2013-12-19 14:36:38 +02:00
|
|
|
/**
|
|
|
|
Width of the page compression flag
|
|
|
|
*/
|
|
|
|
#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
|
|
|
|
#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
|
|
|
|
|
MDEV-10139 Support for InnoDB SEQUENCE objects
We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.
After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)
When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_log_flush_at_trx_commit=1.
Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in a middle of transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of the a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.
dict_table_t::no_rollback(): Check if the table does not support rollback.
BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.
DICT_TF_BITS: Add the NO_ROLLBACK flag.
row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).
row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.
ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.
ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)
innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
2017-03-27 18:58:16 +03:00
|
|
|
/**
|
2017-06-15 18:02:57 +03:00
|
|
|
The NO_ROLLBACK flag (3=yes; the values 1,2 used stand for
|
|
|
|
ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3)
|
2014-12-22 16:53:17 +02:00
|
|
|
*/
|
2017-06-15 18:02:57 +03:00
|
|
|
#define DICT_TF_WIDTH_NO_ROLLBACK 2
|
2013-12-19 14:36:38 +02:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Width of all the currently known table flags */
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
|
|
|
|
+ DICT_TF_WIDTH_ZIP_SSIZE \
|
|
|
|
+ DICT_TF_WIDTH_ATOMIC_BLOBS \
|
|
|
|
+ DICT_TF_WIDTH_DATA_DIR \
|
|
|
|
+ DICT_TF_WIDTH_PAGE_COMPRESSION \
|
|
|
|
+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
|
MDEV-10139 Support for InnoDB SEQUENCE objects
We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.
After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)
When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_log_flush_at_trx_commit=1.
Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in a middle of transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of the a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.
dict_table_t::no_rollback(): Check if the table does not support rollback.
BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.
DICT_TF_BITS: Add the NO_ROLLBACK flag.
row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).
row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.
ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.
ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)
innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
2017-03-27 18:58:16 +03:00
|
|
|
+ DICT_TF_WIDTH_NO_ROLLBACK)
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** Zero relative shift position of the COMPACT field */
|
|
|
|
#define DICT_TF_POS_COMPACT 0
|
|
|
|
/** Zero relative shift position of the ZIP_SSIZE field */
|
|
|
|
#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \
|
|
|
|
+ DICT_TF_WIDTH_COMPACT)
|
|
|
|
/** Zero relative shift position of the ATOMIC_BLOBS field */
|
|
|
|
#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \
|
|
|
|
+ DICT_TF_WIDTH_ZIP_SSIZE)
|
|
|
|
/** Zero relative shift position of the DATA_DIR field */
|
|
|
|
#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
|
|
|
|
+ DICT_TF_WIDTH_ATOMIC_BLOBS)
|
2013-12-19 14:36:38 +02:00
|
|
|
/** Zero relative shift position of the PAGE_COMPRESSION field */
|
MDEV-12873 InnoDB SYS_TABLES.TYPE incompatibility for PAGE_COMPRESSED=YES in MariaDB 10.2.2 to 10.2.6
Remove the SHARED_SPACE flag that was erroneously introduced in
MariaDB 10.2.2, and shift the SYS_TABLES.TYPE flags back to where
they were before MariaDB 10.2.2. While doing this, ensure that
tables created with affected MariaDB versions can be loaded,
and also ensure that tables created with MySQL 5.7 using the
TABLESPACE attribute cannot be loaded.
MariaDB 10.2.2 picked the SHARED_SPACE flag from MySQL 5.7,
shifting the MariaDB 10.1 flags PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL,
ATOMIC_WRITES by one bit. The SHARED_SPACE flag would always
be written as 0 by MariaDB, because MariaDB does not support
CREATE TABLESPACE or CREATE TABLE...TABLESPACE for InnoDB.
So, instead of the bits AALLLLCxxxxxxx we would have
AALLLLC0xxxxxxx if the table was created with MariaDB 10.2.2
to 10.2.6. (AA=ATOMIC_WRITES, LLLL=PAGE_COMPRESSION_LEVEL,
C=PAGE_COMPRESSED, xxxxxxx=7 bits that were not moved.)
PAGE_COMPRESSED=NO implies LLLLC=00000. That is not a problem.
If someone created a table in MariaDB 10.2.2 or 10.2.3 with
the attribute ATOMIC_WRITES=OFF (value 2; AA=10) and without
PAGE_COMPRESSED=YES or PAGE_COMPRESSION_LEVEL, the table should be
rejected. We ignore this problem, because it should be unlikely
for anyone to specify ATOMIC_WRITES=OFF, and because 10.2.2 and
10.2.2 were not mature releases. The value ATOMIC_WRITES=ON (1)
would be interpreted as ATOMIC_WRITES=OFF, but starting with
MariaDB 10.2.4 the ATOMIC_WRITES attribute is ignored.
PAGE_COMPRESSED=YES implies that PAGE_COMPRESSION_LEVEL be between
1 and 9 and that ROW_FORMAT be COMPACT or DYNAMIC. Thus, the affected
wrong bit pattern in SYS_TABLES.TYPE is of the form AALLLL10DB00001
where D signals the presence of a DATA DIRECTORY attribute and B is 1
for ROW_FORMAT=DYNAMIC and 0 for ROW_FORMAT=COMPACT. We must interpret
this bit pattern as AALLLL1DB00001 (discarding the extraneous 0 bit).
dict_sys_tables_rec_read(): Adjust the affected bit pattern when
reading the SYS_TABLES.TYPE column. In case of invalid flags,
report both SYS_TABLES.TYPE (after possible adjustment) and
SYS_TABLES.MIX_LEN.
dict_load_table_one(): Replace an unreachable condition on
!dict_tf2_is_valid() with a debug assertion. The flags will already
have been validated by dict_sys_tables_rec_read(); if that validation
fails, dict_load_table_low() will have failed.
fil_ibd_create(): Shorten an error message about a file pre-existing.
Datafile::validate_to_dd(): Clarify an error message about tablespace
flags mismatch.
ha_innobase::open(): Remove an unnecessary warning message.
dict_tf_is_valid(): Simplify and stricten the logic. Validate the
values of PAGE_COMPRESSION. Remove error log output; let the callers
handle that.
DICT_TF_BITS: Remove ATOMIC_WRITES, PAGE_ENCRYPTION, PAGE_ENCRYPTION_KEY.
The ATOMIC_WRITES is ignored once the SYS_TABLES.TYPE has been validated;
there is no need to store it in dict_table_t::flags. The PAGE_ENCRYPTION
and PAGE_ENCRYPTION_KEY are unused since MariaDB 10.1.4 (the GA release
was 10.1.8).
DICT_TF_BIT_MASK: Remove (unused).
FSP_FLAGS_MEM_ATOMIC_WRITES: Remove (the flags are never read).
row_import_read_v1(): Display an error if dict_tf_is_valid() fails.
2017-06-14 14:08:49 +03:00
|
|
|
#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \
|
|
|
|
+ DICT_TF_WIDTH_DATA_DIR)
|
2013-12-19 14:36:38 +02:00
|
|
|
/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
|
|
|
|
#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \
|
|
|
|
+ DICT_TF_WIDTH_PAGE_COMPRESSION)
|
MDEV-10139 Support for InnoDB SEQUENCE objects
We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.
After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)
When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_log_flush_at_trx_commit=1.
Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in a middle of transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of the a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.
dict_table_t::no_rollback(): Check if the table does not support rollback.
BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.
DICT_TF_BITS: Add the NO_ROLLBACK flag.
row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).
row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.
ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.
ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)
innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
2017-03-27 18:58:16 +03:00
|
|
|
/** Zero relative shift position of the NO_ROLLBACK field */
|
2017-06-15 18:02:57 +03:00
|
|
|
#define DICT_TF_POS_NO_ROLLBACK (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
|
2013-12-19 14:36:38 +02:00
|
|
|
+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
|
2017-06-15 18:02:57 +03:00
|
|
|
#define DICT_TF_POS_UNUSED (DICT_TF_POS_NO_ROLLBACK \
|
MDEV-10139 Support for InnoDB SEQUENCE objects
We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.
After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)
When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_log_flush_at_trx_commit=1.
Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in a middle of transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of the a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.
dict_table_t::no_rollback(): Check if the table does not support rollback.
BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.
DICT_TF_BITS: Add the NO_ROLLBACK flag.
row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).
row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.
ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.
ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)
innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
2017-03-27 18:58:16 +03:00
|
|
|
+ DICT_TF_WIDTH_NO_ROLLBACK)
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** Bit mask of the COMPACT field */
|
|
|
|
#define DICT_TF_MASK_COMPACT \
|
2016-06-21 14:21:03 +02:00
|
|
|
((~(~0U << DICT_TF_WIDTH_COMPACT)) \
|
2014-02-26 19:11:54 +01:00
|
|
|
<< DICT_TF_POS_COMPACT)
|
|
|
|
/** Bit mask of the ZIP_SSIZE field */
|
|
|
|
#define DICT_TF_MASK_ZIP_SSIZE \
|
2016-06-21 14:21:03 +02:00
|
|
|
((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE)) \
|
2014-02-26 19:11:54 +01:00
|
|
|
<< DICT_TF_POS_ZIP_SSIZE)
|
|
|
|
/** Bit mask of the ATOMIC_BLOBS field */
|
|
|
|
#define DICT_TF_MASK_ATOMIC_BLOBS \
|
2016-06-21 14:21:03 +02:00
|
|
|
((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
|
2014-02-26 19:11:54 +01:00
|
|
|
<< DICT_TF_POS_ATOMIC_BLOBS)
|
|
|
|
/** Bit mask of the DATA_DIR field */
|
|
|
|
#define DICT_TF_MASK_DATA_DIR \
|
2016-06-21 14:21:03 +02:00
|
|
|
((~(~0U << DICT_TF_WIDTH_DATA_DIR)) \
|
2014-02-26 19:11:54 +01:00
|
|
|
<< DICT_TF_POS_DATA_DIR)
|
2013-12-19 14:36:38 +02:00
|
|
|
/** Bit mask of the PAGE_COMPRESSION field */
|
|
|
|
#define DICT_TF_MASK_PAGE_COMPRESSION \
|
2016-11-25 06:09:00 +02:00
|
|
|
((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
|
2013-12-19 14:36:38 +02:00
|
|
|
<< DICT_TF_POS_PAGE_COMPRESSION)
|
|
|
|
/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
|
|
|
|
#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
|
2016-11-25 06:09:00 +02:00
|
|
|
((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
|
2013-12-19 14:36:38 +02:00
|
|
|
<< DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
|
2017-06-15 18:02:57 +03:00
|
|
|
/** Bit mask of the NO_ROLLBACK field */
|
|
|
|
#define DICT_TF_MASK_NO_ROLLBACK \
|
|
|
|
((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \
|
|
|
|
<< DICT_TF_POS_NO_ROLLBACK)
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** Return the value of the COMPACT field */
|
|
|
|
#define DICT_TF_GET_COMPACT(flags) \
|
|
|
|
((flags & DICT_TF_MASK_COMPACT) \
|
|
|
|
>> DICT_TF_POS_COMPACT)
|
|
|
|
/** Return the value of the ZIP_SSIZE field */
|
|
|
|
#define DICT_TF_GET_ZIP_SSIZE(flags) \
|
|
|
|
((flags & DICT_TF_MASK_ZIP_SSIZE) \
|
|
|
|
>> DICT_TF_POS_ZIP_SSIZE)
|
|
|
|
/** Return the value of the ATOMIC_BLOBS field */
|
|
|
|
#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
|
|
|
|
((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
|
|
|
|
>> DICT_TF_POS_ATOMIC_BLOBS)
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Return the value of the DATA_DIR field */
|
2014-02-26 19:11:54 +01:00
|
|
|
#define DICT_TF_HAS_DATA_DIR(flags) \
|
|
|
|
((flags & DICT_TF_MASK_DATA_DIR) \
|
|
|
|
>> DICT_TF_POS_DATA_DIR)
|
2013-12-19 14:36:38 +02:00
|
|
|
/** Return the value of the PAGE_COMPRESSION field */
|
|
|
|
#define DICT_TF_GET_PAGE_COMPRESSION(flags) \
|
|
|
|
((flags & DICT_TF_MASK_PAGE_COMPRESSION) \
|
|
|
|
>> DICT_TF_POS_PAGE_COMPRESSION)
|
|
|
|
/** Return the value of the PAGE_COMPRESSION_LEVEL field */
|
|
|
|
#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \
|
|
|
|
((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \
|
|
|
|
>> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/* @} */
|
|
|
|
|
|
|
|
/** @brief Table Flags set number 2.
|
|
|
|
|
|
|
|
These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
|
|
|
|
will be written as 0. The column may contain garbage for tables
|
|
|
|
created with old versions of InnoDB that only implemented
|
|
|
|
ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
|
|
|
|
for unknown bits in order to protect backward incompatibility. */
|
|
|
|
/* @{ */
|
|
|
|
/** Total number of bits in table->flags2. */
|
2017-11-01 20:17:48 +03:00
|
|
|
#define DICT_TF2_BITS 7
|
2017-06-13 23:04:01 +03:00
|
|
|
#define DICT_TF2_UNUSED_BIT_MASK (~0U << DICT_TF2_BITS)
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_TF2_BIT_MASK ~DICT_TF2_UNUSED_BIT_MASK
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_TEMPORARY 1U
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** The table has an internal defined DOC ID column */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_FTS_HAS_DOC_ID 2U
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** The table has an FTS index */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_FTS 4U
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Need to add Doc ID column for FTS index build.
|
|
|
|
This is a transient bit for index build */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_FTS_ADD_DOC_ID 8U
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** This bit is used during table creation to indicate that it will
|
|
|
|
use its own tablespace instead of the system tablespace. */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_USE_FILE_PER_TABLE 16U
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** Set when we discard/detach the tablespace */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_DISCARDED 32U
|
2014-05-05 18:20:28 +02:00
|
|
|
|
|
|
|
/** This bit is set if all aux table names (both common tables and
|
|
|
|
index tables) of a FTS table are in HEX format. */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_TF2_FTS_AUX_HEX_NAME 64U
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/* @} */
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_TF2_FLAG_SET(table, flag) \
|
2014-02-26 19:11:54 +01:00
|
|
|
(table->flags2 |= (flag))
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_TF2_FLAG_IS_SET(table, flag) \
|
2014-02-26 19:11:54 +01:00
|
|
|
(table->flags2 & (flag))
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#define DICT_TF2_FLAG_UNSET(table, flag) \
|
2014-02-26 19:11:54 +01:00
|
|
|
(table->flags2 &= ~(flag))
|
|
|
|
|
|
|
|
/** Tables could be chained together with Foreign key constraint. When
|
|
|
|
first load the parent table, we would load all of its descedents.
|
|
|
|
This could result in rescursive calls and out of stack error eventually.
|
|
|
|
DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads,
|
|
|
|
when exceeded, the child table will not be loaded. It will be loaded when
|
|
|
|
the foreign constraint check needs to be run. */
|
|
|
|
#define DICT_FK_MAX_RECURSIVE_LOAD 20
|
|
|
|
|
|
|
|
/** Similarly, when tables are chained together with foreign key constraints
|
|
|
|
with on cascading delete/update clause, delete from parent table could
|
|
|
|
result in recursive cascading calls. This defines the maximum number of
|
|
|
|
such cascading deletes/updates allowed. When exceeded, the delete from
|
|
|
|
parent table will fail, and user has to drop excessive foreign constraint
|
|
|
|
before proceeds. */
|
2018-02-06 13:53:20 +02:00
|
|
|
#define FK_MAX_CASCADE_DEL 15
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/**********************************************************************//**
|
|
|
|
Creates a table memory object.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return own: table object */
|
2014-02-26 19:11:54 +01:00
|
|
|
dict_table_t*
|
|
|
|
dict_mem_table_create(
|
|
|
|
/*==================*/
|
|
|
|
const char* name, /*!< in: table name */
|
MDEV-12266: Change dict_table_t::space to fil_space_t*
InnoDB always keeps all tablespaces in the fil_system cache.
The fil_system.LRU is only for closing file handles; the
fil_space_t and fil_node_t for all data files will remain
in main memory. Between startup to shutdown, they can only be
created and removed by DDL statements. Therefore, we can
let dict_table_t::space point directly to the fil_space_t.
dict_table_t::space_id: A numeric tablespace ID for the corner cases
where we do not have a tablespace. The most prominent examples are
ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file.
There are a few functional differences; most notably:
(1) DROP TABLE will delete matching .ibd and .cfg files,
even if they were not attached to the data dictionary.
(2) Some error messages will report file names instead of numeric IDs.
There still are many functions that use numeric tablespace IDs instead
of fil_space_t*, and many functions could be converted to fil_space_t
member functions. Also, Tablespace and Datafile should be merged with
fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use
fil_space_t& instead of a numeric ID, and after moving to a single
buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to
fil_space_t::page_hash.
FilSpace: Remove. Only few calls to fil_space_acquire() will remain,
and gradually they should be removed.
mtr_t::set_named_space_id(ulint): Renamed from set_named_space(),
to prevent accidental calls to this slower function. Very few
callers remain.
fseg_create(), fsp_reserve_free_extents(): Take fil_space_t*
as a parameter instead of a space_id.
fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(),
fil_name_write_rename(), fil_rename_tablespace(). Mariabackup
passes the parameter log=false; InnoDB passes log=true.
dict_mem_table_create(): Take fil_space_t* instead of space_id
as parameter.
dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter
'status' with 'bool cached'.
dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name.
fil_ibd_open(): Return the tablespace.
fil_space_t::set_imported(): Replaces fil_space_set_imported().
truncate_t: Change many member function parameters to fil_space_t*,
and remove page_size parameters.
row_truncate_prepare(): Merge to its only caller.
row_drop_table_from_cache(): Assert that the table is persistent.
dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL
if the tablespace has been discarded.
row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
|
|
|
fil_space_t* space, /*!< in: tablespace */
|
2016-08-12 11:17:45 +03:00
|
|
|
ulint n_cols, /*!< in: total number of columns
|
|
|
|
including virtual and non-virtual
|
|
|
|
columns */
|
|
|
|
ulint n_v_cols, /*!< in: number of virtual columns */
|
2014-02-26 19:11:54 +01:00
|
|
|
ulint flags, /*!< in: table flags */
|
|
|
|
ulint flags2); /*!< in: table flags2 */
|
|
|
|
/****************************************************************//**
|
|
|
|
Free a table memory object. */
|
|
|
|
void
|
|
|
|
dict_mem_table_free(
|
|
|
|
/*================*/
|
|
|
|
dict_table_t* table); /*!< in: table */
|
|
|
|
/**********************************************************************//**
|
|
|
|
Adds a column definition to a table. */
|
|
|
|
void
|
|
|
|
dict_mem_table_add_col(
|
|
|
|
/*===================*/
|
|
|
|
dict_table_t* table, /*!< in: table */
|
|
|
|
mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
|
|
|
|
const char* name, /*!< in: column name, or NULL */
|
|
|
|
ulint mtype, /*!< in: main datatype */
|
|
|
|
ulint prtype, /*!< in: precise type */
|
|
|
|
ulint len) /*!< in: precision */
|
2016-06-21 14:21:03 +02:00
|
|
|
MY_ATTRIBUTE((nonnull(1)));
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Adds a virtual column definition to a table.
|
|
|
|
@param[in,out] table table
|
|
|
|
@param[in] heap temporary memory heap, or NULL. It is
|
|
|
|
used to store name when we have not finished
|
|
|
|
adding all columns. When all columns are
|
|
|
|
added, the whole name will copy to memory from
|
|
|
|
table->heap
|
|
|
|
@param[in] name column name
|
|
|
|
@param[in] mtype main datatype
|
|
|
|
@param[in] prtype precise type
|
|
|
|
@param[in] len length
|
|
|
|
@param[in] pos position in a table
|
|
|
|
@param[in] num_base number of base columns
|
|
|
|
@return the virtual column definition */
|
|
|
|
dict_v_col_t*
|
|
|
|
dict_mem_table_add_v_col(
|
|
|
|
dict_table_t* table,
|
|
|
|
mem_heap_t* heap,
|
|
|
|
const char* name,
|
|
|
|
ulint mtype,
|
|
|
|
ulint prtype,
|
|
|
|
ulint len,
|
|
|
|
ulint pos,
|
|
|
|
ulint num_base);
|
2016-09-06 09:43:16 +03:00
|
|
|
|
|
|
|
/** Adds a stored column definition to a table.
|
|
|
|
@param[in] table table
|
|
|
|
@param[in] num_base number of base columns. */
|
|
|
|
void
|
|
|
|
dict_mem_table_add_s_col(
|
|
|
|
dict_table_t* table,
|
|
|
|
ulint num_base);
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/**********************************************************************//**
|
|
|
|
Renames a column of a table in the data dictionary cache. */
|
|
|
|
void
|
|
|
|
dict_mem_table_col_rename(
|
|
|
|
/*======================*/
|
|
|
|
dict_table_t* table, /*!< in/out: table */
|
2016-09-06 09:43:16 +03:00
|
|
|
ulint nth_col,/*!< in: column index */
|
2014-02-26 19:11:54 +01:00
|
|
|
const char* from, /*!< in: old column name */
|
2016-08-12 11:17:45 +03:00
|
|
|
const char* to, /*!< in: new column name */
|
|
|
|
bool is_virtual);
|
|
|
|
/*!< in: if this is a virtual column */
|
2014-02-26 19:11:54 +01:00
|
|
|
/**********************************************************************//**
|
|
|
|
This function populates a dict_col_t memory structure with
|
|
|
|
supplied information. */
|
|
|
|
void
|
|
|
|
dict_mem_fill_column_struct(
|
|
|
|
/*========================*/
|
|
|
|
dict_col_t* column, /*!< out: column struct to be
|
|
|
|
filled */
|
|
|
|
ulint col_pos, /*!< in: column position */
|
|
|
|
ulint mtype, /*!< in: main data type */
|
|
|
|
ulint prtype, /*!< in: precise type */
|
|
|
|
ulint col_len); /*!< in: column length */
|
|
|
|
/**********************************************************************//**
|
|
|
|
This function poplulates a dict_index_t index memory structure with
|
|
|
|
supplied information. */
|
|
|
|
UNIV_INLINE
|
|
|
|
void
|
|
|
|
dict_mem_fill_index_struct(
|
|
|
|
/*=======================*/
|
|
|
|
dict_index_t* index, /*!< out: index to be filled */
|
|
|
|
mem_heap_t* heap, /*!< in: memory heap */
|
|
|
|
const char* index_name, /*!< in: index name */
|
|
|
|
ulint type, /*!< in: DICT_UNIQUE,
|
|
|
|
DICT_CLUSTERED, ... ORed */
|
|
|
|
ulint n_fields); /*!< in: number of fields */
|
|
|
|
/**********************************************************************//**
|
|
|
|
Creates an index memory object.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return own: index object */
|
2014-02-26 19:11:54 +01:00
|
|
|
dict_index_t*
|
|
|
|
dict_mem_index_create(
|
|
|
|
/*==================*/
|
2018-03-23 17:25:56 +02:00
|
|
|
dict_table_t* table, /*!< in: table */
|
2014-02-26 19:11:54 +01:00
|
|
|
const char* index_name, /*!< in: index name */
|
|
|
|
ulint type, /*!< in: DICT_UNIQUE,
|
|
|
|
DICT_CLUSTERED, ... ORed */
|
|
|
|
ulint n_fields); /*!< in: number of fields */
|
|
|
|
/**********************************************************************//**
|
|
|
|
Adds a field definition to an index. NOTE: does not take a copy
|
|
|
|
of the column name if the field is a column. The memory occupied
|
|
|
|
by the column name may be released only after publishing the index. */
|
|
|
|
void
|
|
|
|
dict_mem_index_add_field(
|
|
|
|
/*=====================*/
|
|
|
|
dict_index_t* index, /*!< in: index */
|
|
|
|
const char* name, /*!< in: column name */
|
|
|
|
ulint prefix_len); /*!< in: 0 or the column prefix length
|
|
|
|
in a MySQL index like
|
|
|
|
INDEX (textcol(25)) */
|
|
|
|
/**********************************************************************//**
|
|
|
|
Frees an index memory object. */
|
|
|
|
void
|
|
|
|
dict_mem_index_free(
|
|
|
|
/*================*/
|
|
|
|
dict_index_t* index); /*!< in: index */
|
|
|
|
/**********************************************************************//**
|
|
|
|
Creates and initializes a foreign constraint memory object.
|
2016-08-12 11:17:45 +03:00
|
|
|
@return own: foreign constraint struct */
|
2014-02-26 19:11:54 +01:00
|
|
|
dict_foreign_t*
|
|
|
|
dict_mem_foreign_create(void);
|
|
|
|
/*=========================*/
|
|
|
|
|
|
|
|
/**********************************************************************//**
|
|
|
|
Sets the foreign_table_name_lookup pointer based on the value of
|
|
|
|
lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
|
|
|
|
will point to foreign_table_name. If 2, then another string is
|
|
|
|
allocated from the heap and set to lower case. */
|
|
|
|
void
|
|
|
|
dict_mem_foreign_table_name_lookup_set(
|
|
|
|
/*===================================*/
|
|
|
|
dict_foreign_t* foreign, /*!< in/out: foreign struct */
|
|
|
|
ibool do_alloc); /*!< in: is an alloc needed */
|
|
|
|
|
|
|
|
/**********************************************************************//**
|
|
|
|
Sets the referenced_table_name_lookup pointer based on the value of
|
|
|
|
lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
|
|
|
|
will point to referenced_table_name. If 2, then another string is
|
|
|
|
allocated from the heap and set to lower case. */
|
|
|
|
void
|
|
|
|
dict_mem_referenced_table_name_lookup_set(
|
|
|
|
/*======================================*/
|
|
|
|
dict_foreign_t* foreign, /*!< in/out: foreign struct */
|
|
|
|
ibool do_alloc); /*!< in: is an alloc needed */
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** Fills the dependent virtual columns in a set.
|
|
|
|
Reason for being dependent are
|
|
|
|
1) FK can be present on base column of virtual columns
|
|
|
|
2) FK can be present on column which is a part of virtual index
|
|
|
|
@param[in,out] foreign foreign key information. */
|
|
|
|
void
|
|
|
|
dict_mem_foreign_fill_vcol_set(
|
|
|
|
dict_foreign_t* foreign);
|
|
|
|
|
|
|
|
/** Fill virtual columns set in each fk constraint present in the table.
|
|
|
|
@param[in,out] table innodb table object. */
|
|
|
|
void
|
|
|
|
dict_mem_table_fill_foreign_vcol_set(
|
|
|
|
dict_table_t* table);
|
|
|
|
|
|
|
|
/** Free the vcol_set from all foreign key constraint on the table.
|
|
|
|
@param[in,out] table innodb table object. */
|
|
|
|
void
|
|
|
|
dict_mem_table_free_foreign_vcol_set(
|
|
|
|
dict_table_t* table);
|
|
|
|
|
MDEV-18836 ASAN: heap-use-after-free after TRUNCATE
row_drop_tables_for_mysql_in_background(): Copy the table name
before closing the table handle, to avoid heap-use-after-free if
another thread succeeds in dropping the table before
row_drop_table_for_mysql_in_background() completes the table name lookup.
dict_mem_create_temporary_tablename(): With innodb_safe_truncate=ON
(the default), generate a simple, unique, collision-free table name
using only the id, no pseudorandom component. This is safe, because
on startup, we will drop any #sql tables that might exist in InnoDB.
This is a backport from 10.3. It should have been backported already
as part of backporting MDEV-14717,MDEV-14585 which were prerequisites
for the MDEV-13564 backup-friendly TRUNCATE TABLE.
This seems to reduce the chance of table creation failures in
ha_innobase::truncate().
ha_innobase::truncate(): Do not invoke close(), but instead
mimic it, so that we can restore to the original table handle
in case opening the truncated copy of the table failed.
2019-03-13 13:31:45 +02:00
|
|
|
/** Create a temporary tablename like "#sql-ibNNN".
|
2014-11-18 17:41:12 +01:00
|
|
|
@param[in] heap A memory heap
|
|
|
|
@param[in] dbtab Table name in the form database/table name
|
|
|
|
@param[in] id Table id
|
|
|
|
@return A unique temporary tablename suitable for InnoDB use */
|
2014-02-26 19:11:54 +01:00
|
|
|
char*
|
|
|
|
dict_mem_create_temporary_tablename(
|
2014-11-18 17:41:12 +01:00
|
|
|
mem_heap_t* heap,
|
|
|
|
const char* dbtab,
|
|
|
|
table_id_t id);
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** SQL identifier name wrapper for pretty-printing */
|
|
|
|
class id_name_t
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
/** Default constructor */
|
|
|
|
id_name_t()
|
|
|
|
: m_name()
|
|
|
|
{}
|
|
|
|
/** Constructor
|
|
|
|
@param[in] name identifier to assign */
|
|
|
|
explicit id_name_t(
|
|
|
|
const char* name)
|
|
|
|
: m_name(name)
|
|
|
|
{}
|
|
|
|
|
|
|
|
/** Assignment operator
|
|
|
|
@param[in] name identifier to assign */
|
|
|
|
id_name_t& operator=(
|
|
|
|
const char* name)
|
|
|
|
{
|
|
|
|
m_name = name;
|
|
|
|
return(*this);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Implicit type conversion
|
|
|
|
@return the name */
|
|
|
|
operator const char*() const
|
|
|
|
{
|
|
|
|
return(m_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Explicit type conversion
|
|
|
|
@return the name */
|
|
|
|
const char* operator()() const
|
|
|
|
{
|
|
|
|
return(m_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
/** The name in internal representation */
|
|
|
|
const char* m_name;
|
|
|
|
};
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Data structure for a column in a table */
|
|
|
|
struct dict_col_t{
|
|
|
|
/*----------------------*/
|
|
|
|
/** The following are copied from dtype_t,
|
|
|
|
so that all bit-fields can be packed tightly. */
|
|
|
|
/* @{ */
|
|
|
|
unsigned prtype:32; /*!< precise type; MySQL data
|
|
|
|
type, charset code, flags to
|
|
|
|
indicate nullability,
|
|
|
|
signedness, whether this is a
|
|
|
|
binary string, whether this is
|
|
|
|
a true VARCHAR where MySQL
|
|
|
|
uses 2 bytes to store the length */
|
|
|
|
unsigned mtype:8; /*!< main data type */
|
|
|
|
|
|
|
|
/* the remaining fields do not affect alphabetical ordering: */
|
|
|
|
|
|
|
|
unsigned len:16; /*!< length; for MySQL data this
|
|
|
|
is field->pack_length(),
|
|
|
|
except that for a >= 5.0.3
|
|
|
|
type true VARCHAR this is the
|
|
|
|
maximum byte length of the
|
|
|
|
string data (in addition to
|
|
|
|
the string, MySQL uses 1 or 2
|
|
|
|
bytes to store the string length) */
|
|
|
|
|
2018-01-22 11:18:10 +02:00
|
|
|
unsigned mbminlen:3; /*!< minimum length of a
|
|
|
|
character, in bytes */
|
|
|
|
unsigned mbmaxlen:3; /*!< maximum length of a
|
|
|
|
character, in bytes */
|
2014-02-26 19:11:54 +01:00
|
|
|
/*----------------------*/
|
|
|
|
/* End of definitions copied from dtype_t */
|
|
|
|
/* @} */
|
|
|
|
|
|
|
|
unsigned ind:10; /*!< table column position
|
|
|
|
(starting from 0) */
|
|
|
|
unsigned ord_part:1; /*!< nonzero if this column
|
|
|
|
appears in the ordering fields
|
|
|
|
of an index */
|
|
|
|
unsigned max_prefix:12; /*!< maximum index prefix length on
|
|
|
|
this column. Our current max limit is
|
2017-06-01 13:03:55 +03:00
|
|
|
3072 (REC_VERSION_56_MAX_INDEX_COL_LEN)
|
|
|
|
bytes. */
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
private:
|
|
|
|
/** Special value of ind for a dropped column */
|
|
|
|
static const unsigned DROPPED = 1023;
|
|
|
|
public:
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
2018-06-04 15:55:37 +03:00
|
|
|
/** Detach the column from an index.
|
|
|
|
@param[in] index index to be detached from */
|
|
|
|
inline void detach(const dict_index_t& index);
|
2018-06-04 16:12:00 +03:00
|
|
|
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
/** Data for instantly added columns */
|
2018-07-26 22:52:53 +03:00
|
|
|
struct def_t {
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
/** original default value of instantly added column */
|
|
|
|
const void* data;
|
|
|
|
/** len of data, or UNIV_SQL_DEFAULT if unavailable */
|
|
|
|
ulint len;
|
|
|
|
} def_val;
|
|
|
|
|
|
|
|
/** Retrieve the column name.
|
2018-09-26 17:18:13 +03:00
|
|
|
@param[in] table the table of this column */
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
const char* name(const dict_table_t& table) const;
|
|
|
|
|
|
|
|
/** @return whether this is a virtual column */
|
|
|
|
bool is_virtual() const { return prtype & DATA_VIRTUAL; }
|
|
|
|
/** @return whether NULL is an allowed value for this column */
|
|
|
|
bool is_nullable() const { return !(prtype & DATA_NOT_NULL); }
|
2017-11-22 22:21:37 +02:00
|
|
|
|
2018-01-13 00:19:16 +03:00
|
|
|
/** @return whether table of this system field is TRX_ID-based */
|
|
|
|
bool vers_native() const
|
|
|
|
{
|
2018-03-19 17:25:58 +03:00
|
|
|
ut_ad(vers_sys_start() || vers_sys_end());
|
2018-01-13 00:19:16 +03:00
|
|
|
ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY);
|
|
|
|
return mtype == DATA_INT;
|
|
|
|
}
|
2018-12-03 19:56:55 +03:00
|
|
|
/** @return whether this user column (not row_start, row_end)
|
|
|
|
has System Versioning property */
|
2017-11-22 22:21:37 +02:00
|
|
|
bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
|
|
|
|
/** @return whether this is the system version start */
|
2017-12-18 19:03:51 +03:00
|
|
|
bool vers_sys_start() const
|
2017-11-22 22:21:37 +02:00
|
|
|
{
|
|
|
|
return (prtype & DATA_VERSIONED) == DATA_VERS_START;
|
|
|
|
}
|
|
|
|
/** @return whether this is the system version end */
|
2017-12-18 19:03:51 +03:00
|
|
|
bool vers_sys_end() const
|
2017-11-22 22:21:37 +02:00
|
|
|
{
|
|
|
|
return (prtype & DATA_VERSIONED) == DATA_VERS_END;
|
|
|
|
}
|
|
|
|
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
/** @return whether this is an instantly-added column */
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
bool is_added() const
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
{
|
|
|
|
DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data);
|
|
|
|
return def_val.len != UNIV_SQL_DEFAULT;
|
|
|
|
}
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** Flag the column instantly dropped */
|
|
|
|
void set_dropped() { ind = DROPPED; }
|
|
|
|
/** Flag the column instantly dropped.
|
|
|
|
@param[in] not_null whether the column was NOT NULL
|
|
|
|
@param[in] len2 whether the length exceeds 255 bytes
|
|
|
|
@param[in] fixed_len the fixed length in bytes, or 0 */
|
|
|
|
void set_dropped(bool not_null, bool len2, unsigned fixed)
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(!len2 || !fixed);
|
|
|
|
prtype = not_null
|
|
|
|
? DATA_NOT_NULL | DATA_BINARY_TYPE
|
|
|
|
: DATA_BINARY_TYPE;
|
|
|
|
if (fixed) {
|
|
|
|
mtype = DATA_FIXBINARY;
|
|
|
|
len = fixed;
|
|
|
|
} else {
|
|
|
|
mtype = DATA_BINARY;
|
|
|
|
len = len2 ? 65535 : 255;
|
|
|
|
}
|
|
|
|
mbminlen = mbmaxlen = 0;
|
|
|
|
ind = DROPPED;
|
|
|
|
ord_part = 0;
|
|
|
|
max_prefix = 0;
|
|
|
|
}
|
|
|
|
/** @return whether the column was instantly dropped */
|
|
|
|
bool is_dropped() const { return ind == DROPPED; }
|
|
|
|
/** @return whether the column was instantly dropped
|
|
|
|
@param[in] index the clustered index */
|
|
|
|
inline bool is_dropped(const dict_index_t& index) const;
|
|
|
|
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
/** Get the default value of an instantly-added column.
|
|
|
|
@param[out] len value length (in bytes), or UNIV_SQL_NULL
|
|
|
|
@return default value
|
|
|
|
@retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */
|
|
|
|
const byte* instant_value(ulint* len) const
|
|
|
|
{
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
DBUG_ASSERT(is_added());
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
*len = def_val.len;
|
|
|
|
return static_cast<const byte*>(def_val.data);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Remove the 'instant ADD' status of the column */
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
void clear_instant()
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
{
|
|
|
|
def_val.len = UNIV_SQL_DEFAULT;
|
|
|
|
def_val.data = NULL;
|
|
|
|
}
|
2018-11-08 12:22:10 +02:00
|
|
|
|
|
|
|
/** Determine if the columns have the same format
|
|
|
|
except for is_nullable() and is_versioned().
|
|
|
|
@param[in] other column to compare to
|
|
|
|
@return whether the columns have the same format */
|
2019-02-18 18:30:01 +02:00
|
|
|
bool same_format(const dict_col_t& other) const
|
2018-11-08 12:22:10 +02:00
|
|
|
{
|
2019-02-18 18:30:01 +02:00
|
|
|
return mtype == other.mtype
|
|
|
|
&& len >= other.len
|
|
|
|
&& mbminlen == other.mbminlen
|
|
|
|
&& mbmaxlen == other.mbmaxlen
|
|
|
|
&& !((prtype ^ other.prtype)
|
2019-02-25 15:35:00 +02:00
|
|
|
& ~(DATA_NOT_NULL | DATA_VERSIONED
|
|
|
|
| DATA_LONG_TRUE_VARCHAR));
|
2018-11-08 12:22:10 +02:00
|
|
|
}
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Index information put in a list of virtual column structure. Index
|
|
|
|
id and virtual column position in the index will be logged.
|
|
|
|
There can be multiple entries for a given index, with a different position. */
|
|
|
|
struct dict_v_idx_t {
|
|
|
|
/** active index on the column */
|
|
|
|
dict_index_t* index;
|
|
|
|
|
|
|
|
/** position in this index */
|
|
|
|
ulint nth_field;
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
|
|
|
dict_v_idx_t(dict_index_t* index, ulint nth_field)
|
|
|
|
: index(index), nth_field(nth_field) {}
|
2016-08-12 11:17:45 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
/** Index list to put in dict_v_col_t */
|
2019-05-27 19:45:44 +03:00
|
|
|
typedef std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> >
|
|
|
|
dict_v_idx_list;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Data structure for a virtual column in a table */
|
|
|
|
struct dict_v_col_t{
|
|
|
|
/** column structure */
|
|
|
|
dict_col_t m_col;
|
|
|
|
|
|
|
|
/** array of base column ptr */
|
|
|
|
dict_col_t** base_col;
|
|
|
|
|
|
|
|
/** number of base column */
|
2019-05-27 19:45:44 +03:00
|
|
|
unsigned num_base:10;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** column pos in table */
|
2019-05-27 19:45:44 +03:00
|
|
|
unsigned v_pos:10;
|
|
|
|
|
|
|
|
/** number of indexes */
|
|
|
|
unsigned n_v_indexes:12;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Virtual index list, and column position in the index,
|
2019-01-22 06:51:03 +02:00
|
|
|
the allocated memory is not from table->heap */
|
2016-08-12 11:17:45 +03:00
|
|
|
dict_v_idx_list* v_indexes;
|
|
|
|
|
2019-05-27 19:45:44 +03:00
|
|
|
/** Detach the column from an index.
|
|
|
|
@param[in] index index to be detached from */
|
|
|
|
void detach(const dict_index_t& index)
|
|
|
|
{
|
|
|
|
ut_ad(!n_v_indexes || v_indexes);
|
|
|
|
if (!n_v_indexes) return;
|
|
|
|
auto i = v_indexes->before_begin();
|
|
|
|
ut_d(unsigned n = 0);
|
|
|
|
do {
|
|
|
|
auto prev = i++;
|
|
|
|
if (i == v_indexes->end()) {
|
|
|
|
ut_ad(n == n_v_indexes);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ut_ad(++n <= n_v_indexes);
|
|
|
|
if (i->index == &index) {
|
|
|
|
v_indexes->erase_after(prev);
|
|
|
|
n_v_indexes--;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} while (i != v_indexes->end());
|
|
|
|
}
|
2016-08-12 11:17:45 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
/** Data structure for newly added virtual column in a table */
|
|
|
|
struct dict_add_v_col_t{
|
|
|
|
/** number of new virtual column */
|
|
|
|
ulint n_v_col;
|
|
|
|
|
|
|
|
/** column structures */
|
|
|
|
const dict_v_col_t* v_col;
|
|
|
|
|
|
|
|
/** new col names */
|
|
|
|
const char** v_col_name;
|
|
|
|
};
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** Data structure for a stored column in a table. */
|
|
|
|
struct dict_s_col_t {
|
|
|
|
/** Stored column ptr */
|
|
|
|
dict_col_t* m_col;
|
|
|
|
/** array of base col ptr */
|
|
|
|
dict_col_t** base_col;
|
|
|
|
/** number of base columns */
|
|
|
|
ulint num_base;
|
|
|
|
/** column pos in table */
|
|
|
|
ulint s_pos;
|
|
|
|
};
|
|
|
|
|
|
|
|
/** list to put stored column for create_table_info_t */
|
2019-05-27 19:45:44 +03:00
|
|
|
typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> >
|
|
|
|
dict_s_col_list;
|
2016-09-06 09:43:16 +03:00
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
|
|
|
|
is the maximum indexed column length (or indexed prefix length) in
|
|
|
|
ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
|
|
|
|
any fixed-length field that is longer than this will be encoded as
|
|
|
|
a variable-length field.
|
|
|
|
|
|
|
|
It is set to 3*256, so that one can create a column prefix index on
|
|
|
|
256 characters of a TEXT or VARCHAR column also in the UTF-8
|
|
|
|
charset. In that charset, a character may take at most 3 bytes. This
|
|
|
|
constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
|
|
|
|
files would be at risk! */
|
|
|
|
#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN
|
|
|
|
|
|
|
|
/** Find out maximum indexed column length by its table format.
|
|
|
|
For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
|
|
|
|
field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
|
2017-06-01 13:03:55 +03:00
|
|
|
ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could
|
2014-02-26 19:11:54 +01:00
|
|
|
be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
|
2017-06-01 13:03:55 +03:00
|
|
|
#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
|
|
|
|
(dict_table_has_atomic_blobs(table) \
|
|
|
|
? REC_VERSION_56_MAX_INDEX_COL_LEN \
|
|
|
|
: REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
|
|
|
|
|
|
|
|
#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
|
|
|
|
(DICT_TF_HAS_ATOMIC_BLOBS(flags) \
|
|
|
|
? REC_VERSION_56_MAX_INDEX_COL_LEN \
|
|
|
|
: REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** Defines the maximum fixed length column size */
|
|
|
|
#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN
|
2016-09-06 09:43:16 +03:00
|
|
|
|
2014-08-06 15:39:15 +03:00
|
|
|
#ifdef WITH_WSREP
|
|
|
|
#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
|
|
|
|
#endif /* WITH_WSREP */
|
2014-02-26 19:11:54 +01:00
|
|
|
|
|
|
|
/** Data structure for a field in an index */
|
|
|
|
struct dict_field_t{
|
|
|
|
dict_col_t* col; /*!< pointer to the table column */
|
2016-08-12 11:17:45 +03:00
|
|
|
id_name_t name; /*!< name of the column */
|
2014-02-26 19:11:54 +01:00
|
|
|
unsigned prefix_len:12; /*!< 0 or the length of the column
|
|
|
|
prefix in bytes in a MySQL index of
|
|
|
|
type, e.g., INDEX (textcol(25));
|
|
|
|
must be smaller than
|
|
|
|
DICT_MAX_FIELD_LEN_BY_FORMAT;
|
|
|
|
NOTE that in the UTF-8 charset, MySQL
|
|
|
|
sets this to (mbmaxlen * the prefix len)
|
|
|
|
in UTF-8 chars */
|
|
|
|
unsigned fixed_len:10; /*!< 0 or the fixed length of the
|
|
|
|
column if smaller than
|
|
|
|
DICT_ANTELOPE_MAX_INDEX_COL_LEN */
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
2018-08-03 11:22:20 +03:00
|
|
|
/** Zero-initialize all fields */
|
|
|
|
dict_field_t() : col(NULL), name(NULL), prefix_len(0), fixed_len(0) {}
|
2018-08-03 15:57:23 +03:00
|
|
|
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
/** Check whether two index fields are equivalent.
|
|
|
|
@param[in] old the other index field
|
|
|
|
@return whether the index fields are equivalent */
|
|
|
|
bool same(const dict_field_t& other) const
|
|
|
|
{
|
|
|
|
return(prefix_len == other.prefix_len
|
|
|
|
&& fixed_len == other.fixed_len);
|
|
|
|
}
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/**********************************************************************//**
|
|
|
|
PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
|
|
|
|
COMPRESSION FAILURES
|
|
|
|
(Note: this is relevant only for compressed indexes)
|
|
|
|
GOAL: Avoid compression failures by maintaining information about the
|
|
|
|
compressibility of data. If data is not very compressible then leave
|
|
|
|
some extra space 'padding' in the uncompressed page making it more
|
|
|
|
likely that compression of less than fully packed uncompressed page will
|
|
|
|
succeed.
|
|
|
|
|
|
|
|
This padding heuristic works by increasing the pad linearly until the
|
|
|
|
desired failure rate is reached. A "round" is a fixed number of
|
|
|
|
compression operations.
|
|
|
|
After each round, the compression failure rate for that round is
|
|
|
|
computed. If the failure rate is too high, then padding is incremented
|
|
|
|
by a fixed value, otherwise it's left intact.
|
|
|
|
If the compression failure is lower than the desired rate for a fixed
|
|
|
|
number of consecutive rounds, then the padding is decreased by a fixed
|
|
|
|
value. This is done to prevent overshooting the padding value,
|
|
|
|
and to accommodate the possible change in data compressibility. */
|
|
|
|
|
|
|
|
/** Number of zip ops in one round. */
|
|
|
|
#define ZIP_PAD_ROUND_LEN (128)
|
|
|
|
|
|
|
|
/** Number of successful rounds after which the padding is decreased */
|
|
|
|
#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5)
|
|
|
|
|
|
|
|
/** Amount by which padding is increased. */
|
|
|
|
#define ZIP_PAD_INCR (128)
|
|
|
|
|
|
|
|
/** Percentage of compression failures that are allowed in a single
|
|
|
|
round */
|
|
|
|
extern ulong zip_failure_threshold_pct;
|
|
|
|
|
|
|
|
/** Maximum percentage of a page that can be allowed as a pad to avoid
|
|
|
|
compression failures */
|
|
|
|
extern ulong zip_pad_max;
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Data structure to hold information about about how much space in
|
2014-02-26 19:11:54 +01:00
|
|
|
an uncompressed page should be left as padding to avoid compression
|
|
|
|
failures. This estimate is based on a self-adapting heuristic. */
|
|
|
|
struct zip_pad_info_t {
|
2016-08-12 11:17:45 +03:00
|
|
|
SysMutex* mutex; /*!< mutex protecting the info */
|
2018-12-28 22:28:58 +04:00
|
|
|
Atomic_counter<ulint>
|
|
|
|
pad; /*!< number of bytes used as pad */
|
2014-02-26 19:11:54 +01:00
|
|
|
ulint success;/*!< successful compression ops during
|
|
|
|
current round */
|
|
|
|
ulint failure;/*!< failed compression ops during
|
|
|
|
current round */
|
|
|
|
ulint n_rounds;/*!< number of currently successful
|
|
|
|
rounds */
|
2015-05-04 22:13:46 +02:00
|
|
|
volatile os_once::state_t
|
|
|
|
mutex_created;
|
|
|
|
/*!< Creation state of mutex member */
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
2014-08-06 15:28:58 +03:00
|
|
|
/** Number of samples of data size kept when page compression fails for
|
|
|
|
a certain index.*/
|
|
|
|
#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
|
|
|
|
system clustered index when there is no primary key. */
|
|
|
|
const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
|
|
|
|
|
|
|
|
/* Estimated number of offsets in records (based on columns)
|
|
|
|
to start with. */
|
|
|
|
#define OFFS_IN_REC_NORMAL_SIZE 100
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Data structure for an index. Most fields will be
|
|
|
|
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
struct dict_index_t {
|
2014-02-26 19:11:54 +01:00
|
|
|
index_id_t id; /*!< id of the index */
|
|
|
|
mem_heap_t* heap; /*!< memory heap */
|
2016-08-12 11:17:45 +03:00
|
|
|
id_name_t name; /*!< index name */
|
2014-02-26 19:11:54 +01:00
|
|
|
dict_table_t* table; /*!< back pointer to table */
|
|
|
|
unsigned page:32;/*!< index tree root page number */
|
2016-08-12 11:17:45 +03:00
|
|
|
unsigned merge_threshold:6;
|
|
|
|
/*!< In the pessimistic delete, if the page
|
|
|
|
data size drops below this limit in percent,
|
|
|
|
merging it to a neighbor is tried */
|
|
|
|
# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
|
2014-02-26 19:11:54 +01:00
|
|
|
unsigned type:DICT_IT_BITS;
|
|
|
|
/*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
|
2016-08-12 11:17:45 +03:00
|
|
|
DICT_IBUF, DICT_CORRUPT) */
|
2014-02-26 19:11:54 +01:00
|
|
|
#define MAX_KEY_LENGTH_BITS 12
|
|
|
|
unsigned trx_id_offset:MAX_KEY_LENGTH_BITS;
|
|
|
|
/*!< position of the trx id column
|
|
|
|
in a clustered index record, if the fields
|
|
|
|
before it are known to be of a fixed size,
|
|
|
|
0 otherwise */
|
2018-04-26 16:33:05 +03:00
|
|
|
#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
|
|
|
|
# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
|
2014-02-26 19:11:54 +01:00
|
|
|
#endif
|
|
|
|
unsigned n_user_defined_cols:10;
|
|
|
|
/*!< number of columns the user defined to
|
|
|
|
be in the index: in the internal
|
|
|
|
representation we add more columns */
|
2016-08-12 11:17:45 +03:00
|
|
|
unsigned nulls_equal:1;
|
|
|
|
/*!< if true, SQL NULL == SQL NULL */
|
2017-02-23 23:05:12 +02:00
|
|
|
#ifdef BTR_CUR_HASH_ADAPT
|
2016-12-05 21:04:30 +02:00
|
|
|
#ifdef MYSQL_INDEX_DISABLE_AHI
|
|
|
|
unsigned disable_ahi:1;
|
|
|
|
/*!< whether to disable the
|
|
|
|
adaptive hash index.
|
|
|
|
Maybe this could be disabled for
|
|
|
|
temporary tables? */
|
|
|
|
#endif
|
2017-02-23 23:05:12 +02:00
|
|
|
#endif /* BTR_CUR_HASH_ADAPT */
|
2014-02-26 19:11:54 +01:00
|
|
|
unsigned n_uniq:10;/*!< number of fields from the beginning
|
|
|
|
which are enough to determine an index
|
|
|
|
entry uniquely */
|
|
|
|
unsigned n_def:10;/*!< number of fields defined so far */
|
|
|
|
unsigned n_fields:10;/*!< number of fields in the index */
|
|
|
|
unsigned n_nullable:10;/*!< number of nullable fields */
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
unsigned n_core_fields:10;/*!< number of fields in the index
|
|
|
|
(before the first time of instant add columns) */
|
|
|
|
/** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer
|
|
|
|
records; usually equal to UT_BITS_IN_BYTES(n_nullable), but
|
|
|
|
can be less in clustered indexes with instant ADD COLUMN */
|
|
|
|
unsigned n_core_null_bytes:8;
|
|
|
|
/** magic value signalling that n_core_null_bytes was not
|
|
|
|
initialized yet */
|
|
|
|
static const unsigned NO_CORE_NULL_BYTES = 0xff;
|
2017-10-13 21:59:15 +03:00
|
|
|
/** The clustered index ID of the hard-coded SYS_INDEXES table. */
|
|
|
|
static const unsigned DICT_INDEXES_ID = 3;
|
2014-02-26 19:11:54 +01:00
|
|
|
unsigned cached:1;/*!< TRUE if the index object is in the
|
|
|
|
dictionary cache */
|
|
|
|
unsigned to_be_dropped:1;
|
|
|
|
/*!< TRUE if the index is to be dropped;
|
2019-05-17 15:17:37 +03:00
|
|
|
protected by dict_sys.latch */
|
2014-02-26 19:11:54 +01:00
|
|
|
unsigned online_status:2;
|
|
|
|
/*!< enum online_index_status.
|
|
|
|
Transitions from ONLINE_INDEX_COMPLETE (to
|
|
|
|
ONLINE_INDEX_CREATION) are protected
|
2019-05-17 15:17:37 +03:00
|
|
|
by dict_sys.latch and
|
2019-05-17 14:32:53 +03:00
|
|
|
dict_sys.mutex. Other changes are
|
2014-02-26 19:11:54 +01:00
|
|
|
protected by index->lock. */
|
2016-08-12 11:17:45 +03:00
|
|
|
unsigned uncommitted:1;
|
|
|
|
/*!< a flag that is set for secondary indexes
|
|
|
|
that have not been committed to the
|
|
|
|
data dictionary yet */
|
|
|
|
|
|
|
|
#ifdef UNIV_DEBUG
|
2017-09-19 19:20:11 +03:00
|
|
|
/** whether this is a dummy index object */
|
|
|
|
bool is_dummy;
|
2019-04-01 16:11:41 +03:00
|
|
|
/** whether btr_cur_instant_init() is in progress */
|
|
|
|
bool in_instant_init;
|
2016-08-12 11:17:45 +03:00
|
|
|
uint32_t magic_n;/*!< magic number */
|
|
|
|
/** Value of dict_index_t::magic_n */
|
|
|
|
# define DICT_INDEX_MAGIC_N 76789786
|
|
|
|
#endif
|
2014-02-26 19:11:54 +01:00
|
|
|
dict_field_t* fields; /*!< array of field descriptions */
|
2016-08-12 11:17:45 +03:00
|
|
|
st_mysql_ftparser*
|
|
|
|
parser; /*!< fulltext parser plugin */
|
2016-09-06 09:43:16 +03:00
|
|
|
bool has_new_v_col;
|
|
|
|
/*!< whether it has a newly added virtual
|
|
|
|
column in ALTER */
|
2017-10-25 21:35:33 +03:00
|
|
|
bool index_fts_syncing;/*!< Whether the fts index is
|
2017-10-27 10:24:02 +03:00
|
|
|
still syncing in the background;
|
|
|
|
FIXME: remove this and use MDL */
|
2014-02-26 19:11:54 +01:00
|
|
|
UT_LIST_NODE_T(dict_index_t)
|
|
|
|
indexes;/*!< list of indexes of the table */
|
2017-02-23 23:05:12 +02:00
|
|
|
#ifdef BTR_CUR_ADAPT
|
2014-02-26 19:11:54 +01:00
|
|
|
btr_search_t* search_info;
|
|
|
|
/*!< info used in optimistic searches */
|
2017-02-23 23:05:12 +02:00
|
|
|
#endif /* BTR_CUR_ADAPT */
|
2014-02-26 19:11:54 +01:00
|
|
|
row_log_t* online_log;
|
|
|
|
/*!< the log of modifications
|
|
|
|
during online index creation;
|
|
|
|
valid when online_status is
|
|
|
|
ONLINE_INDEX_CREATION */
|
|
|
|
/*----------------------*/
|
|
|
|
/** Statistics for query optimization */
|
|
|
|
/* @{ */
|
|
|
|
ib_uint64_t* stat_n_diff_key_vals;
|
|
|
|
/*!< approximate number of different
|
|
|
|
key values for this index, for each
|
|
|
|
n-column prefix where 1 <= n <=
|
|
|
|
dict_get_n_unique(index) (the array is
|
|
|
|
indexed from 0 to n_uniq-1); we
|
|
|
|
periodically calculate new
|
|
|
|
estimates */
|
|
|
|
ib_uint64_t* stat_n_sample_sizes;
|
|
|
|
/*!< number of pages that were sampled
|
|
|
|
to calculate each of stat_n_diff_key_vals[],
|
|
|
|
e.g. stat_n_sample_sizes[3] pages were sampled
|
|
|
|
to get the number stat_n_diff_key_vals[3]. */
|
|
|
|
ib_uint64_t* stat_n_non_null_key_vals;
|
|
|
|
/* approximate number of non-null key values
|
|
|
|
for this index, for each column where
|
|
|
|
1 <= n <= dict_get_n_unique(index) (the array
|
|
|
|
is indexed from 0 to n_uniq-1); This
|
|
|
|
is used when innodb_stats_method is
|
|
|
|
"nulls_ignored". */
|
|
|
|
ulint stat_index_size;
|
|
|
|
/*!< approximate index size in
|
|
|
|
database pages */
|
|
|
|
ulint stat_n_leaf_pages;
|
|
|
|
/*!< approximate number of leaf pages in the
|
|
|
|
index tree */
|
2014-07-22 19:31:45 +03:00
|
|
|
bool stats_error_printed;
|
|
|
|
/*!< has persistent statistics error printed
|
|
|
|
for this index ? */
|
2014-02-26 19:11:54 +01:00
|
|
|
/* @} */
|
2014-08-06 15:28:58 +03:00
|
|
|
/** Statistics for defragmentation, these numbers are estimations and
|
|
|
|
could be very inaccurate at certain times, e.g. right after restart,
|
|
|
|
during defragmentation, etc. */
|
|
|
|
/* @{ */
|
|
|
|
ulint stat_defrag_modified_counter;
|
|
|
|
ulint stat_defrag_n_pages_freed;
|
|
|
|
/* number of pages freed by defragmentation. */
|
|
|
|
ulint stat_defrag_n_page_split;
|
|
|
|
/* number of page splits since last full index
|
|
|
|
defragmentation. */
|
|
|
|
ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
|
|
|
|
/* data size when compression failure happened
|
|
|
|
the most recent 10 times. */
|
|
|
|
ulint stat_defrag_sample_next_slot;
|
|
|
|
/* in which slot the next sample should be
|
|
|
|
saved. */
|
|
|
|
/* @} */
|
2016-08-12 11:17:45 +03:00
|
|
|
rtr_ssn_t rtr_ssn;/*!< Node sequence number for RTree */
|
|
|
|
rtr_info_track_t*
|
|
|
|
rtr_track;/*!< tracking all R-Tree search cursors */
|
2014-02-26 19:11:54 +01:00
|
|
|
trx_id_t trx_id; /*!< id of the transaction that created this
|
|
|
|
index, or 0 if the index existed
|
|
|
|
when InnoDB was started up */
|
|
|
|
zip_pad_info_t zip_pad;/*!< Information about state of
|
|
|
|
compression failures and successes */
|
2016-08-12 11:17:45 +03:00
|
|
|
rw_lock_t lock; /*!< read-write lock protecting the
|
|
|
|
upper levels of the index tree */
|
|
|
|
|
|
|
|
/** Determine if the index has been committed to the
|
|
|
|
data dictionary.
|
|
|
|
@return whether the index definition has been committed */
|
|
|
|
bool is_committed() const
|
|
|
|
{
|
|
|
|
ut_ad(!uncommitted || !(type & DICT_CLUSTERED));
|
|
|
|
return(UNIV_LIKELY(!uncommitted));
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Flag an index committed or uncommitted.
|
|
|
|
@param[in] committed whether the index is committed */
|
|
|
|
void set_committed(bool committed)
|
|
|
|
{
|
|
|
|
ut_ad(!to_be_dropped);
|
|
|
|
ut_ad(committed || !(type & DICT_CLUSTERED));
|
|
|
|
uncommitted = !committed;
|
|
|
|
}
|
MDEV-12253: Buffer pool blocks are accessed after they have been freed
Problem was that bpage was referenced after it was already freed
from LRU. Fixed by adding a new variable encrypted that is
passed down to buf_page_check_corrupt() and used in
buf_page_get_gen() to stop processing page read.
This patch should also address following test failures and
bugs:
MDEV-12419: IMPORT should not look up tablespace in
PageConverter::validate(). This is now removed.
MDEV-10099: encryption.innodb_onlinealter_encryption fails
sporadically in buildbot
MDEV-11420: encryption.innodb_encryption-page-compression
failed in buildbot
MDEV-11222: encryption.encrypt_and_grep failed in buildbot on P8
Removed dict_table_t::is_encrypted and dict_table_t::ibd_file_missing
and replaced these with dict_table_t::file_unreadable. Table
ibd file is missing if fil_get_space(space_id) returns NULL
and encrypted if not. Removed dict_table_t::is_corrupted field.
Ported FilSpace class from 10.2 and using that on buf_page_check_corrupt(),
buf_page_decrypt_after_read(), buf_page_encrypt_before_write(),
buf_dblwr_process(), buf_read_page(), dict_stats_save_defrag_stats().
Added test cases when enrypted page could be read while doing
redo log crash recovery. Also added test case for row compressed
blobs.
btr_cur_open_at_index_side_func(),
btr_cur_open_at_rnd_pos_func(): Avoid referencing block that is
NULL.
buf_page_get_zip(): Issue error if page read fails.
buf_page_get_gen(): Use dberr_t for error detection and
do not reference bpage after we hare freed it.
buf_mark_space_corrupt(): remove bpage from LRU also when
it is encrypted.
buf_page_check_corrupt(): @return DB_SUCCESS if page has
been read and is not corrupted,
DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match. In read
case only DB_SUCCESS is possible.
buf_page_io_complete(): use dberr_t for error handling.
buf_flush_write_block_low(),
buf_read_ahead_random(),
buf_read_page_async(),
buf_read_ahead_linear(),
buf_read_ibuf_merge_pages(),
buf_read_recv_pages(),
fil_aio_wait():
Issue error if page read fails.
btr_pcur_move_to_next_page(): Do not reference page if it is
NULL.
Introduced dict_table_t::is_readable() and dict_index_t::is_readable()
that will return true if tablespace exists and pages read from
tablespace are not corrupted or page decryption failed.
Removed buf_page_t::key_version. After page decryption the
key version is not removed from page frame. For unencrypted
pages, old key_version is removed at buf_page_encrypt_before_write()
dict_stats_update_transient_for_index(),
dict_stats_update_transient()
Do not continue if table decryption failed or table
is corrupted.
dict0stats.cc: Introduced a dict_stats_report_error function
to avoid code duplication.
fil_parse_write_crypt_data():
Check that key read from redo log entry is found from
encryption plugin and if it is not, refuse to start.
PageConverter::validate(): Removed access to fil_space_t as
tablespace is not available during import.
Fixed error code on innodb.innodb test.
Merged test cased innodb-bad-key-change5 and innodb-bad-key-shutdown
to innodb-bad-key-change2. Removed innodb-bad-key-change5 test.
Decreased unnecessary complexity on some long lasting tests.
Removed fil_inc_pending_ops(), fil_decr_pending_ops(),
fil_get_first_space(), fil_get_next_space(),
fil_get_first_space_safe(), fil_get_next_space_safe()
functions.
fil_space_verify_crypt_checksum(): Fixed bug found using ASAN
where FIL_PAGE_END_LSN_OLD_CHECKSUM field was incorrectly
accessed from row compressed tables. Fixed out of page frame
bug for row compressed tables in
fil_space_verify_crypt_checksum() found using ASAN. Incorrect
function was called for compressed table.
Added new tests for discard, rename table and drop (we should allow them
even when page decryption fails). Alter table rename is not allowed.
Added test for restart with innodb-force-recovery=1 when page read on
redo-recovery cant be decrypted. Added test for corrupted table where
both page data and FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is corrupted.
Adjusted the test case innodb_bug14147491 so that it does not anymore
expect crash. Instead table is just mostly not usable.
fil0fil.h: fil_space_acquire_low is not visible function
and fil_space_acquire and fil_space_acquire_silent are
inline functions. FilSpace class uses fil_space_acquire_low
directly.
recv_apply_hashed_log_recs() does not return anything.
2017-04-26 15:19:16 +03:00
|
|
|
|
2018-03-22 15:30:54 +02:00
|
|
|
/** Notify that the index pages are going to be modified.
|
|
|
|
@param[in,out] mtr mini-transaction */
|
|
|
|
inline void set_modified(mtr_t& mtr) const;
|
|
|
|
|
2017-05-05 10:25:29 +03:00
|
|
|
/** @return whether this index is readable
|
|
|
|
@retval true normally
|
|
|
|
@retval false if this is a single-table tablespace
|
|
|
|
and the .ibd file is missing, or a
|
|
|
|
page cannot be read or decrypted */
|
|
|
|
inline bool is_readable() const;
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** @return whether instant ALTER TABLE is in effect */
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
inline bool is_instant() const;
|
|
|
|
|
MDEV-14407 Assertion failure during rollback
Rollback attempted to dereference DB_ROLL_PTR=0, which cannot possibly
be a valid undo log pointer. A safer canonical value would be
roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS
which is what was chosen in MDEV-12288, corresponding to reset_trx_id.
No deterministic test case for the bug was found. The simplest test
cases may be related to MDEV-11415, which suppresses undo logging for
ALGORITHM=COPY operations. In those operations, in the spirit of
MDEV-12288, we should actually have written reset_trx_id instead of
using the transaction identifier of the current transaction
(and a bogus value of DB_ROLL_PTR=0). However, thanks to MySQL Bug#28432
which I had fixed in MySQL 5.6.8 as part of WL#6255, access to the
rebuilt table by earlier-started transactions should actually have been
refused with ER_TABLE_DEF_CHANGED.
reset_trx_id: Move the definition to data0type.cc and the declaration
to data0type.h.
btr_cur_ins_lock_and_undo(): When undo logging is disabled, use the
safe value that corresponds to reset_trx_id.
btr_cur_optimistic_insert(): Validate the DB_TRX_ID,DB_ROLL_PTR before
inserting into a clustered index leaf page.
ins_node_t::sys_buf[]: Replaces row_id_buf and trx_id_buf and some
heap usage.
row_ins_alloc_sys_fields(): Init ins_node_t::sys_buf[] to reset_trx_id.
row_ins_buf(): Only if undo logging is enabled, copy trx->id
to node->sys_buf. Otherwise, rely on the initialization in
row_ins_alloc_sys_fields().
row_purge_reset_trx_id(): Invoke mlog_write_string() with reset_trx_id
directly. (No functional change.)
trx_undo_page_report_modify(): Assert that the DB_ROLL_PTR is not 0.
trx_undo_get_undo_rec_low(): Assert that the roll_ptr is valid before
trying to dereference it.
dict_index_t::is_primary(): Check if the index is the primary key.
PageConverter::adjust_cluster_record(): Fix
MDEV-15249 Crash in MVCC read after IMPORT TABLESPACE
by resetting the system fields to reset_trx_id instead of writing
the current transaction ID (which will be committed at the
end of the IMPORT TABLESPACE) and DB_ROLL_PTR=0.
This can partially be viewed as a follow-up fix of MDEV-12288,
because IMPORT should already then have written
DB_TRX_ID=0 and DB_ROLL_PTR=1<<55 to prevent unnecessary
DB_TRX_ID lookups in subsequent accesses to the table.
2018-02-07 16:44:46 +02:00
|
|
|
/** @return whether the index is the primary key index
|
|
|
|
(not the clustered index of the change buffer) */
|
|
|
|
bool is_primary() const
|
|
|
|
{
|
|
|
|
return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
|
|
|
|
}
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
2018-11-16 20:10:33 +02:00
|
|
|
/** @return whether this is a generated clustered index */
|
|
|
|
bool is_gen_clust() const { return type == DICT_CLUSTERED; }
|
|
|
|
|
|
|
|
/** @return whether this is a clustered index */
|
|
|
|
bool is_clust() const { return type & DICT_CLUSTERED; }
|
|
|
|
|
|
|
|
/** @return whether this is a unique index */
|
|
|
|
bool is_unique() const { return type & DICT_UNIQUE; }
|
|
|
|
|
|
|
|
/** @return whether this is a spatial index */
|
|
|
|
bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
|
|
|
|
|
|
|
|
/** @return whether this is the change buffer */
|
|
|
|
bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
|
|
|
|
|
2018-06-04 15:55:37 +03:00
|
|
|
/** @return whether the index includes virtual columns */
|
|
|
|
bool has_virtual() const { return type & DICT_VIRTUAL; }
|
|
|
|
|
2018-11-16 19:36:04 +02:00
|
|
|
/** @return the position of DB_TRX_ID */
|
|
|
|
unsigned db_trx_id() const {
|
|
|
|
DBUG_ASSERT(is_primary());
|
|
|
|
DBUG_ASSERT(n_uniq);
|
|
|
|
DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
|
|
|
|
return n_uniq;
|
|
|
|
}
|
|
|
|
/** @return the position of DB_ROLL_PTR */
|
|
|
|
unsigned db_roll_ptr() const { return db_trx_id() + 1; }
|
|
|
|
|
2018-10-10 11:53:23 +03:00
|
|
|
/** @return the offset of the metadata BLOB field,
|
|
|
|
or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
|
2018-11-16 19:36:04 +02:00
|
|
|
unsigned first_user_field() const { return db_trx_id() + 2; }
|
2018-10-10 11:53:23 +03:00
|
|
|
|
2018-05-07 15:30:57 +03:00
|
|
|
/** @return whether the index is corrupted */
|
|
|
|
inline bool is_corrupted() const;
|
2018-05-07 15:50:38 +03:00
|
|
|
|
2018-06-04 15:55:37 +03:00
|
|
|
/** Detach the columns from the index that is to be freed. */
|
|
|
|
void detach_columns()
|
|
|
|
{
|
|
|
|
if (has_virtual()) {
|
|
|
|
for (unsigned i = 0; i < n_fields; i++) {
|
|
|
|
fields[i].col->detach(*this);
|
|
|
|
}
|
|
|
|
|
|
|
|
n_fields = 0;
|
|
|
|
}
|
|
|
|
}
|
2018-06-04 16:12:00 +03:00
|
|
|
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
/** Determine how many fields of a given prefix can be set NULL.
|
|
|
|
@param[in] n_prefix number of fields in the prefix
|
|
|
|
@return number of fields 0..n_prefix-1 that can be set NULL */
|
|
|
|
unsigned get_n_nullable(ulint n_prefix) const
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(n_prefix > 0);
|
|
|
|
DBUG_ASSERT(n_prefix <= n_fields);
|
|
|
|
unsigned n = n_nullable;
|
|
|
|
for (; n_prefix < n_fields; n_prefix++) {
|
|
|
|
const dict_col_t* col = fields[n_prefix].col;
|
|
|
|
DBUG_ASSERT(!col->is_virtual());
|
|
|
|
n -= col->is_nullable();
|
|
|
|
}
|
|
|
|
DBUG_ASSERT(n < n_def);
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Get the default value of an instantly-added clustered index field.
|
|
|
|
@param[in] n instantly added field position
|
|
|
|
@param[out] len value length (in bytes), or UNIV_SQL_NULL
|
|
|
|
@return default value
|
|
|
|
@retval NULL if the default value is SQL NULL (len=UNIV_SQL_NULL) */
|
2018-02-06 12:55:58 +00:00
|
|
|
const byte* instant_field_value(ulint n, ulint* len) const
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
{
|
2017-10-13 21:59:15 +03:00
|
|
|
DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID);
|
|
|
|
DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields);
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
DBUG_ASSERT(n < n_fields);
|
|
|
|
return fields[n].col->instant_value(len);
|
|
|
|
}
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
|
|
|
|
@param[in] clustered index definition after instant ALTER TABLE */
|
|
|
|
inline void instant_add_field(const dict_index_t& instant);
|
|
|
|
/** Remove instant ADD COLUMN metadata. */
|
|
|
|
inline void clear_instant_add();
|
|
|
|
/** Remove instant ALTER TABLE metadata. */
|
|
|
|
inline void clear_instant_alter();
|
|
|
|
|
|
|
|
/** Construct the metadata record for instant ALTER TABLE.
|
|
|
|
@param[in] row dummy or default values for existing columns
|
|
|
|
@param[in,out] heap memory heap for allocations
|
|
|
|
@return metadata record */
|
|
|
|
inline dtuple_t*
|
|
|
|
instant_metadata(const dtuple_t& row, mem_heap_t* heap) const;
|
2017-12-18 19:03:51 +03:00
|
|
|
|
|
|
|
/** Check if record in clustered index is historical row.
|
|
|
|
@param[in] rec clustered row
|
|
|
|
@param[in] offsets offsets
|
|
|
|
@return true if row is historical */
|
|
|
|
bool
|
|
|
|
vers_history_row(const rec_t* rec, const ulint* offsets);
|
|
|
|
|
|
|
|
/** Check if record in secondary index is historical row.
|
|
|
|
@param[in] rec record in a secondary index
|
|
|
|
@param[out] history_row true if row is historical
|
|
|
|
@return true on error */
|
|
|
|
bool
|
|
|
|
vers_history_row(const rec_t* rec, bool &history_row);
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
|
|
|
|
/** Reconstruct the clustered index fields. */
|
|
|
|
inline void reconstruct_fields();
|
2019-01-15 11:41:30 +02:00
|
|
|
|
|
|
|
/** Check if the index contains a column or a prefix of that column.
|
|
|
|
@param[in] n column number
|
|
|
|
@param[in] is_virtual whether it is a virtual col
|
|
|
|
@return whether the index contains the column or its prefix */
|
|
|
|
bool contains_col_or_prefix(ulint n, bool is_virtual) const
|
|
|
|
MY_ATTRIBUTE((warn_unused_result));
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
2018-06-04 15:55:37 +03:00
|
|
|
/** Detach a column from an index.
|
|
|
|
@param[in] index index to be detached from */
|
|
|
|
inline void dict_col_t::detach(const dict_index_t& index)
|
|
|
|
{
|
2019-05-27 19:45:44 +03:00
|
|
|
if (is_virtual()) {
|
|
|
|
reinterpret_cast<dict_v_col_t*>(this)->detach(index);
|
2018-06-04 15:55:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** The status of online index creation */
|
|
|
|
enum online_index_status {
|
|
|
|
/** the index is complete and ready for access */
|
|
|
|
ONLINE_INDEX_COMPLETE = 0,
|
|
|
|
/** the index is being created, online
|
|
|
|
(allowing concurrent modifications) */
|
|
|
|
ONLINE_INDEX_CREATION,
|
|
|
|
/** secondary index creation was aborted and the index
|
|
|
|
should be dropped as soon as index->table->n_ref_count reaches 0,
|
|
|
|
or online table rebuild was aborted and the clustered index
|
|
|
|
of the original table should soon be restored to
|
|
|
|
ONLINE_INDEX_COMPLETE */
|
|
|
|
ONLINE_INDEX_ABORTED,
|
|
|
|
/** the online index creation was aborted, the index was
|
|
|
|
dropped from the data dictionary and the tablespace, and it
|
|
|
|
should be dropped from the data dictionary cache as soon as
|
|
|
|
index->table->n_ref_count reaches 0. */
|
|
|
|
ONLINE_INDEX_ABORTED_DROPPED
|
|
|
|
};
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** Set to store the virtual columns which are affected by Foreign
|
|
|
|
key constraint. */
|
|
|
|
typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>,
|
|
|
|
ut_allocator<dict_v_col_t*> > dict_vcol_set;
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Data structure for a foreign key constraint; an example:
|
|
|
|
FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
|
|
|
|
initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
|
|
|
|
struct dict_foreign_t{
|
|
|
|
mem_heap_t* heap; /*!< this object is allocated from
|
|
|
|
this memory heap */
|
|
|
|
char* id; /*!< id of the constraint as a
|
|
|
|
null-terminated string */
|
|
|
|
unsigned n_fields:10; /*!< number of indexes' first fields
|
|
|
|
for which the foreign key
|
|
|
|
constraint is defined: we allow the
|
|
|
|
indexes to contain more fields than
|
|
|
|
mentioned in the constraint, as long
|
|
|
|
as the first fields are as mentioned */
|
|
|
|
unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
|
|
|
|
or DICT_FOREIGN_ON_DELETE_SET_NULL */
|
|
|
|
char* foreign_table_name;/*!< foreign table name */
|
|
|
|
char* foreign_table_name_lookup;
|
|
|
|
/*!< foreign table name used for dict lookup */
|
|
|
|
dict_table_t* foreign_table; /*!< table where the foreign key is */
|
|
|
|
const char** foreign_col_names;/*!< names of the columns in the
|
|
|
|
foreign key */
|
|
|
|
char* referenced_table_name;/*!< referenced table name */
|
|
|
|
char* referenced_table_name_lookup;
|
|
|
|
/*!< referenced table name for dict lookup*/
|
|
|
|
dict_table_t* referenced_table;/*!< table where the referenced key
|
|
|
|
is */
|
|
|
|
const char** referenced_col_names;/*!< names of the referenced
|
|
|
|
columns in the referenced table */
|
|
|
|
dict_index_t* foreign_index; /*!< foreign index; we require that
|
|
|
|
both tables contain explicitly defined
|
|
|
|
indexes for the constraint: InnoDB
|
|
|
|
does not generate new indexes
|
|
|
|
implicitly */
|
|
|
|
dict_index_t* referenced_index;/*!< referenced index */
|
2016-09-06 09:43:16 +03:00
|
|
|
|
|
|
|
dict_vcol_set* v_cols; /*!< set of virtual columns affected
|
|
|
|
by foreign key constraint. */
|
2014-09-11 10:13:35 +02:00
|
|
|
};
|
|
|
|
|
2014-11-18 17:41:12 +01:00
|
|
|
std::ostream&
|
|
|
|
operator<< (std::ostream& out, const dict_foreign_t& foreign);
|
|
|
|
|
|
|
|
struct dict_foreign_print {
|
|
|
|
|
|
|
|
dict_foreign_print(std::ostream& out)
|
|
|
|
: m_out(out)
|
|
|
|
{}
|
|
|
|
|
|
|
|
void operator()(const dict_foreign_t* foreign) {
|
|
|
|
m_out << *foreign;
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
std::ostream& m_out;
|
|
|
|
};
|
|
|
|
|
2014-09-11 10:13:35 +02:00
|
|
|
/** Compare two dict_foreign_t objects using their ids. Used in the ordering
|
|
|
|
of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns
|
|
|
|
true if the first argument is considered to go before the second in the
|
|
|
|
strict weak ordering it defines, and false otherwise. */
|
|
|
|
struct dict_foreign_compare {
|
|
|
|
|
|
|
|
bool operator()(
|
|
|
|
const dict_foreign_t* lhs,
|
|
|
|
const dict_foreign_t* rhs) const
|
|
|
|
{
|
|
|
|
return(ut_strcmp(lhs->id, rhs->id) < 0);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/** A function object to find a foreign key with the given index as the
|
|
|
|
referenced index. Return the foreign key with matching criteria or NULL */
|
|
|
|
struct dict_foreign_with_index {
|
|
|
|
|
|
|
|
dict_foreign_with_index(const dict_index_t* index)
|
|
|
|
: m_index(index)
|
|
|
|
{}
|
|
|
|
|
|
|
|
bool operator()(const dict_foreign_t* foreign) const
|
|
|
|
{
|
|
|
|
return(foreign->referenced_index == m_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
const dict_index_t* m_index;
|
|
|
|
};
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
#ifdef WITH_WSREP
|
|
|
|
/** A function object to find a foreign key with the given index as the
|
|
|
|
foreign index. Return the foreign key with matching criteria or NULL */
|
|
|
|
struct dict_foreign_with_foreign_index {
|
|
|
|
|
|
|
|
dict_foreign_with_foreign_index(const dict_index_t* index)
|
|
|
|
: m_index(index)
|
|
|
|
{}
|
|
|
|
|
|
|
|
bool operator()(const dict_foreign_t* foreign) const
|
|
|
|
{
|
|
|
|
return(foreign->foreign_index == m_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
const dict_index_t* m_index;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
2014-09-11 10:13:35 +02:00
|
|
|
/* A function object to check if the foreign constraint is between different
|
|
|
|
tables. Returns true if foreign key constraint is between different tables,
|
|
|
|
false otherwise. */
|
|
|
|
struct dict_foreign_different_tables {
|
|
|
|
|
|
|
|
bool operator()(const dict_foreign_t* foreign) const
|
|
|
|
{
|
|
|
|
return(foreign->foreign_table != foreign->referenced_table);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/** A function object to check if the foreign key constraint has the same
|
|
|
|
name as given. If the full name of the foreign key constraint doesn't match,
|
|
|
|
then, check if removing the database name from the foreign key constraint
|
|
|
|
matches. Return true if it matches, false otherwise. */
|
|
|
|
struct dict_foreign_matches_id {
|
|
|
|
|
|
|
|
dict_foreign_matches_id(const char* id)
|
|
|
|
: m_id(id)
|
|
|
|
{}
|
|
|
|
|
|
|
|
bool operator()(const dict_foreign_t* foreign) const
|
|
|
|
{
|
|
|
|
if (0 == innobase_strcasecmp(foreign->id, m_id)) {
|
|
|
|
return(true);
|
|
|
|
}
|
|
|
|
if (const char* pos = strchr(foreign->id, '/')) {
|
|
|
|
if (0 == innobase_strcasecmp(m_id, pos + 1)) {
|
|
|
|
return(true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* m_id;
|
|
|
|
};
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
typedef std::set<
|
|
|
|
dict_foreign_t*,
|
|
|
|
dict_foreign_compare,
|
|
|
|
ut_allocator<dict_foreign_t*> > dict_foreign_set;
|
2014-09-11 10:13:35 +02:00
|
|
|
|
2014-11-18 17:41:12 +01:00
|
|
|
std::ostream&
|
|
|
|
operator<< (std::ostream& out, const dict_foreign_set& fk_set);
|
|
|
|
|
|
|
|
/** Function object to check if a foreign key object is there
|
|
|
|
in the given foreign key set or not. It returns true if the
|
|
|
|
foreign key is not found, false otherwise */
|
|
|
|
struct dict_foreign_not_exists {
|
|
|
|
dict_foreign_not_exists(const dict_foreign_set& obj_)
|
|
|
|
: m_foreigns(obj_)
|
|
|
|
{}
|
|
|
|
|
|
|
|
/* Return true if the given foreign key is not found */
|
|
|
|
bool operator()(dict_foreign_t* const & foreign) const {
|
|
|
|
return(m_foreigns.find(foreign) == m_foreigns.end());
|
|
|
|
}
|
|
|
|
private:
|
|
|
|
const dict_foreign_set& m_foreigns;
|
|
|
|
};
|
|
|
|
|
|
|
|
/** Validate the search order in the foreign key set.
|
|
|
|
@param[in] fk_set the foreign key set to be validated
|
|
|
|
@return true if search order is fine in the set, false otherwise. */
|
|
|
|
bool
|
|
|
|
dict_foreign_set_validate(
|
|
|
|
const dict_foreign_set& fk_set);
|
|
|
|
|
|
|
|
/** Validate the search order in the foreign key sets of the table
|
|
|
|
(foreign_set and referenced_set).
|
|
|
|
@param[in] table table whose foreign key sets are to be validated
|
|
|
|
@return true if foreign key sets are fine, false otherwise. */
|
|
|
|
bool
|
|
|
|
dict_foreign_set_validate(
|
|
|
|
const dict_table_t& table);
|
|
|
|
|
2014-09-11 10:13:35 +02:00
|
|
|
/*********************************************************************//**
|
|
|
|
Frees a foreign key struct. */
|
|
|
|
inline
|
|
|
|
void
|
|
|
|
dict_foreign_free(
|
|
|
|
/*==============*/
|
|
|
|
dict_foreign_t* foreign) /*!< in, own: foreign key struct */
|
|
|
|
{
|
2016-09-06 09:43:16 +03:00
|
|
|
if (foreign->v_cols != NULL) {
|
|
|
|
UT_DELETE(foreign->v_cols);
|
|
|
|
}
|
|
|
|
|
2014-09-11 10:13:35 +02:00
|
|
|
mem_heap_free(foreign->heap);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** The destructor will free all the foreign key constraints in the set
|
|
|
|
by calling dict_foreign_free() on each of the foreign key constraints.
|
|
|
|
This is used to free the allocated memory when a local set goes out
|
|
|
|
of scope. */
|
|
|
|
struct dict_foreign_set_free {
|
|
|
|
|
|
|
|
dict_foreign_set_free(const dict_foreign_set& foreign_set)
|
|
|
|
: m_foreign_set(foreign_set)
|
|
|
|
{}
|
|
|
|
|
|
|
|
~dict_foreign_set_free()
|
|
|
|
{
|
|
|
|
std::for_each(m_foreign_set.begin(),
|
|
|
|
m_foreign_set.end(),
|
|
|
|
dict_foreign_free);
|
|
|
|
}
|
|
|
|
|
|
|
|
const dict_foreign_set& m_foreign_set;
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
|
|
|
|
a foreign key constraint is enforced, therefore RESTRICT just means no flag */
|
|
|
|
/* @{ */
|
2017-03-01 08:27:39 +02:00
|
|
|
#define DICT_FOREIGN_ON_DELETE_CASCADE 1U /*!< ON DELETE CASCADE */
|
|
|
|
#define DICT_FOREIGN_ON_DELETE_SET_NULL 2U /*!< ON UPDATE SET NULL */
|
|
|
|
#define DICT_FOREIGN_ON_UPDATE_CASCADE 4U /*!< ON DELETE CASCADE */
|
|
|
|
#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8U /*!< ON UPDATE SET NULL */
|
|
|
|
#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U /*!< ON DELETE NO ACTION */
|
|
|
|
#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U /*!< ON UPDATE NO ACTION */
|
2014-02-26 19:11:54 +01:00
|
|
|
/* @} */
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Display an identifier.
|
|
|
|
@param[in,out] s output stream
|
|
|
|
@param[in] id_name SQL identifier (other than table name)
|
|
|
|
@return the output stream */
|
|
|
|
std::ostream&
|
|
|
|
operator<<(
|
|
|
|
std::ostream& s,
|
|
|
|
const id_name_t& id_name);
|
|
|
|
|
|
|
|
/** Display a table name.
|
|
|
|
@param[in,out] s output stream
|
|
|
|
@param[in] table_name table name
|
|
|
|
@return the output stream */
|
|
|
|
std::ostream&
|
|
|
|
operator<<(
|
|
|
|
std::ostream& s,
|
|
|
|
const table_name_t& table_name);
|
|
|
|
|
|
|
|
/** List of locks that different transactions have acquired on a table. This
|
|
|
|
list has a list node that is embedded in a nested union/structure. We have to
|
|
|
|
generate a specific template for it. */
|
|
|
|
|
|
|
|
typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*>
|
|
|
|
table_lock_list_t;
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** mysql template structure defined in row0mysql.cc */
|
|
|
|
struct mysql_row_templ_t;
|
|
|
|
|
|
|
|
/** Structure defines template related to virtual columns and
|
|
|
|
their base columns */
|
|
|
|
struct dict_vcol_templ_t {
|
|
|
|
/** number of regular columns */
|
|
|
|
ulint n_col;
|
|
|
|
|
|
|
|
/** number of virtual columns */
|
|
|
|
ulint n_v_col;
|
|
|
|
|
|
|
|
/** array of templates for virtual col and their base columns */
|
|
|
|
mysql_row_templ_t** vtempl;
|
|
|
|
|
|
|
|
/** table's database name */
|
|
|
|
std::string db_name;
|
|
|
|
|
|
|
|
/** table name */
|
|
|
|
std::string tb_name;
|
|
|
|
|
|
|
|
/** MySQL record length */
|
|
|
|
ulint rec_len;
|
|
|
|
|
|
|
|
/** default column value if any */
|
|
|
|
byte* default_rec;
|
2016-11-07 22:35:02 +01:00
|
|
|
|
|
|
|
/** cached MySQL TABLE object */
|
|
|
|
TABLE* mysql_table;
|
|
|
|
|
|
|
|
/** when mysql_table was cached */
|
|
|
|
uint64_t mysql_table_query_id;
|
|
|
|
|
2017-03-01 08:27:39 +02:00
|
|
|
dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {}
|
2016-09-06 09:43:16 +03:00
|
|
|
};
|
|
|
|
|
2018-11-27 22:25:11 +03:00
|
|
|
/** Metadata on clustered index fields starting from first_user_field() */
|
|
|
|
class field_map_element_t
|
|
|
|
{
|
|
|
|
/** Number of bits for representing a column number */
|
|
|
|
static constexpr uint16_t IND_BITS = 10;
|
|
|
|
|
|
|
|
/** Set if the column of the field has been instantly dropped */
|
|
|
|
static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5);
|
|
|
|
|
|
|
|
/** Set if the column was dropped and originally declared NOT NULL */
|
|
|
|
static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4);
|
|
|
|
|
|
|
|
/** Column index (if !(data & DROPPED)): table->cols[data & IND],
|
|
|
|
or field length (if (data & DROPPED)):
|
|
|
|
(data & IND) = 0 if variable-length with max_len < 256 bytes;
|
|
|
|
(data & IND) = 1 if variable-length with max_len > 255 bytes;
|
|
|
|
(data & IND) = 1 + L otherwise, with L=fixed length of the column */
|
|
|
|
static constexpr uint16_t IND = (1U << IND_BITS) - 1;
|
|
|
|
|
|
|
|
/** Field metadata */
|
|
|
|
uint16_t data;
|
|
|
|
|
|
|
|
void clear_not_null() { data &= ~NOT_NULL; }
|
|
|
|
public:
|
|
|
|
bool is_dropped() const { return data & DROPPED; }
|
|
|
|
void set_dropped() { data |= DROPPED; }
|
|
|
|
bool is_not_null() const { return data & NOT_NULL; }
|
|
|
|
void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; }
|
|
|
|
uint16_t ind() const { return data & IND; }
|
|
|
|
void set_ind(uint16_t i)
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(i <= IND);
|
|
|
|
DBUG_ASSERT(!ind());
|
|
|
|
data |= i;
|
|
|
|
}
|
|
|
|
field_map_element_t& operator= (uint16_t value)
|
|
|
|
{
|
|
|
|
data = value;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
operator uint16_t() { return data; }
|
|
|
|
};
|
|
|
|
|
|
|
|
static_assert(sizeof(field_map_element_t) == 2,
|
|
|
|
"Size mismatch for a persistent data item!");
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** Instantly dropped or reordered columns */
|
|
|
|
struct dict_instant_t
|
|
|
|
{
|
|
|
|
/** Number of dropped columns */
|
|
|
|
unsigned n_dropped;
|
|
|
|
/** Dropped columns */
|
|
|
|
dict_col_t* dropped;
|
2018-11-27 22:25:11 +03:00
|
|
|
/** Map of clustered index non-PK fields[i - first_user_field()]
|
|
|
|
to table columns */
|
|
|
|
field_map_element_t* field_map;
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
};
|
|
|
|
|
2016-04-21 10:52:52 +03:00
|
|
|
/** These are used when MySQL FRM and InnoDB data dictionary are
|
|
|
|
in inconsistent state. */
|
|
|
|
typedef enum {
|
|
|
|
DICT_FRM_CONSISTENT = 0, /*!< Consistent state */
|
|
|
|
DICT_FRM_NO_PK = 1, /*!< MySQL has no primary key
|
|
|
|
but InnoDB dictionary has
|
|
|
|
non-generated one. */
|
|
|
|
DICT_NO_PK_FRM_HAS = 2, /*!< MySQL has primary key but
|
|
|
|
InnoDB dictionary has not. */
|
|
|
|
DICT_FRM_INCONSISTENT_KEYS = 3 /*!< Key count mismatch */
|
|
|
|
} dict_frm_t;
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
/** Data structure for a database table. Most fields will be
|
|
|
|
initialized to 0, NULL or FALSE in dict_mem_table_create(). */
|
2016-08-12 11:17:45 +03:00
|
|
|
struct dict_table_t {
|
2014-09-11 10:13:35 +02:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Get reference count.
|
|
|
|
@return current value of n_ref_count */
|
2018-10-13 21:48:59 +04:00
|
|
|
inline uint32_t get_ref_count() const { return n_ref_count; }
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Acquire the table handle. */
|
|
|
|
inline void acquire();
|
2014-09-11 10:13:35 +02:00
|
|
|
|
2017-10-24 14:53:18 +02:00
|
|
|
/** Release the table handle.
|
|
|
|
@return whether the last handle was released */
|
|
|
|
inline bool release();
|
2016-08-12 11:17:45 +03:00
|
|
|
|
MDEV-10139 Support for InnoDB SEQUENCE objects
We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.
After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)
When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_log_flush_at_trx_commit=1.
Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in a middle of transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of the a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.
dict_table_t::no_rollback(): Check if the table does not support rollback.
BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.
DICT_TF_BITS: Add the NO_ROLLBACK flag.
row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).
row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.
ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.
ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)
innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
2017-03-27 18:58:16 +03:00
|
|
|
/** @return whether the table supports transactions */
|
|
|
|
bool no_rollback() const
|
|
|
|
{
|
2018-04-28 15:49:09 +03:00
|
|
|
return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK);
|
2017-05-26 19:32:28 +04:00
|
|
|
}
|
2017-10-03 11:37:38 +03:00
|
|
|
/** @return whether this is a temporary table */
|
|
|
|
bool is_temporary() const
|
|
|
|
{
|
|
|
|
return flags2 & DICT_TF2_TEMPORARY;
|
|
|
|
}
|
|
|
|
|
2018-11-12 17:01:56 +02:00
|
|
|
/** @return whether the table is not in ROW_FORMAT=REDUNDANT */
|
|
|
|
bool not_redundant() const { return flags & DICT_TF_COMPACT; }
|
|
|
|
|
2017-05-05 10:25:29 +03:00
|
|
|
/** @return whether this table is readable
|
|
|
|
@retval true normally
|
|
|
|
@retval false if this is a single-table tablespace
|
|
|
|
and the .ibd file is missing, or a
|
|
|
|
page cannot be read or decrypted */
|
|
|
|
bool is_readable() const
|
|
|
|
{
|
MDEV-12266: Change dict_table_t::space to fil_space_t*
InnoDB always keeps all tablespaces in the fil_system cache.
The fil_system.LRU is only for closing file handles; the
fil_space_t and fil_node_t for all data files will remain
in main memory. Between startup to shutdown, they can only be
created and removed by DDL statements. Therefore, we can
let dict_table_t::space point directly to the fil_space_t.
dict_table_t::space_id: A numeric tablespace ID for the corner cases
where we do not have a tablespace. The most prominent examples are
ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file.
There are a few functional differences; most notably:
(1) DROP TABLE will delete matching .ibd and .cfg files,
even if they were not attached to the data dictionary.
(2) Some error messages will report file names instead of numeric IDs.
There still are many functions that use numeric tablespace IDs instead
of fil_space_t*, and many functions could be converted to fil_space_t
member functions. Also, Tablespace and Datafile should be merged with
fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use
fil_space_t& instead of a numeric ID, and after moving to a single
buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to
fil_space_t::page_hash.
FilSpace: Remove. Only few calls to fil_space_acquire() will remain,
and gradually they should be removed.
mtr_t::set_named_space_id(ulint): Renamed from set_named_space(),
to prevent accidental calls to this slower function. Very few
callers remain.
fseg_create(), fsp_reserve_free_extents(): Take fil_space_t*
as a parameter instead of a space_id.
fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(),
fil_name_write_rename(), fil_rename_tablespace(). Mariabackup
passes the parameter log=false; InnoDB passes log=true.
dict_mem_table_create(): Take fil_space_t* instead of space_id
as parameter.
dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter
'status' with 'bool cached'.
dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name.
fil_ibd_open(): Return the tablespace.
fil_space_t::set_imported(): Replaces fil_space_set_imported().
truncate_t: Change many member function parameters to fil_space_t*,
and remove page_size parameters.
row_truncate_prepare(): Merge to its only caller.
row_drop_table_from_cache(): Assert that the table is persistent.
dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL
if the tablespace has been discarded.
row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
|
|
|
ut_ad(file_unreadable || space);
|
2017-05-05 10:25:29 +03:00
|
|
|
return(UNIV_LIKELY(!file_unreadable));
|
MDEV-10139 Support for InnoDB SEQUENCE objects
We introduce a NO_ROLLBACK flag for InnoDB tables. This flag only works
for tables that have a single index. Apart from undo logging, this flag
will also prevent locking and the assignment of DB_ROW_ID or DB_TRX_ID,
and imply READ UNCOMMITTED isolation. It is assumed that the SQL layer
is guaranteeing mutual exclusion.
After the initial insert of the single record during CREATE SEQUENCE,
InnoDB will be updating the single record in-place. This is crash-safe
thanks to the redo log. (That is, after a crash after CREATE SEQUENCE
was committed, the effect of sequence operations will be observable
fully or not at all.)
When it comes to the durability of the updates of SEQUENCE in
InnoDB, there is a clear analogy to MDEV-6076 Persistent AUTO_INCREMENT.
The updates would be made persistent by the InnoDB redo log flush
at transaction commit or rollback (or XA PREPARE), provided that
innodb_log_flush_at_trx_commit=1.
Similar to AUTO_INCREMENT, it is possible that the update of a SEQUENCE
in a middle of transaction becomes durable before the COMMIT/ROLLBACK of
the transaction, in case the InnoDB redo log is being flushed as a result
of the a commit or rollback of some other transaction, or as a result of
a redo log checkpoint that can be initiated at any time by operations that
are writing redo log.
dict_table_t::no_rollback(): Check if the table does not support rollback.
BTR_NO_ROLLBACK: Logging and locking flags for no_rollback() tables.
DICT_TF_BITS: Add the NO_ROLLBACK flag.
row_ins_step(): Assign 0 to DB_ROW_ID and DB_TRX_ID, and skip
any locking for no-rollback tables. There will be only a single row
in no-rollback tables (or there must be a proper PRIMARY KEY).
row_search_mvcc(): Execute the READ UNCOMMITTED code path for
no-rollback tables.
ha_innobase::external_lock(), ha_innobase::store_lock():
Block CREATE/DROP SEQUENCE in innodb_read_only mode.
This probably has no effect for CREATE SEQUENCE, because already
ha_innobase::create() should have been called (and refused)
before external_lock() or store_lock() is called.
ha_innobase::store_lock(): For CREATE SEQUENCE, do not acquire any
InnoDB locks, even though TL_WRITE is being requested. (This is just
a performance optimization.)
innobase_copy_frm_flags_from_create_info(), row_drop_table_for_mysql():
Disable persistent statistics for no_rollback tables.
2017-03-27 18:58:16 +03:00
|
|
|
}
|
|
|
|
|
2019-04-03 15:56:28 +03:00
|
|
|
/** Check if a table name contains the string "/#sql"
|
|
|
|
which denotes temporary or intermediate tables in MariaDB. */
|
|
|
|
static bool is_temporary_name(const char* name)
|
|
|
|
{
|
|
|
|
return strstr(name, "/" TEMP_FILE_PREFIX) != NULL;
|
|
|
|
}
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** @return whether instant ALTER TABLE is in effect */
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
bool is_instant() const
|
|
|
|
{
|
|
|
|
return(UT_LIST_GET_FIRST(indexes)->is_instant());
|
|
|
|
}
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** @return whether the table supports instant ALTER TABLE */
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
bool supports_instant() const
|
|
|
|
{
|
|
|
|
return(!(flags & DICT_TF_MASK_ZIP_SSIZE));
|
|
|
|
}
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** @return the number of instantly dropped columns */
|
|
|
|
unsigned n_dropped() const { return instant ? instant->n_dropped : 0; }
|
|
|
|
|
|
|
|
/** Look up an old column.
|
|
|
|
@param[in] cols the old columns of the table
|
|
|
|
@param[in] col_map map from old table columns to altered ones
|
|
|
|
@param[in] n_cols number of old columns
|
|
|
|
@param[in] i the number of the new column
|
|
|
|
@return old column
|
|
|
|
@retval NULL if column i was added to the table */
|
|
|
|
static const dict_col_t* find(const dict_col_t* cols,
|
|
|
|
const ulint* col_map, ulint n_cols,
|
|
|
|
ulint i)
|
|
|
|
{
|
|
|
|
for (ulint o = n_cols; o--; ) {
|
|
|
|
if (col_map[o] == i) {
|
|
|
|
return &cols[o];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** Serialise metadata of dropped or reordered columns.
|
|
|
|
@param[in,out] heap memory heap for allocation
|
|
|
|
@param[out] field data field with the metadata */
|
2018-11-09 18:00:07 +02:00
|
|
|
inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const;
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
|
|
|
|
/** Reconstruct dropped or reordered columns.
|
|
|
|
@param[in] metadata data from serialise_columns()
|
|
|
|
@param[in] len length of the metadata, in bytes
|
|
|
|
@return whether parsing the metadata failed */
|
|
|
|
bool deserialise_columns(const byte* metadata, ulint len);
|
|
|
|
|
|
|
|
/** Set is_instant() before instant_column().
|
|
|
|
@param[in] old previous table definition
|
|
|
|
@param[in] col_map map from old.cols[]
|
|
|
|
and old.v_cols[] to this
|
|
|
|
@param[out] first_alter_pos 0, or
|
|
|
|
1 + first changed column position */
|
|
|
|
inline void prepare_instant(const dict_table_t& old,
|
|
|
|
const ulint* col_map,
|
|
|
|
unsigned& first_alter_pos);
|
|
|
|
|
|
|
|
/** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
|
|
|
|
@param[in] table table on which prepare_instant() was invoked
|
2018-12-27 15:32:26 +02:00
|
|
|
@param[in] col_map mapping from cols[] and v_cols[] to table
|
|
|
|
@return whether the metadata record must be updated */
|
|
|
|
inline bool instant_column(const dict_table_t& table,
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
const ulint* col_map);
|
|
|
|
|
|
|
|
/** Roll back instant_column().
|
|
|
|
@param[in] old_n_cols original n_cols
|
|
|
|
@param[in] old_cols original cols
|
|
|
|
@param[in] old_col_names original col_names
|
|
|
|
@param[in] old_instant original instant structure
|
|
|
|
@param[in] old_fields original fields
|
|
|
|
@param[in] old_n_fields original number of fields
|
2018-12-28 15:23:42 +02:00
|
|
|
@param[in] old_n_core_fields original number of core fields
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
@param[in] old_n_v_cols original n_v_cols
|
|
|
|
@param[in] old_v_cols original v_cols
|
|
|
|
@param[in] old_v_col_names original v_col_names
|
|
|
|
@param[in] col_map column map */
|
|
|
|
inline void rollback_instant(
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
unsigned old_n_cols,
|
|
|
|
dict_col_t* old_cols,
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
const char* old_col_names,
|
|
|
|
dict_instant_t* old_instant,
|
|
|
|
dict_field_t* old_fields,
|
|
|
|
unsigned old_n_fields,
|
2018-12-28 15:23:42 +02:00
|
|
|
unsigned old_n_core_fields,
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
unsigned old_n_v_cols,
|
|
|
|
dict_v_col_t* old_v_cols,
|
|
|
|
const char* old_v_col_names,
|
|
|
|
const ulint* col_map);
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
|
|
|
|
/** Add the table definition to the data dictionary cache */
|
|
|
|
void add_to_cache();
|
|
|
|
|
2017-11-10 17:54:46 +03:00
|
|
|
bool versioned() const { return vers_start || vers_end; }
|
2018-01-26 23:23:11 +02:00
|
|
|
bool versioned_by_id() const
|
|
|
|
{
|
|
|
|
return vers_start && cols[vers_start].mtype == DATA_INT;
|
|
|
|
}
|
2017-11-01 20:17:48 +03:00
|
|
|
|
2018-01-25 09:13:21 +02:00
|
|
|
void inc_fk_checks()
|
|
|
|
{
|
|
|
|
#ifdef UNIV_DEBUG
|
2018-10-25 18:59:08 +04:00
|
|
|
int32_t fk_checks=
|
2018-01-25 09:13:21 +02:00
|
|
|
#endif
|
2018-10-25 18:59:08 +04:00
|
|
|
n_foreign_key_checks_running++;
|
2018-01-25 09:13:21 +02:00
|
|
|
ut_ad(fk_checks >= 0);
|
|
|
|
}
|
|
|
|
void dec_fk_checks()
|
|
|
|
{
|
|
|
|
#ifdef UNIV_DEBUG
|
2018-10-25 18:59:08 +04:00
|
|
|
int32_t fk_checks=
|
2018-01-25 09:13:21 +02:00
|
|
|
#endif
|
2018-10-25 18:59:08 +04:00
|
|
|
n_foreign_key_checks_running--;
|
2018-01-25 09:13:21 +02:00
|
|
|
ut_ad(fk_checks > 0);
|
|
|
|
}
|
|
|
|
|
2018-11-13 16:42:49 +02:00
|
|
|
private:
|
2018-11-27 22:25:11 +03:00
|
|
|
/** Initialize instant->field_map.
|
2018-11-13 16:42:49 +02:00
|
|
|
@tparam replace_dropped whether to point clustered index fields
|
|
|
|
to instant->dropped[]
|
|
|
|
@param[in] table table definition to copy from */
|
|
|
|
template<bool replace_dropped = false>
|
|
|
|
inline void init_instant(const dict_table_t& table);
|
|
|
|
public:
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Id of the table. */
|
|
|
|
table_id_t id;
|
2019-01-17 11:24:38 +02:00
|
|
|
/** Hash chain node. */
|
|
|
|
hash_node_t id_hash;
|
|
|
|
/** Table name. */
|
|
|
|
table_name_t name;
|
|
|
|
/** Hash chain node. */
|
|
|
|
hash_node_t name_hash;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2019-01-22 06:51:03 +02:00
|
|
|
/** Memory heap */
|
2016-08-12 11:17:45 +03:00
|
|
|
mem_heap_t* heap;
|
|
|
|
|
|
|
|
/** NULL or the directory path specified by DATA DIRECTORY. */
|
|
|
|
char* data_dir_path;
|
|
|
|
|
MDEV-12266: Change dict_table_t::space to fil_space_t*
InnoDB always keeps all tablespaces in the fil_system cache.
The fil_system.LRU is only for closing file handles; the
fil_space_t and fil_node_t for all data files will remain
in main memory. Between startup to shutdown, they can only be
created and removed by DDL statements. Therefore, we can
let dict_table_t::space point directly to the fil_space_t.
dict_table_t::space_id: A numeric tablespace ID for the corner cases
where we do not have a tablespace. The most prominent examples are
ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file.
There are a few functional differences; most notably:
(1) DROP TABLE will delete matching .ibd and .cfg files,
even if they were not attached to the data dictionary.
(2) Some error messages will report file names instead of numeric IDs.
There still are many functions that use numeric tablespace IDs instead
of fil_space_t*, and many functions could be converted to fil_space_t
member functions. Also, Tablespace and Datafile should be merged with
fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use
fil_space_t& instead of a numeric ID, and after moving to a single
buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to
fil_space_t::page_hash.
FilSpace: Remove. Only few calls to fil_space_acquire() will remain,
and gradually they should be removed.
mtr_t::set_named_space_id(ulint): Renamed from set_named_space(),
to prevent accidental calls to this slower function. Very few
callers remain.
fseg_create(), fsp_reserve_free_extents(): Take fil_space_t*
as a parameter instead of a space_id.
fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(),
fil_name_write_rename(), fil_rename_tablespace(). Mariabackup
passes the parameter log=false; InnoDB passes log=true.
dict_mem_table_create(): Take fil_space_t* instead of space_id
as parameter.
dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter
'status' with 'bool cached'.
dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name.
fil_ibd_open(): Return the tablespace.
fil_space_t::set_imported(): Replaces fil_space_set_imported().
truncate_t: Change many member function parameters to fil_space_t*,
and remove page_size parameters.
row_truncate_prepare(): Merge to its only caller.
row_drop_table_from_cache(): Assert that the table is persistent.
dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL
if the tablespace has been discarded.
row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
|
|
|
/** The tablespace of the table */
|
|
|
|
fil_space_t* space;
|
|
|
|
/** Tablespace ID */
|
|
|
|
ulint space_id;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Stores information about:
|
|
|
|
1 row format (redundant or compact),
|
|
|
|
2 compressed page size (zip shift size),
|
|
|
|
3 whether using atomic blobs,
|
|
|
|
4 whether the table has been created with the option DATA DIRECTORY.
|
|
|
|
Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
|
|
|
|
DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
|
|
|
|
flag. */
|
|
|
|
unsigned flags:DICT_TF_BITS;
|
|
|
|
|
|
|
|
/** Stores information about:
|
|
|
|
1 whether the table has been created using CREATE TEMPORARY TABLE,
|
|
|
|
2 whether the table has an internally defined DOC ID column,
|
|
|
|
3 whether the table has a FTS index,
|
|
|
|
4 whether DOC ID column need to be added to the FTS index,
|
|
|
|
5 whether the table is being created its own tablespace,
|
|
|
|
6 whether the table has been DISCARDed,
|
|
|
|
7 whether the aux FTS tables names are in hex.
|
|
|
|
Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
|
|
|
|
unsigned flags2:DICT_TF2_BITS;
|
|
|
|
|
MDEV-11415 Remove excessive undo logging during ALTER TABLE…ALGORITHM=COPY
If a crash occurs during ALTER TABLE…ALGORITHM=COPY, InnoDB would spend
a lot of time rolling back writes to the intermediate copy of the table.
To reduce the amount of busy work done, a work-around was introduced in
commit fd069e2bb36a3c1c1f26d65dd298b07e6d83ac8b in MySQL 4.1.8 and 5.0.2,
to commit the transaction after every 10,000 inserted rows.
A proper fix would have been to disable the undo logging altogether and
to simply drop the intermediate copy of the table on subsequent server
startup. This is what happens in MariaDB 10.3 with MDEV-14717,MDEV-14585.
In MariaDB 10.2, the intermediate copy of the table would be left behind
with a name starting with the string #sql.
This is a backport of a bug fix from MySQL 8.0.0 to MariaDB,
contributed by jixianliang <271365745@qq.com>.
Unlike recent MySQL, MariaDB supports ALTER IGNORE. For that operation
InnoDB must for now keep the undo logging enabled, so that the latest
row can be rolled back in case of an error.
In Galera cluster, the LOAD DATA statement will retain the existing
behaviour and commit the transaction after every 10,000 rows if
the parameter wsrep_load_data_splitting=ON is set. The logic to do
so (the wsrep_load_data_split() function and the call
handler::extra(HA_EXTRA_FAKE_START_STMT)) are joint work
by Ji Xianliang and Marko Mäkelä.
The original fix:
Author: Thirunarayanan Balathandayuthapani <thirunarayanan.balathandayuth@oracle.com>
Date: Wed Dec 2 16:09:15 2015 +0530
Bug#17479594 AVOID INTERMEDIATE COMMIT WHILE DOING ALTER TABLE ALGORITHM=COPY
Problem:
During ALTER TABLE, we commit and restart the transaction for every
10,000 rows, so that the rollback after recovery would not take so long.
Fix:
Suppress the undo logging during copy alter operation. If fts_index is
present then insert directly into fts auxiliary table rather
than doing at commit time.
ha_innobase::num_write_row: Remove the variable.
ha_innobase::write_row(): Remove the hack for committing every 10000 rows.
row_lock_table_for_mysql(): Remove the extra 2 parameters.
lock_get_src_table(), lock_is_table_exclusive(): Remove.
Reviewed-by: Marko Mäkelä <marko.makela@oracle.com>
Reviewed-by: Shaohua Wang <shaohua.wang@oracle.com>
Reviewed-by: Jon Olav Hauglid <jon.hauglid@oracle.com>
2018-01-26 21:45:25 +02:00
|
|
|
/** TRUE if the table is an intermediate table during copy alter
|
|
|
|
operation or a partition/subpartition which is required for copying
|
|
|
|
data and skip the undo log for insertion of row in the table.
|
|
|
|
This variable will be set and unset during extra(), or during the
|
|
|
|
process of altering partitions */
|
|
|
|
unsigned skip_alter_undo:1;
|
|
|
|
|
2017-05-05 10:25:29 +03:00
|
|
|
/*!< whether this is in a single-table tablespace and the .ibd
|
|
|
|
file is missing or page decryption failed and page is corrupted */
|
|
|
|
unsigned file_unreadable:1;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** TRUE if the table object has been added to the dictionary cache. */
|
|
|
|
unsigned cached:1;
|
|
|
|
|
|
|
|
/** TRUE if the table is to be dropped, but not yet actually dropped
|
|
|
|
(could in the background drop list). It is turned on at the beginning
|
|
|
|
of row_drop_table_for_mysql() and turned off just before we start to
|
2019-05-17 15:17:37 +03:00
|
|
|
update system tables for the drop. It is protected by dict_sys.latch. */
|
2016-08-12 11:17:45 +03:00
|
|
|
unsigned to_be_dropped:1;
|
|
|
|
|
|
|
|
/** Number of non-virtual columns defined so far. */
|
|
|
|
unsigned n_def:10;
|
|
|
|
|
|
|
|
/** Number of non-virtual columns. */
|
|
|
|
unsigned n_cols:10;
|
|
|
|
|
|
|
|
/** Number of total columns (inlcude virtual and non-virtual) */
|
|
|
|
unsigned n_t_cols:10;
|
|
|
|
|
|
|
|
/** Number of total columns defined so far. */
|
|
|
|
unsigned n_t_def:10;
|
|
|
|
|
|
|
|
/** Number of virtual columns defined so far. */
|
|
|
|
unsigned n_v_def:10;
|
|
|
|
|
|
|
|
/** Number of virtual columns. */
|
|
|
|
unsigned n_v_cols:10;
|
|
|
|
|
MDEV-6076 Persistent AUTO_INCREMENT for InnoDB
This should be functionally equivalent to WL#6204 in MySQL 8.0.0, with
the notable difference that the file format changes are limited to
repurposing a previously unused data field in B-tree pages.
For persistent InnoDB tables, write the last used AUTO_INCREMENT
value to the root page of the clustered index, in the previously
unused (0) PAGE_MAX_TRX_ID field, now aliased as PAGE_ROOT_AUTO_INC.
Unlike some other previously unused InnoDB data fields, this one was
actually always zero-initialized, at least since MySQL 3.23.49.
The writes to PAGE_ROOT_AUTO_INC are protected by SX or X latch on the
root page. The SX latch will allow concurrent read access to the root
page. (The field PAGE_ROOT_AUTO_INC will only be read on the
first-time call to ha_innobase::open() from the SQL layer. The
PAGE_ROOT_AUTO_INC can only be updated when executing SQL, so
read/write races are not possible.)
During INSERT, the PAGE_ROOT_AUTO_INC is updated by the low-level
function btr_cur_search_to_nth_level(), adding no extra page
access. [Adaptive hash index lookup will be disabled during INSERT.]
If some rare UPDATE modifies an AUTO_INCREMENT column, the
PAGE_ROOT_AUTO_INC will be adjusted in a separate mini-transaction in
ha_innobase::update_row().
When a page is reorganized, we have to preserve the PAGE_ROOT_AUTO_INC
field.
During ALTER TABLE, the initial AUTO_INCREMENT value will be copied
from the table. ALGORITHM=COPY and online log apply in LOCK=NONE will
update PAGE_ROOT_AUTO_INC in real time.
innodb_col_no(): Determine the dict_table_t::cols[] element index
corresponding to a Field of a non-virtual column.
(The MySQL 5.7 implementation of virtual columns breaks the 1:1
relationship between Field::field_index and dict_table_t::cols[].
Virtual columns are omitted from dict_table_t::cols[]. Therefore,
we must translate the field_index of AUTO_INCREMENT columns into
an index of dict_table_t::cols[].)
Upgrade from old data files:
By default, the AUTO_INCREMENT sequence in old data files would appear
to be reset, because PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC would contain
the value 0 in each clustered index page. In new data files,
PAGE_ROOT_AUTO_INC can only be 0 if the table is empty or does not contain
any AUTO_INCREMENT column.
For backward compatibility, we use the old method of
SELECT MAX(auto_increment_column) for initializing the sequence.
btr_read_autoinc(): Read the AUTO_INCREMENT sequence from a new-format
data file.
btr_read_autoinc_with_fallback(): A variant of btr_read_autoinc()
that will resort to reading MAX(auto_increment_column) for data files
that did not use AUTO_INCREMENT yet. It was manually tested that during
the execution of innodb.autoinc_persist the compatibility logic is
not activated (for new files, PAGE_ROOT_AUTO_INC is never 0 in nonempty
clustered index root pages).
initialize_auto_increment(): Replaces
ha_innobase::innobase_initialize_autoinc(). This initializes
the AUTO_INCREMENT metadata. Only called from ha_innobase::open().
ha_innobase::info_low(): Do not try to lazily initialize
dict_table_t::autoinc. It must already have been initialized by
ha_innobase::open() or ha_innobase::create().
Note: The adjustments to class ha_innopart were not tested, because
the source code (native InnoDB partitioning) is not being compiled.
2016-12-14 19:56:39 +02:00
|
|
|
/** 1 + the position of autoinc counter field in clustered
|
|
|
|
index, or 0 if there is no persistent AUTO_INCREMENT column in
|
|
|
|
the table. */
|
|
|
|
unsigned persistent_autoinc:10;
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** TRUE if it's not an InnoDB system table or a table that has no FK
|
|
|
|
relationships. */
|
|
|
|
unsigned can_be_evicted:1;
|
|
|
|
|
|
|
|
/** TRUE if table is corrupted. */
|
|
|
|
unsigned corrupted:1;
|
|
|
|
|
|
|
|
/** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
|
|
|
|
or ONLINE_INDEX_ABORTED_DROPPED. */
|
|
|
|
unsigned drop_aborted:1;
|
|
|
|
|
|
|
|
/** Array of column descriptions. */
|
|
|
|
dict_col_t* cols;
|
|
|
|
|
|
|
|
/** Array of virtual column descriptions. */
|
|
|
|
dict_v_col_t* v_cols;
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/** List of stored column descriptions. It is used only for foreign key
|
|
|
|
check during create table and copy alter operations.
|
|
|
|
During copy alter, s_cols list is filled during create table operation
|
|
|
|
and need to preserve till rename table operation. That is the
|
|
|
|
reason s_cols is a part of dict_table_t */
|
|
|
|
dict_s_col_list* s_cols;
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/** Instantly dropped or reordered columns, or NULL if none */
|
|
|
|
dict_instant_t* instant;
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Column names packed in a character string
|
|
|
|
"name1\0name2\0...nameN\0". Until the string contains n_cols, it will
|
|
|
|
be allocated from a temporary heap. The final string will be allocated
|
|
|
|
from table->heap. */
|
|
|
|
const char* col_names;
|
|
|
|
|
|
|
|
/** Virtual column names */
|
|
|
|
const char* v_col_names;
|
2017-11-01 22:13:05 +03:00
|
|
|
unsigned vers_start:10;
|
2016-09-29 11:12:46 +00:00
|
|
|
/*!< System Versioning: row start col index */
|
2017-11-01 22:13:05 +03:00
|
|
|
unsigned vers_end:10;
|
2016-09-29 11:12:46 +00:00
|
|
|
/*!< System Versioning: row end col index */
|
2014-10-26 07:22:51 +02:00
|
|
|
bool is_system_db;
|
|
|
|
/*!< True if the table belongs to a system
|
|
|
|
database (mysql, information_schema or
|
|
|
|
performance_schema) */
|
2016-04-21 10:52:52 +03:00
|
|
|
dict_frm_t dict_frm_mismatch;
|
|
|
|
/*!< !DICT_FRM_CONSISTENT==0 if data
|
|
|
|
dictionary information and
|
|
|
|
MySQL FRM information mismatch. */
|
2016-08-12 11:17:45 +03:00
|
|
|
/** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
|
|
|
|
dict_index_t* fts_doc_id_index;
|
|
|
|
|
|
|
|
/** List of indexes of the table. */
|
|
|
|
UT_LIST_BASE_NODE_T(dict_index_t) indexes;
|
|
|
|
|
|
|
|
/** List of foreign key constraints in the table. These refer to
|
|
|
|
columns in other tables. */
|
|
|
|
UT_LIST_BASE_NODE_T(dict_foreign_t) foreign_list;
|
|
|
|
|
|
|
|
/** List of foreign key constraints which refer to this table. */
|
|
|
|
UT_LIST_BASE_NODE_T(dict_foreign_t) referenced_list;
|
|
|
|
|
|
|
|
/** Node of the LRU list of tables. */
|
|
|
|
UT_LIST_NODE_T(dict_table_t) table_LRU;
|
|
|
|
|
|
|
|
/** Maximum recursive level we support when loading tables chained
|
|
|
|
together with FK constraints. If exceeds this level, we will stop
|
|
|
|
loading child table into memory along with its parent table. */
|
|
|
|
unsigned fk_max_recusive_level:8;
|
|
|
|
|
|
|
|
/** Count of how many foreign key check operations are currently being
|
|
|
|
performed on the table. We cannot drop the table while there are
|
|
|
|
foreign key checks running on it. */
|
2018-10-25 18:59:08 +04:00
|
|
|
Atomic_counter<int32_t> n_foreign_key_checks_running;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Transactions whose view low limit is greater than this number are
|
|
|
|
not allowed to store to the MySQL query cache or retrieve from it.
|
|
|
|
When a trx with undo logs commits, it sets this to the value of the
|
2018-06-18 14:26:37 +05:30
|
|
|
transaction id. */
|
|
|
|
trx_id_t query_cache_inv_trx_id;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
/** Transaction id that last touched the table definition. Either when
|
|
|
|
loading the definition or CREATE TABLE, or ALTER TABLE (prepare,
|
|
|
|
commit, and rollback phases). */
|
|
|
|
trx_id_t def_trx_id;
|
|
|
|
|
|
|
|
/*!< set of foreign key constraints in the table; these refer to
|
|
|
|
columns in other tables */
|
|
|
|
dict_foreign_set foreign_set;
|
|
|
|
|
|
|
|
/*!< set of foreign key constraints which refer to this table */
|
|
|
|
dict_foreign_set referenced_set;
|
|
|
|
|
|
|
|
/** Statistics for query optimization. @{ */
|
|
|
|
|
|
|
|
/** Creation state of 'stats_latch'. */
|
|
|
|
volatile os_once::state_t stats_latch_created;
|
|
|
|
|
|
|
|
/** This latch protects:
|
|
|
|
dict_table_t::stat_initialized,
|
|
|
|
dict_table_t::stat_n_rows (*),
|
|
|
|
dict_table_t::stat_clustered_index_size,
|
|
|
|
dict_table_t::stat_sum_of_other_index_sizes,
|
|
|
|
dict_table_t::stat_modified_counter (*),
|
|
|
|
dict_table_t::indexes*::stat_n_diff_key_vals[],
|
|
|
|
dict_table_t::indexes*::stat_index_size,
|
|
|
|
dict_table_t::indexes*::stat_n_leaf_pages.
|
|
|
|
(*) Those are not always protected for
|
|
|
|
performance reasons. */
|
|
|
|
rw_lock_t* stats_latch;
|
|
|
|
|
|
|
|
/** TRUE if statistics have been calculated the first time after
|
|
|
|
database startup or table creation. */
|
|
|
|
unsigned stat_initialized:1;
|
|
|
|
|
|
|
|
/** Timestamp of last recalc of the stats. */
|
|
|
|
ib_time_t stats_last_recalc;
|
|
|
|
|
|
|
|
/** The two bits below are set in the 'stat_persistent' member. They
|
|
|
|
have the following meaning:
|
|
|
|
1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
|
|
|
|
the value of the global srv_stats_persistent is used to determine
|
|
|
|
whether the table has persistent stats enabled or not
|
|
|
|
2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
|
|
|
|
table, regardless of the value of the global srv_stats_persistent
|
|
|
|
3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
|
|
|
|
table, regardless of the value of the global srv_stats_persistent
|
|
|
|
4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
|
|
|
|
#define DICT_STATS_PERSISTENT_ON (1 << 1)
|
|
|
|
#define DICT_STATS_PERSISTENT_OFF (1 << 2)
|
|
|
|
|
|
|
|
/** Indicates whether the table uses persistent stats or not. See
|
|
|
|
DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
|
|
|
|
ib_uint32_t stat_persistent;
|
|
|
|
|
|
|
|
/** The two bits below are set in the 'stats_auto_recalc' member. They
|
|
|
|
have the following meaning:
|
|
|
|
1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
|
|
|
|
value of the global srv_stats_persistent_auto_recalc is used to
|
|
|
|
determine whether the table has auto recalc enabled or not
|
|
|
|
2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
|
|
|
|
regardless of the value of the global srv_stats_persistent_auto_recalc
|
|
|
|
3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
|
|
|
|
regardless of the value of the global srv_stats_persistent_auto_recalc
|
|
|
|
4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
|
|
|
|
#define DICT_STATS_AUTO_RECALC_ON (1 << 1)
|
|
|
|
#define DICT_STATS_AUTO_RECALC_OFF (1 << 2)
|
|
|
|
|
|
|
|
/** Indicates whether the table uses automatic recalc for persistent
|
|
|
|
stats or not. See DICT_STATS_AUTO_RECALC_ON and
|
|
|
|
DICT_STATS_AUTO_RECALC_OFF. */
|
|
|
|
ib_uint32_t stats_auto_recalc;
|
|
|
|
|
|
|
|
/** The number of pages to sample for this table during persistent
|
|
|
|
stats estimation. If this is 0, then the value of the global
|
|
|
|
srv_stats_persistent_sample_pages will be used instead. */
|
|
|
|
ulint stats_sample_pages;
|
|
|
|
|
|
|
|
/** Approximate number of rows in the table. We periodically calculate
|
|
|
|
new estimates. */
|
|
|
|
ib_uint64_t stat_n_rows;
|
|
|
|
|
|
|
|
/** Approximate clustered index size in database pages. */
|
|
|
|
ulint stat_clustered_index_size;
|
|
|
|
|
|
|
|
/** Approximate size of other indexes in database pages. */
|
|
|
|
ulint stat_sum_of_other_index_sizes;
|
|
|
|
|
|
|
|
/** How many rows are modified since last stats recalc. When a row is
|
|
|
|
inserted, updated, or deleted, we add 1 to this number; we calculate
|
|
|
|
new estimates for the table and the indexes if the table has changed
|
2017-12-04 13:43:02 +02:00
|
|
|
too much, see dict_stats_update_if_needed(). The counter is reset
|
2016-08-12 11:17:45 +03:00
|
|
|
to zero at statistics calculation. This counter is not protected by
|
|
|
|
any latch, because this is only used for heuristics. */
|
|
|
|
ib_uint64_t stat_modified_counter;
|
|
|
|
|
|
|
|
/** Background stats thread is not working on this table. */
|
|
|
|
#define BG_STAT_NONE 0
|
|
|
|
|
|
|
|
/** Set in 'stats_bg_flag' when the background stats code is working
|
|
|
|
on this table. The DROP TABLE code waits for this to be cleared before
|
|
|
|
proceeding. */
|
|
|
|
#define BG_STAT_IN_PROGRESS (1 << 0)
|
|
|
|
|
|
|
|
/** Set in 'stats_bg_flag' when DROP TABLE starts waiting on
|
|
|
|
BG_STAT_IN_PROGRESS to be cleared. The background stats thread will
|
|
|
|
detect this and will eventually quit sooner. */
|
|
|
|
#define BG_STAT_SHOULD_QUIT (1 << 1)
|
|
|
|
|
|
|
|
/** The state of the background stats thread wrt this table.
|
|
|
|
See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
|
2019-05-17 14:32:53 +03:00
|
|
|
Writes are covered by dict_sys.mutex. Dirty reads are possible. */
|
2016-09-06 09:43:16 +03:00
|
|
|
|
|
|
|
#define BG_SCRUB_IN_PROGRESS ((byte)(1 << 2))
|
2014-12-22 16:53:17 +02:00
|
|
|
/*!< BG_SCRUB_IN_PROGRESS is set in
|
|
|
|
stats_bg_flag when the background
|
|
|
|
scrub code is working on this table. The DROP
|
|
|
|
TABLE code waits for this to be cleared
|
|
|
|
before proceeding. */
|
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
#define BG_STAT_SHOULD_QUIT (1 << 1)
|
|
|
|
|
|
|
|
#define BG_IN_PROGRESS (BG_STAT_IN_PROGRESS | BG_SCRUB_IN_PROGRESS)
|
2014-12-22 16:53:17 +02:00
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
|
|
|
|
/** The state of the background stats thread wrt this table.
|
|
|
|
See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
|
2019-05-17 14:32:53 +03:00
|
|
|
Writes are covered by dict_sys.mutex. Dirty reads are possible. */
|
2016-08-12 11:17:45 +03:00
|
|
|
byte stats_bg_flag;
|
|
|
|
|
2014-07-22 19:31:45 +03:00
|
|
|
bool stats_error_printed;
|
|
|
|
/*!< Has persistent stats error beein
|
|
|
|
already printed for this table ? */
|
2016-08-12 11:17:45 +03:00
|
|
|
/* @} */
|
|
|
|
|
|
|
|
/** AUTOINC related members. @{ */
|
|
|
|
|
|
|
|
/* The actual collection of tables locked during AUTOINC read/write is
|
|
|
|
kept in trx_t. In order to quickly determine whether a transaction has
|
|
|
|
locked the AUTOINC lock we keep a pointer to the transaction here in
|
|
|
|
the 'autoinc_trx' member. This is to avoid acquiring the
|
|
|
|
lock_sys_t::mutex and scanning the vector in trx_t.
|
|
|
|
When an AUTOINC lock has to wait, the corresponding lock instance is
|
|
|
|
created on the trx lock heap rather than use the pre-allocated instance
|
|
|
|
in autoinc_lock below. */
|
|
|
|
|
|
|
|
/** A buffer for an AUTOINC lock for this table. We allocate the
|
|
|
|
memory here so that individual transactions can get it and release it
|
|
|
|
without a need to allocate space from the lock heap of the trx:
|
|
|
|
otherwise the lock heap would grow rapidly if we do a large insert
|
|
|
|
from a select. */
|
|
|
|
lock_t* autoinc_lock;
|
2015-05-04 22:13:46 +02:00
|
|
|
|
|
|
|
/** Creation state of autoinc_mutex member */
|
2016-08-12 11:17:45 +03:00
|
|
|
volatile os_once::state_t autoinc_mutex_created;
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Mutex protecting the autoincrement counter. */
|
|
|
|
ib_mutex_t* autoinc_mutex;
|
2014-02-26 19:11:54 +01:00
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Autoinc counter value to give to the next inserted row. */
|
|
|
|
ib_uint64_t autoinc;
|
|
|
|
|
|
|
|
/** This counter is used to track the number of granted and pending
|
|
|
|
autoinc locks on this table. This value is set after acquiring the
|
|
|
|
lock_sys_t::mutex but we peek the contents to determine whether other
|
|
|
|
transactions have acquired the AUTOINC lock or not. Of course only one
|
|
|
|
transaction can be granted the lock but there can be multiple
|
|
|
|
waiters. */
|
|
|
|
ulong n_waiting_or_granted_auto_inc_locks;
|
|
|
|
|
|
|
|
/** The transaction that currently holds the the AUTOINC lock on this
|
2018-02-22 20:46:42 +04:00
|
|
|
table. Protected by lock_sys.mutex. */
|
2016-08-12 11:17:45 +03:00
|
|
|
const trx_t* autoinc_trx;
|
|
|
|
|
|
|
|
/* @} */
|
|
|
|
|
|
|
|
/** FTS specific state variables. */
|
|
|
|
fts_t* fts;
|
|
|
|
|
|
|
|
/** Quiescing states, protected by the dict_index_t::lock. ie. we can
|
|
|
|
only change the state if we acquire all the latches (dict_index_t::lock)
|
|
|
|
in X mode of this table's indexes. */
|
|
|
|
ib_quiesce_t quiesce;
|
|
|
|
|
|
|
|
/** Count of the number of record locks on this table. We use this to
|
|
|
|
determine whether we can evict the table from the dictionary cache.
|
2018-02-22 20:46:42 +04:00
|
|
|
It is protected by lock_sys.mutex. */
|
2016-08-12 11:17:45 +03:00
|
|
|
ulint n_rec_locks;
|
|
|
|
|
|
|
|
private:
|
|
|
|
/** Count of how many handles are opened to this table. Dropping of the
|
|
|
|
table is NOT allowed until this count gets to zero. MySQL does NOT
|
|
|
|
itself check the number of open handles at DROP. */
|
2018-10-13 21:48:59 +04:00
|
|
|
Atomic_counter<uint32_t> n_ref_count;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
public:
|
2018-02-22 20:46:42 +04:00
|
|
|
/** List of locks on the table. Protected by lock_sys.mutex. */
|
2016-08-12 11:17:45 +03:00
|
|
|
table_lock_list_t locks;
|
|
|
|
|
|
|
|
/** Timestamp of the last modification of this table. */
|
|
|
|
time_t update_time;
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
#ifdef UNIV_DEBUG
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Value of 'magic_n'. */
|
|
|
|
#define DICT_TABLE_MAGIC_N 76333786
|
|
|
|
|
|
|
|
/** Magic number. */
|
|
|
|
ulint magic_n;
|
2014-02-26 19:11:54 +01:00
|
|
|
#endif /* UNIV_DEBUG */
|
2016-08-12 11:17:45 +03:00
|
|
|
/** mysql_row_templ_t for base columns used for compute the virtual
|
|
|
|
columns */
|
2016-09-06 09:43:16 +03:00
|
|
|
dict_vcol_templ_t* vc_templ;
|
2014-02-26 19:11:54 +01:00
|
|
|
};
|
|
|
|
|
2018-03-22 15:30:54 +02:00
|
|
|
inline void dict_index_t::set_modified(mtr_t& mtr) const
|
|
|
|
{
|
2018-03-22 19:40:38 +02:00
|
|
|
mtr.set_named_space(table->space);
|
2018-03-22 15:30:54 +02:00
|
|
|
}
|
|
|
|
|
2019-04-03 15:56:28 +03:00
|
|
|
inline bool table_name_t::is_temporary() const
|
|
|
|
{
|
|
|
|
return dict_table_t::is_temporary_name(m_name);
|
|
|
|
}
|
|
|
|
|
2018-05-12 10:11:38 +03:00
|
|
|
inline bool dict_index_t::is_readable() const { return table->is_readable(); }
|
MDEV-12253: Buffer pool blocks are accessed after they have been freed
Problem was that bpage was referenced after it was already freed
from LRU. Fixed by adding a new variable encrypted that is
passed down to buf_page_check_corrupt() and used in
buf_page_get_gen() to stop processing page read.
This patch should also address following test failures and
bugs:
MDEV-12419: IMPORT should not look up tablespace in
PageConverter::validate(). This is now removed.
MDEV-10099: encryption.innodb_onlinealter_encryption fails
sporadically in buildbot
MDEV-11420: encryption.innodb_encryption-page-compression
failed in buildbot
MDEV-11222: encryption.encrypt_and_grep failed in buildbot on P8
Removed dict_table_t::is_encrypted and dict_table_t::ibd_file_missing
and replaced these with dict_table_t::file_unreadable. Table
ibd file is missing if fil_get_space(space_id) returns NULL
and encrypted if not. Removed dict_table_t::is_corrupted field.
Ported FilSpace class from 10.2 and using that on buf_page_check_corrupt(),
buf_page_decrypt_after_read(), buf_page_encrypt_before_write(),
buf_dblwr_process(), buf_read_page(), dict_stats_save_defrag_stats().
Added test cases when enrypted page could be read while doing
redo log crash recovery. Also added test case for row compressed
blobs.
btr_cur_open_at_index_side_func(),
btr_cur_open_at_rnd_pos_func(): Avoid referencing block that is
NULL.
buf_page_get_zip(): Issue error if page read fails.
buf_page_get_gen(): Use dberr_t for error detection and
do not reference bpage after we hare freed it.
buf_mark_space_corrupt(): remove bpage from LRU also when
it is encrypted.
buf_page_check_corrupt(): @return DB_SUCCESS if page has
been read and is not corrupted,
DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match. In read
case only DB_SUCCESS is possible.
buf_page_io_complete(): use dberr_t for error handling.
buf_flush_write_block_low(),
buf_read_ahead_random(),
buf_read_page_async(),
buf_read_ahead_linear(),
buf_read_ibuf_merge_pages(),
buf_read_recv_pages(),
fil_aio_wait():
Issue error if page read fails.
btr_pcur_move_to_next_page(): Do not reference page if it is
NULL.
Introduced dict_table_t::is_readable() and dict_index_t::is_readable()
that will return true if tablespace exists and pages read from
tablespace are not corrupted or page decryption failed.
Removed buf_page_t::key_version. After page decryption the
key version is not removed from page frame. For unencrypted
pages, old key_version is removed at buf_page_encrypt_before_write()
dict_stats_update_transient_for_index(),
dict_stats_update_transient()
Do not continue if table decryption failed or table
is corrupted.
dict0stats.cc: Introduced a dict_stats_report_error function
to avoid code duplication.
fil_parse_write_crypt_data():
Check that key read from redo log entry is found from
encryption plugin and if it is not, refuse to start.
PageConverter::validate(): Removed access to fil_space_t as
tablespace is not available during import.
Fixed error code on innodb.innodb test.
Merged test cased innodb-bad-key-change5 and innodb-bad-key-shutdown
to innodb-bad-key-change2. Removed innodb-bad-key-change5 test.
Decreased unnecessary complexity on some long lasting tests.
Removed fil_inc_pending_ops(), fil_decr_pending_ops(),
fil_get_first_space(), fil_get_next_space(),
fil_get_first_space_safe(), fil_get_next_space_safe()
functions.
fil_space_verify_crypt_checksum(): Fixed bug found using ASAN
where FIL_PAGE_END_LSN_OLD_CHECKSUM field was incorrectly
accessed from row compressed tables. Fixed out of page frame
bug for row compressed tables in
fil_space_verify_crypt_checksum() found using ASAN. Incorrect
function was called for compressed table.
Added new tests for discard, rename table and drop (we should allow them
even when page decryption fails). Alter table rename is not allowed.
Added test for restart with innodb-force-recovery=1 when page read on
redo-recovery cant be decrypted. Added test for corrupted table where
both page data and FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is corrupted.
Adjusted the test case innodb_bug14147491 so that it does not anymore
expect crash. Instead table is just mostly not usable.
fil0fil.h: fil_space_acquire_low is not visible function
and fil_space_acquire and fil_space_acquire_silent are
inline functions. FilSpace class uses fil_space_acquire_low
directly.
recv_apply_hashed_log_recs() does not return anything.
2017-04-26 15:19:16 +03:00
|
|
|
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
inline bool dict_index_t::is_instant() const
|
|
|
|
{
|
|
|
|
ut_ad(n_core_fields > 0);
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
ut_ad(n_core_fields <= n_fields || table->n_dropped());
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
ut_ad(n_core_fields == n_fields
|
|
|
|
|| (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED);
|
|
|
|
ut_ad(n_core_fields == n_fields || table->supports_instant());
|
|
|
|
ut_ad(n_core_fields == n_fields || !table->is_temporary());
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
ut_ad(!table->instant || !table->is_temporary());
|
|
|
|
|
|
|
|
return n_core_fields != n_fields
|
|
|
|
|| (is_primary() && table->instant);
|
MDEV-11369 Instant ADD COLUMN for InnoDB
For InnoDB tables, adding, dropping and reordering columns has
required a rebuild of the table and all its indexes. Since MySQL 5.6
(and MariaDB 10.0) this has been supported online (LOCK=NONE), allowing
concurrent modification of the tables.
This work revises the InnoDB ROW_FORMAT=REDUNDANT, ROW_FORMAT=COMPACT
and ROW_FORMAT=DYNAMIC so that columns can be appended instantaneously,
with only minor changes performed to the table structure. The counter
innodb_instant_alter_column in INFORMATION_SCHEMA.GLOBAL_STATUS
is incremented whenever a table rebuild operation is converted into
an instant ADD COLUMN operation.
ROW_FORMAT=COMPRESSED tables will not support instant ADD COLUMN.
Some usability limitations will be addressed in subsequent work:
MDEV-13134 Introduce ALTER TABLE attributes ALGORITHM=NOCOPY
and ALGORITHM=INSTANT
MDEV-14016 Allow instant ADD COLUMN, ADD INDEX, LOCK=NONE
The format of the clustered index (PRIMARY KEY) is changed as follows:
(1) The FIL_PAGE_TYPE of the root page will be FIL_PAGE_TYPE_INSTANT,
and a new field PAGE_INSTANT will contain the original number of fields
in the clustered index ('core' fields).
If instant ADD COLUMN has not been used or the table becomes empty,
or the very first instant ADD COLUMN operation is rolled back,
the fields PAGE_INSTANT and FIL_PAGE_TYPE will be reset
to 0 and FIL_PAGE_INDEX.
(2) A special 'default row' record is inserted into the leftmost leaf,
between the page infimum and the first user record. This record is
distinguished by the REC_INFO_MIN_REC_FLAG, and it is otherwise in the
same format as records that contain values for the instantly added
columns. This 'default row' always has the same number of fields as
the clustered index according to the table definition. The values of
'core' fields are to be ignored. For other fields, the 'default row'
will contain the default values as they were during the ALTER TABLE
statement. (If the column default values are changed later, those
values will only be stored in the .frm file. The 'default row' will
contain the original evaluated values, which must be the same for
every row.) The 'default row' must be completely hidden from
higher-level access routines. Assertions have been added to ensure
that no 'default row' is ever present in the adaptive hash index
or in locked records. The 'default row' is never delete-marked.
(3) In clustered index leaf page records, the number of fields must
reside between the number of 'core' fields (dict_index_t::n_core_fields
introduced in this work) and dict_index_t::n_fields. If the number
of fields is less than dict_index_t::n_fields, the missing fields
are replaced with the column value of the 'default row'.
Note: The number of fields in the record may shrink if some of the
last instantly added columns are updated to the value that is
in the 'default row'. The function btr_cur_trim() implements this
'compression' on update and rollback; dtuple::trim() implements it
on insert.
(4) In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC records, the new
status value REC_STATUS_COLUMNS_ADDED will indicate the presence of
a new record header that will encode n_fields-n_core_fields-1 in
1 or 2 bytes. (In ROW_FORMAT=REDUNDANT records, the record header
always explicitly encodes the number of fields.)
We introduce the undo log record type TRX_UNDO_INSERT_DEFAULT for
covering the insert of the 'default row' record when instant ADD COLUMN
is used for the first time. Subsequent instant ADD COLUMN can use
TRX_UNDO_UPD_EXIST_REC.
This is joint work with Vin Chen (陈福荣) from Tencent. The design
that was discussed in April 2017 would not have allowed import or
export of data files, because instead of the 'default row' it would
have introduced a data dictionary table. The test
rpl.rpl_alter_instant is exactly as contributed in pull request #408.
The test innodb.instant_alter is based on a contributed test.
The redo log record format changes for ROW_FORMAT=DYNAMIC and
ROW_FORMAT=COMPACT are as contributed. (With this change present,
crash recovery from MariaDB 10.3.1 will fail in spectacular ways!)
Also the semantics of higher-level redo log records that modify the
PAGE_INSTANT field is changed. The redo log format version identifier
was already changed to LOG_HEADER_FORMAT_CURRENT=103 in MariaDB 10.3.1.
Everything else has been rewritten by me. Thanks to Elena Stepanova,
the code has been tested extensively.
When rolling back an instant ADD COLUMN operation, we must empty the
PAGE_FREE list after deleting or shortening the 'default row' record,
by calling either btr_page_empty() or btr_page_reorganize(). We must
know the size of each entry in the PAGE_FREE list. If rollback left a
freed copy of the 'default row' in the PAGE_FREE list, we would be
unable to determine its size (if it is in ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC) because it would contain more fields than the
rolled-back definition of the clustered index.
UNIV_SQL_DEFAULT: A new special constant that designates an instantly
added column that is not present in the clustered index record.
len_is_stored(): Check if a length is an actual length. There are
two magic length values: UNIV_SQL_DEFAULT, UNIV_SQL_NULL.
dict_col_t::def_val: The 'default row' value of the column. If the
column is not added instantly, def_val.len will be UNIV_SQL_DEFAULT.
dict_col_t: Add the accessors is_virtual(), is_nullable(), is_instant(),
instant_value().
dict_col_t::remove_instant(): Remove the 'instant ADD' status of
a column.
dict_col_t::name(const dict_table_t& table): Replaces
dict_table_get_col_name().
dict_index_t::n_core_fields: The original number of fields.
For secondary indexes and if instant ADD COLUMN has not been used,
this will be equal to dict_index_t::n_fields.
dict_index_t::n_core_null_bytes: Number of bytes needed to
represent the null flags; usually equal to UT_BITS_IN_BYTES(n_nullable).
dict_index_t::NO_CORE_NULL_BYTES: Magic value signalling that
n_core_null_bytes was not initialized yet from the clustered index
root page.
dict_index_t: Add the accessors is_instant(), is_clust(),
get_n_nullable(), instant_field_value().
dict_index_t::instant_add_field(): Adjust clustered index metadata
for instant ADD COLUMN.
dict_index_t::remove_instant(): Remove the 'instant ADD' status
of a clustered index when the table becomes empty, or the very first
instant ADD COLUMN operation is rolled back.
dict_table_t: Add the accessors is_instant(), is_temporary(),
supports_instant().
dict_table_t::instant_add_column(): Adjust metadata for
instant ADD COLUMN.
dict_table_t::rollback_instant(): Adjust metadata on the rollback
of instant ADD COLUMN.
prepare_inplace_alter_table_dict(): First create the ctx->new_table,
and only then decide if the table really needs to be rebuilt.
We must split the creation of table or index metadata from the
creation of the dictionary table records and the creation of
the data. In this way, we can transform a table-rebuilding operation
into an instant ADD COLUMN operation. Dictionary objects will only
be added to cache when table rebuilding or index creation is needed.
The ctx->instant_table will never be added to cache.
dict_table_t::add_to_cache(): Modified and renamed from
dict_table_add_to_cache(). Do not modify the table metadata.
Let the callers invoke dict_table_add_system_columns() and if needed,
set can_be_evicted.
dict_create_sys_tables_tuple(), dict_create_table_step(): Omit the
system columns (which will now exist in the dict_table_t object
already at this point).
dict_create_table_step(): Expect the callers to invoke
dict_table_add_system_columns().
pars_create_table(): Before creating the table creation execution
graph, invoke dict_table_add_system_columns().
row_create_table_for_mysql(): Expect all callers to invoke
dict_table_add_system_columns().
create_index_dict(): Replaces row_merge_create_index_graph().
innodb_update_n_cols(): Renamed from innobase_update_n_virtual().
Call my_error() if an error occurs.
btr_cur_instant_init(), btr_cur_instant_init_low(),
btr_cur_instant_root_init():
Load additional metadata from the clustered index and set
dict_index_t::n_core_null_bytes. This is invoked
when table metadata is first loaded into the data dictionary.
dict_boot(): Initialize n_core_null_bytes for the four hard-coded
dictionary tables.
dict_create_index_step(): Initialize n_core_null_bytes. This is
executed as part of CREATE TABLE.
dict_index_build_internal_clust(): Initialize n_core_null_bytes to
NO_CORE_NULL_BYTES if table->supports_instant().
row_create_index_for_mysql(): Initialize n_core_null_bytes for
CREATE TEMPORARY TABLE.
commit_cache_norebuild(): Call the code to rename or enlarge columns
in the cache only if instant ADD COLUMN is not being used.
(Instant ADD COLUMN would copy all column metadata from
instant_table to old_table, including the names and lengths.)
PAGE_INSTANT: A new 13-bit field for storing dict_index_t::n_core_fields.
This is repurposing the 16-bit field PAGE_DIRECTION, of which only the
least significant 3 bits were used. The original byte containing
PAGE_DIRECTION will be accessible via the new constant PAGE_DIRECTION_B.
page_get_instant(), page_set_instant(): Accessors for the PAGE_INSTANT.
page_ptr_get_direction(), page_get_direction(),
page_ptr_set_direction(): Accessors for PAGE_DIRECTION.
page_direction_reset(): Reset PAGE_DIRECTION, PAGE_N_DIRECTION.
page_direction_increment(): Increment PAGE_N_DIRECTION
and set PAGE_DIRECTION.
rec_get_offsets(): Use the 'leaf' parameter for non-debug purposes,
and assume that heap_no is always set.
Initialize all dict_index_t::n_fields for ROW_FORMAT=REDUNDANT records,
even if the record contains fewer fields.
rec_offs_make_valid(): Add the parameter 'leaf'.
rec_copy_prefix_to_dtuple(): Assert that the tuple is only built
on the core fields. Instant ADD COLUMN only applies to the
clustered index, and we should never build a search key that has
more than the PRIMARY KEY and possibly DB_TRX_ID,DB_ROLL_PTR.
All these columns are always present.
dict_index_build_data_tuple(): Remove assertions that would be
duplicated in rec_copy_prefix_to_dtuple().
rec_init_offsets(): Support ROW_FORMAT=REDUNDANT records whose
number of fields is between n_core_fields and n_fields.
cmp_rec_rec_with_match(): Implement the comparison between two
MIN_REC_FLAG records.
trx_t::in_rollback: Make the field available in non-debug builds.
trx_start_for_ddl_low(): Remove dangerous error-tolerance.
A dictionary transaction must be flagged as such before it has generated
any undo log records. This is because trx_undo_assign_undo() will mark
the transaction as a dictionary transaction in the undo log header
right before the very first undo log record is being written.
btr_index_rec_validate(): Account for instant ADD COLUMN
row_undo_ins_remove_clust_rec(): On the rollback of an insert into
SYS_COLUMNS, revert instant ADD COLUMN in the cache by removing the
last column from the table and the clustered index.
row_search_on_row_ref(), row_undo_mod_parse_undo_rec(), row_undo_mod(),
trx_undo_update_rec_get_update(): Handle the 'default row'
as a special case.
dtuple_t::trim(index): Omit a redundant suffix of an index tuple right
before insert or update. After instant ADD COLUMN, if the last fields
of a clustered index tuple match the 'default row', there is no
need to store them. While trimming the entry, we must hold a page latch,
so that the table cannot be emptied and the 'default row' be deleted.
btr_cur_optimistic_update(), btr_cur_pessimistic_update(),
row_upd_clust_rec_by_insert(), row_ins_clust_index_entry_low():
Invoke dtuple_t::trim() if needed.
row_ins_clust_index_entry(): Restore dtuple_t::n_fields after calling
row_ins_clust_index_entry_low().
rec_get_converted_size(), rec_get_converted_size_comp(): Allow the number
of fields to be between n_core_fields and n_fields. Do not support
infimum,supremum. They are never supposed to be stored in dtuple_t,
because page creation nowadays uses a lower-level method for initializing
them.
rec_convert_dtuple_to_rec_comp(): Assign the status bits based on the
number of fields.
btr_cur_trim(): In an update, trim the index entry as needed. For the
'default row', handle rollback specially. For user records, omit
fields that match the 'default row'.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
Skip locking and adaptive hash index for the 'default row'.
row_log_table_apply_convert_mrec(): Replace 'default row' values if needed.
In the temporary file that is applied by row_log_table_apply(),
we must identify whether the records contain the extra header for
instantly added columns. For now, we will allocate an additional byte
for this for ROW_T_INSERT and ROW_T_UPDATE records when the source table
has been subject to instant ADD COLUMN. The ROW_T_DELETE records are
fine, as they will be converted and will only contain 'core' columns
(PRIMARY KEY and some system columns) that are converted from dtuple_t.
rec_get_converted_size_temp(), rec_init_offsets_temp(),
rec_convert_dtuple_to_temp(): Add the parameter 'status'.
REC_INFO_DEFAULT_ROW = REC_INFO_MIN_REC_FLAG | REC_STATUS_COLUMNS_ADDED:
An info_bits constant for distinguishing the 'default row' record.
rec_comp_status_t: An enum of the status bit values.
rec_leaf_format: An enum that replaces the bool parameter of
rec_init_offsets_comp_ordinary().
2017-10-06 07:00:05 +03:00
|
|
|
}
|
|
|
|
|
2018-05-07 15:30:57 +03:00
|
|
|
inline bool dict_index_t::is_corrupted() const
|
|
|
|
{
|
|
|
|
return UNIV_UNLIKELY(online_status >= ONLINE_INDEX_ABORTED
|
|
|
|
|| (type & DICT_CORRUPT)
|
|
|
|
|| (table && table->corrupted));
|
|
|
|
}
|
|
|
|
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
inline void dict_index_t::clear_instant_add()
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(is_primary());
|
|
|
|
DBUG_ASSERT(is_instant());
|
|
|
|
DBUG_ASSERT(!table->instant);
|
|
|
|
for (unsigned i = n_core_fields; i < n_fields; i++) {
|
|
|
|
fields[i].col->clear_instant();
|
|
|
|
}
|
|
|
|
n_core_fields = n_fields;
|
|
|
|
n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(n_nullable));
|
|
|
|
}
|
|
|
|
|
|
|
|
inline void dict_index_t::clear_instant_alter()
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(is_primary());
|
|
|
|
DBUG_ASSERT(n_fields == n_def);
|
|
|
|
|
|
|
|
if (!table->instant) {
|
|
|
|
if (is_instant()) {
|
|
|
|
clear_instant_add();
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef DBUG_OFF
|
|
|
|
for (unsigned i = first_user_field(); i--; ) {
|
|
|
|
DBUG_ASSERT(!fields[i].col->is_dropped());
|
|
|
|
DBUG_ASSERT(!fields[i].col->is_nullable());
|
|
|
|
}
|
|
|
|
#endif
|
2018-12-27 16:28:22 +02:00
|
|
|
const dict_col_t* ai_col = table->persistent_autoinc
|
|
|
|
? fields[table->persistent_autoinc - 1].col
|
|
|
|
: NULL;
|
2018-11-20 13:32:20 +02:00
|
|
|
dict_field_t* const begin = &fields[first_user_field()];
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
dict_field_t* end = &fields[n_fields];
|
|
|
|
|
2018-11-21 12:24:49 +02:00
|
|
|
for (dict_field_t* d = begin; d < end; ) {
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
/* Move fields for dropped columns to the end. */
|
2018-11-21 12:24:49 +02:00
|
|
|
if (!d->col->is_dropped()) {
|
|
|
|
d++;
|
|
|
|
} else {
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
if (d->col->is_nullable()) {
|
|
|
|
n_nullable--;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::swap(*d, *--end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
|
|
|
|
n_core_fields = n_fields = n_def = end - fields;
|
|
|
|
n_core_null_bytes = UT_BITS_IN_BYTES(n_nullable);
|
2018-11-20 13:32:20 +02:00
|
|
|
std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
|
2018-12-27 16:28:22 +02:00
|
|
|
{ return a.col->ind < b.col->ind; });
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
table->instant = NULL;
|
2018-12-27 16:28:22 +02:00
|
|
|
if (ai_col) {
|
|
|
|
auto a = std::find_if(begin, end,
|
|
|
|
[ai_col](const dict_field_t& f)
|
|
|
|
{ return f.col == ai_col; });
|
|
|
|
table->persistent_autoinc = (a == end) ? 0 : 1 + (a - fields);
|
|
|
|
}
|
MDEV-15662 Instant DROP COLUMN or changing the order of columns
Allow ADD COLUMN anywhere in a table, not only adding as the
last column.
Allow instant DROP COLUMN and instant changing the order of columns.
The added columns will always be added last in clustered index records.
In new records, instantly dropped columns will be stored as NULL or
empty when possible.
Information about dropped and reordered columns will be written in
a metadata BLOB (mblob), which is stored before the first 'user' field
in the hidden metadata record at the start of the clustered index.
The presence of mblob is indicated by setting the delete-mark flag in
the metadata record.
The metadata BLOB stores the number of clustered index fields,
followed by an array of column information for each field.
For dropped columns, we store the NOT NULL flag, the fixed length,
and for variable-length columns, whether the maximum length exceeded
255 bytes. For non-dropped columns, we store the column position.
Unlike with MDEV-11369, when a table becomes empty, it cannot
be converted back to the canonical format. The reason for this is
that other threads may hold cached objects such as
row_prebuilt_t::ins_node that could refer to dropped or reordered
index fields.
For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.
If the mblob is present, we will zero-initialize the strings
"infimum" and "supremum" in the root page, and use the last byte of
"supremum" for storing the number of null bytes (which are allocated
but useless on node pointer pages). This is necessary for
btr_cur_instant_init_metadata() to be able to navigate to the mblob.
If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.
Limitations (future work):
MDEV-17459 Allow instant ALTER TABLE even if FULLTEXT INDEX exists
MDEV-17468 Avoid table rebuild on operations on generated columns
MDEV-17494 Refuse ALGORITHM=INSTANT when the row size is too large
btr_page_reorganize_low(): Preserve any metadata in the root page.
Call lock_move_reorganize_page() only after restoring the "infimum"
and "supremum" records, to avoid a memcmp() assertion failure.
dict_col_t::DROPPED: Magic value for dict_col_t::ind.
dict_col_t::clear_instant(): Renamed from dict_col_t::remove_instant().
Do not assert that the column was instantly added, because we
sometimes call this unconditionally for all columns.
Convert an instantly added column to a "core column". The old name
remove_instant() could be mistaken to refer to "instant DROP COLUMN".
dict_col_t::is_added(): Rename from dict_col_t::is_instant().
dtype_t::metadata_blob_init(): Initialize the mblob data type.
dtuple_t::is_metadata(), dtuple_t::is_alter_metadata(),
upd_t::is_metadata(), upd_t::is_alter_metadata(): Check if info_bits
refer to a metadata record.
dict_table_t::instant: Metadata about dropped or reordered columns.
dict_table_t::prepare_instant(): Prepare
ha_innobase_inplace_ctx::instant_table for instant ALTER TABLE.
innobase_instant_try() will pass this to dict_table_t::instant_column().
On rollback, dict_table_t::rollback_instant() will be called.
dict_table_t::instant_column(): Renamed from instant_add_column().
Add the parameter col_map so that columns can be reordered.
Copy and adjust v_cols[] as well.
dict_table_t::find(): Find an old column based on a new column number.
dict_table_t::serialise_columns(), dict_table_t::deserialise_columns():
Convert the mblob.
dict_index_t::instant_metadata(): Create the metadata record
for instant ALTER TABLE. Invoke dict_table_t::serialise_columns().
dict_index_t::reconstruct_fields(): Invoked by
dict_table_t::deserialise_columns().
dict_index_t::clear_instant_alter(): Move the fields for the
dropped columns to the end, and sort the surviving index fields
in ascending order of column position.
ha_innobase::check_if_supported_inplace_alter(): Do not allow
adding a FTS_DOC_ID column if a hidden FTS_DOC_ID column exists
due to FULLTEXT INDEX. (This always required ALGORITHM=COPY.)
instant_alter_column_possible(): Add a parameter for InnoDB table,
to check for additional conditions, such as the maximum number of
index fields.
ha_innobase_inplace_ctx::first_alter_pos: The first column whose position
is affected by instant ADD, DROP, or changing the order of columns.
innobase_build_col_map(): Skip added virtual columns.
prepare_inplace_add_virtual(): Correctly compute num_to_add_vcol.
Remove some unnecessary code. Note that the call to
innodb_base_col_setup() should be executed later.
commit_try_norebuild(): If ctx->is_instant(), let the virtual
columns be added or dropped by innobase_instant_try().
innobase_instant_try(): Fill in a zero default value for the
hidden column FTS_DOC_ID (to reduce the work needed in MDEV-17459).
If any columns were dropped or reordered (or added not last),
delete any SYS_COLUMNS records for the following columns, and
insert SYS_COLUMNS records for all subsequent stored columns as well
as for all virtual columns. If any virtual column is dropped, rewrite
all virtual column metadata. Use a shortcut only for adding
virtual columns. This is because innobase_drop_virtual_try()
assumes that the dropped virtual columns still exist in ctx->old_table.
innodb_update_cols(): Renamed from innodb_update_n_cols().
innobase_add_one_virtual(), innobase_insert_sys_virtual(): Change
the return type to bool, and invoke my_error() when detecting an error.
innodb_insert_sys_columns(): Insert a record into SYS_COLUMNS.
Refactored from innobase_add_one_virtual() and innobase_instant_add_col().
innobase_instant_add_col(): Replace the parameter dfield with type.
innobase_instant_drop_cols(): Drop matching columns from SYS_COLUMNS
and all columns from SYS_VIRTUAL.
innobase_add_virtual_try(), innobase_drop_virtual_try(): Let
the caller invoke innodb_update_cols().
innobase_rename_column_try(): Skip dropped columns.
commit_cache_norebuild(): Update table->fts->doc_col.
dict_mem_table_col_rename_low(): Skip dropped columns.
trx_undo_rec_get_partial_row(): Skip dropped columns.
trx_undo_update_rec_get_update(): Handle the metadata BLOB correctly.
trx_undo_page_report_modify(): Avoid out-of-bounds access to record fields.
Log metadata records consistently.
Apparently, the first fields of a clustered index may be updated
in an update_undo vector when the index is ID_IND of SYS_FOREIGN,
as part of renaming the table during ALTER TABLE. Normally, updates of
the PRIMARY KEY should be logged as delete-mark and an insert.
row_undo_mod_parse_undo_rec(), row_purge_parse_undo_rec():
Use trx_undo_metadata.
row_undo_mod_clust_low(): On metadata rollback, roll back the root page too.
row_undo_mod_clust(): Relax an assertion. The delete-mark flag was
repurposed for ALTER TABLE metadata records.
row_rec_to_index_entry_impl(): Add the template parameter mblob
and the optional parameter info_bits for specifying the desired new
info bits. For the metadata tuple, allow conversion between the original
format (ADD COLUMN only) and the generic format (with hidden BLOB).
Add the optional parameter "pad" to determine whether the tuple should
be padded to the index fields (on ALTER TABLE it should), or whether
it should remain at its original size (on rollback).
row_build_index_entry_low(): Clean up the code, removing
redundant variables and conditions. For instantly dropped columns,
generate a dummy value that is NULL, the empty string, or a
fixed length of NUL bytes, depending on the type of the dropped column.
row_upd_clust_rec_by_insert_inherit_func(): On the update of PRIMARY KEY
of a record that contained a dropped column whose value was stored
externally, we will be inserting a dummy NULL or empty string value
to the field of the dropped column. The externally stored column would
eventually be dropped when purge removes the delete-marked record for
the old PRIMARY KEY value.
btr_index_rec_validate(): Recognize the metadata record.
btr_discard_only_page_on_level(): Preserve the generic instant
ALTER TABLE metadata.
btr_set_instant(): Replaces page_set_instant(). This sets a clustered
index root page to the appropriate format, or upgrades from
the MDEV-11369 instant ADD COLUMN to generic ALTER TABLE format.
btr_cur_instant_init_low(): Read and validate the metadata BLOB page
before reconstructing the dictionary information based on it.
btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.
btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.
btr_cur_trim(): Handle the mblob record.
row_metadata_to_tuple(): Convert a metadata record to a data tuple,
based on the new info_bits of the metadata record.
btr_cur_pessimistic_update(): Invoke row_metadata_to_tuple() if needed.
Invoke dtuple_convert_big_rec() for metadata records if the record is
too large, or if the mblob is not yet marked as externally stored.
btr_cur_optimistic_delete_func(), btr_cur_pessimistic_delete():
When the last user record is deleted, do not delete the
generic instant ALTER TABLE metadata record. Only delete
MDEV-11369 instant ADD COLUMN metadata records.
btr_cur_optimistic_insert(): Avoid unnecessary computation of rec_size.
btr_pcur_store_position(): Allow a logically empty page to contain
a metadata record for generic ALTER TABLE.
REC_INFO_DEFAULT_ROW_ADD: Renamed from REC_INFO_DEFAULT_ROW.
This is for the old instant ADD COLUMN (MDEV-11369) only.
REC_INFO_DEFAULT_ROW_ALTER: The more generic metadata record,
with additional information for dropped or reordered columns.
rec_info_bits_valid(): Remove. The only case when this would fail
is when the record is the generic ALTER TABLE metadata record.
rec_is_alter_metadata(): Check if a record is the metadata record
for instant ALTER TABLE (other than ADD COLUMN). NOTE: This function
must not be invoked on node pointer records, because the delete-mark
flag in those records may be set (it is garbage), and then a debug
assertion could fail because index->is_instant() does not necessarily
hold.
rec_is_add_metadata(): Check if a record is MDEV-11369 ADD COLUMN metadata
record (not more generic instant ALTER TABLE).
rec_get_converted_size_comp_prefix_low(): Assume that the metadata
field will be stored externally. In dtuple_convert_big_rec() during
the rec_get_converted_size() call, it would not be there yet.
rec_get_converted_size_comp(): Replace status,fields,n_fields with tuple.
rec_init_offsets_comp_ordinary(), rec_get_converted_size_comp_prefix_low(),
rec_convert_dtuple_to_rec_comp(): Add template<bool mblob = false>.
With mblob=true, process a record with a metadata BLOB.
rec_copy_prefix_to_buf(): Assert that no fields beyond the key and
system columns are being copied. Exclude the metadata BLOB field.
rec_convert_dtuple_to_metadata_comp(): Convert an alter metadata tuple
into a record.
row_upd_index_replace_metadata(): Apply an update vector to an
alter_metadata tuple.
row_log_allocate(): Replace dict_index_t::is_instant()
with a more appropriate condition that ignores dict_table_t::instant.
Only a table on which the MDEV-11369 ADD COLUMN was performed
can "lose its instantness" when it becomes empty. After
instant DROP COLUMN or reordering columns, we cannot simply
convert the table to the canonical format, because the data
dictionary cache and all possibly existing references to it
from other client connection threads would have to be adjusted.
row_quiesce_write_index_fields(): Do not crash when the table contains
an instantly dropped column.
Thanks to Thirunarayanan Balathandayuthapani for discussing the design
and implementing an initial prototype of this.
Thanks to Matthias Leich for testing.
2018-10-19 16:49:54 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/** @return whether the column was instantly dropped
|
|
|
|
@param[in] index the clustered index */
|
|
|
|
inline bool dict_col_t::is_dropped(const dict_index_t& index) const
|
|
|
|
{
|
|
|
|
DBUG_ASSERT(index.is_primary());
|
|
|
|
DBUG_ASSERT(!is_dropped() == !index.table->instant);
|
|
|
|
DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped
|
|
|
|
&& this < index.table->instant->dropped
|
|
|
|
+ index.table->instant->n_dropped));
|
|
|
|
return is_dropped();
|
|
|
|
}
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/*******************************************************************//**
|
|
|
|
Initialise the table lock list. */
|
|
|
|
void
|
|
|
|
lock_table_lock_list_init(
|
|
|
|
/*======================*/
|
|
|
|
table_lock_list_t* locks); /*!< List to initialise */
|
|
|
|
|
2014-09-11 10:13:35 +02:00
|
|
|
/** A function object to add the foreign key constraint to the referenced set
|
|
|
|
of the referenced table, if it exists in the dictionary cache. */
|
|
|
|
struct dict_foreign_add_to_referenced_table {
|
|
|
|
void operator()(dict_foreign_t* foreign) const
|
|
|
|
{
|
|
|
|
if (dict_table_t* table = foreign->referenced_table) {
|
|
|
|
std::pair<dict_foreign_set::iterator, bool> ret
|
|
|
|
= table->referenced_set.insert(foreign);
|
|
|
|
ut_a(ret.second);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2015-05-04 22:13:46 +02:00
|
|
|
/** Destroy the autoinc latch of the given table.
|
|
|
|
This function is only called from either single threaded environment
|
|
|
|
or from a thread that has not shared the table object with other threads.
|
|
|
|
@param[in,out] table table whose stats latch to destroy */
|
|
|
|
inline
|
|
|
|
void
|
|
|
|
dict_table_autoinc_destroy(
|
|
|
|
dict_table_t* table)
|
|
|
|
{
|
|
|
|
if (table->autoinc_mutex_created == os_once::DONE
|
|
|
|
&& table->autoinc_mutex != NULL) {
|
|
|
|
mutex_free(table->autoinc_mutex);
|
2016-08-12 11:17:45 +03:00
|
|
|
UT_DELETE(table->autoinc_mutex);
|
2015-05-04 22:13:46 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Request for lazy creation of the autoinc latch of a given table.
|
|
|
|
This function is only called from either single threaded environment
|
|
|
|
or from a thread that has not shared the table object with other threads.
|
|
|
|
@param[in,out] table table whose autoinc latch is to be created. */
|
|
|
|
inline
|
|
|
|
void
|
|
|
|
dict_table_autoinc_create_lazy(
|
|
|
|
dict_table_t* table)
|
|
|
|
{
|
|
|
|
table->autoinc_mutex = NULL;
|
|
|
|
table->autoinc_mutex_created = os_once::NEVER_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Request a lazy creation of dict_index_t::zip_pad::mutex.
|
|
|
|
This function is only called from either single threaded environment
|
|
|
|
or from a thread that has not shared the table object with other threads.
|
|
|
|
@param[in,out] index index whose zip_pad mutex is to be created */
|
|
|
|
inline
|
|
|
|
void
|
|
|
|
dict_index_zip_pad_mutex_create_lazy(
|
|
|
|
dict_index_t* index)
|
|
|
|
{
|
|
|
|
index->zip_pad.mutex = NULL;
|
|
|
|
index->zip_pad.mutex_created = os_once::NEVER_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Destroy the zip_pad_mutex of the given index.
|
|
|
|
This function is only called from either single threaded environment
|
|
|
|
or from a thread that has not shared the table object with other threads.
|
|
|
|
@param[in,out] table table whose stats latch to destroy */
|
|
|
|
inline
|
|
|
|
void
|
|
|
|
dict_index_zip_pad_mutex_destroy(
|
|
|
|
dict_index_t* index)
|
|
|
|
{
|
|
|
|
if (index->zip_pad.mutex_created == os_once::DONE
|
|
|
|
&& index->zip_pad.mutex != NULL) {
|
2016-08-12 11:17:45 +03:00
|
|
|
mutex_free(index->zip_pad.mutex);
|
|
|
|
UT_DELETE(index->zip_pad.mutex);
|
2015-05-04 22:13:46 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Release the zip_pad_mutex of a given index.
|
|
|
|
@param[in,out] index index whose zip_pad_mutex is to be released */
|
|
|
|
inline
|
|
|
|
void
|
|
|
|
dict_index_zip_pad_unlock(
|
|
|
|
dict_index_t* index)
|
|
|
|
{
|
2016-08-12 11:17:45 +03:00
|
|
|
mutex_exit(index->zip_pad.mutex);
|
2015-05-04 22:13:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef UNIV_DEBUG
|
|
|
|
/** Check if the current thread owns the autoinc_mutex of a given table.
|
|
|
|
@param[in] table the autoinc_mutex belongs to this table
|
|
|
|
@return true, if the current thread owns the autoinc_mutex, false otherwise.*/
|
|
|
|
inline
|
|
|
|
bool
|
|
|
|
dict_table_autoinc_own(
|
|
|
|
const dict_table_t* table)
|
|
|
|
{
|
|
|
|
return(mutex_own(table->autoinc_mutex));
|
|
|
|
}
|
|
|
|
#endif /* UNIV_DEBUG */
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
/** Check whether the col is used in spatial index or regular index.
|
|
|
|
@param[in] col column to check
|
2016-09-06 09:43:16 +03:00
|
|
|
@return spatial status */
|
2016-08-12 11:17:45 +03:00
|
|
|
inline
|
2016-09-06 09:43:16 +03:00
|
|
|
spatial_status_t
|
2016-08-12 11:17:45 +03:00
|
|
|
dict_col_get_spatial_status(
|
|
|
|
const dict_col_t* col)
|
|
|
|
{
|
2016-09-06 09:43:16 +03:00
|
|
|
spatial_status_t spatial_status = SPATIAL_NONE;
|
2016-08-12 11:17:45 +03:00
|
|
|
|
2016-09-06 09:43:16 +03:00
|
|
|
/* Column is not a part of any index. */
|
|
|
|
if (!col->ord_part) {
|
|
|
|
return(spatial_status);
|
|
|
|
}
|
2016-08-12 11:17:45 +03:00
|
|
|
|
|
|
|
if (DATA_GEOMETRY_MTYPE(col->mtype)) {
|
|
|
|
if (col->max_prefix == 0) {
|
|
|
|
spatial_status = SPATIAL_ONLY;
|
|
|
|
} else {
|
|
|
|
/* Any regular index on a geometry column
|
|
|
|
should have a prefix. */
|
|
|
|
spatial_status = SPATIAL_MIXED;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return(spatial_status);
|
|
|
|
}
|
|
|
|
|
2017-12-06 10:32:24 +02:00
|
|
|
/** Clear defragmentation summary. */
|
|
|
|
inline void dict_stats_empty_defrag_summary(dict_index_t* index)
|
|
|
|
{
|
|
|
|
index->stat_defrag_n_pages_freed = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Clear defragmentation related index stats. */
|
|
|
|
inline void dict_stats_empty_defrag_stats(dict_index_t* index)
|
|
|
|
{
|
|
|
|
index->stat_defrag_modified_counter = 0;
|
|
|
|
index->stat_defrag_n_page_split = 0;
|
|
|
|
}
|
|
|
|
|
2014-02-26 19:11:54 +01:00
|
|
|
#include "dict0mem.ic"
|
|
|
|
|
2016-08-12 11:17:45 +03:00
|
|
|
#endif /* dict0mem_h */
|