mariadb/storage/innobase/include/mtr0mtr.h

685 lines
25 KiB
C
Raw Normal View History

/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2013, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
@file include/mtr0mtr.h
Mini-transaction buffer
Created 11/26/1995 Heikki Tuuri
*******************************************************/
#ifndef mtr0mtr_h
#define mtr0mtr_h
#include "fil0fil.h"
#include "dyn0buf.h"
/** Start a mini-transaction. */
#define mtr_start(m) (m)->start()
/** Commit a mini-transaction. */
#define mtr_commit(m) (m)->commit()
/** Set and return a savepoint in mtr.
@return savepoint */
#define mtr_set_savepoint(m) (m)->get_savepoint()
/** Release the (index tree) s-latch stored in an mtr memo after a
2014-12-22 16:53:17 +02:00
savepoint. */
#define mtr_release_s_latch_at_savepoint(m, s, l) \
(m)->release_s_latch_at_savepoint((s), (l))
/** Change the logging mode of a mini-transaction.
@return old mode */
#define mtr_set_log_mode(m, d) (m)->set_log_mode((d))
/** Release an object in the memo stack.
@return true if released */
#define mtr_memo_release(m, o, t) \
(m)->memo_release((o), (t))
/** Print info of an mtr handle. */
#define mtr_print(m) (m)->print()
/** Return the log object of a mini-transaction buffer.
@return log */
#define mtr_get_log(m) (m)->get_log()
/** Push an object to an mtr memo stack. */
#define mtr_memo_push(m, o, t) (m)->memo_push(o, t)
#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__)
#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__)
#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__)
#define mtr_release_block_at_savepoint(m, s, b) \
(m)->release_block_at_savepoint((s), (b))
#define mtr_block_sx_latch_at_savepoint(m, s, b) \
(m)->sx_latch_at_savepoint((s), (b))
#define mtr_block_x_latch_at_savepoint(m, s, b) \
(m)->x_latch_at_savepoint((s), (b))
/** Check if a mini-transaction is dirtying a clean page.
@param b block being x-fixed
@return true if the mtr is dirtying a clean page. */
#define mtr_block_dirtied(b) mtr_t::is_block_dirtied((b))
/** Mini-transaction memo stack slot. */
struct mtr_memo_slot_t {
/** pointer to the object */
void* object;
2019-11-15 14:55:38 +02:00
/** type of the stored object */
mtr_memo_type_t type;
};
/** Mini-transaction handle and buffer */
struct mtr_t {
/** Start a mini-transaction. */
void start();
/** Commit the mini-transaction. */
void commit();
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
The caller must hold log_sys.mutex.
This is to be used at log_checkpoint().
@param checkpoint_lsn the log sequence number of a checkpoint, or 0 */
void commit_files(lsn_t checkpoint_lsn= 0);
/** @return mini-transaction savepoint (current size of m_memo) */
ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); }
/** Release the (index tree) s-latch stored in an mtr memo after a
savepoint.
@param savepoint value returned by @see set_savepoint.
@param lock latch to release */
inline void release_s_latch_at_savepoint(
ulint savepoint,
rw_lock_t* lock);
/** Release the block in an mtr memo after a savepoint. */
inline void release_block_at_savepoint(
ulint savepoint,
buf_block_t* block);
/** SX-latch a not yet latched block after a savepoint. */
inline void sx_latch_at_savepoint(ulint savepoint, buf_block_t* block);
/** X-latch a not yet latched block after a savepoint. */
inline void x_latch_at_savepoint(ulint savepoint, buf_block_t* block);
/** @return the logging mode */
mtr_log_t get_log_mode() const
{
static_assert(MTR_LOG_ALL == 0, "efficiency");
ut_ad(m_log_mode <= MTR_LOG_NO_REDO);
return static_cast<mtr_log_t>(m_log_mode);
}
/** Change the logging mode.
@param mode logging mode
@return old mode */
mtr_log_t set_log_mode(mtr_log_t mode)
{
const mtr_log_t old_mode= get_log_mode();
m_log_mode= mode & 3;
return old_mode;
}
2020-11-13 22:06:50 +02:00
/** Check if we are holding a block latch in exclusive mode
@param block buffer pool block to search for */
bool have_x_latch(const buf_block_t &block) const;
/** Copy the tablespaces associated with the mini-transaction
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
(needed for generating FILE_MODIFY records)
@param[in] mtr mini-transaction that may modify
the same set of tablespaces as this one */
void set_spaces(const mtr_t& mtr)
{
ut_ad(!m_user_space_id);
ut_ad(!m_user_space);
ut_d(m_user_space_id = mtr.m_user_space_id);
m_user_space = mtr.m_user_space;
}
/** Set the tablespace associated with the mini-transaction
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
(needed for generating a FILE_MODIFY record)
@param[in] space_id user or system tablespace ID
@return the tablespace */
MDEV-12266: Change dict_table_t::space to fil_space_t* InnoDB always keeps all tablespaces in the fil_system cache. The fil_system.LRU is only for closing file handles; the fil_space_t and fil_node_t for all data files will remain in main memory. Between startup to shutdown, they can only be created and removed by DDL statements. Therefore, we can let dict_table_t::space point directly to the fil_space_t. dict_table_t::space_id: A numeric tablespace ID for the corner cases where we do not have a tablespace. The most prominent examples are ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file. There are a few functional differences; most notably: (1) DROP TABLE will delete matching .ibd and .cfg files, even if they were not attached to the data dictionary. (2) Some error messages will report file names instead of numeric IDs. There still are many functions that use numeric tablespace IDs instead of fil_space_t*, and many functions could be converted to fil_space_t member functions. Also, Tablespace and Datafile should be merged with fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use fil_space_t& instead of a numeric ID, and after moving to a single buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to fil_space_t::page_hash. FilSpace: Remove. Only few calls to fil_space_acquire() will remain, and gradually they should be removed. mtr_t::set_named_space_id(ulint): Renamed from set_named_space(), to prevent accidental calls to this slower function. Very few callers remain. fseg_create(), fsp_reserve_free_extents(): Take fil_space_t* as a parameter instead of a space_id. fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(), fil_name_write_rename(), fil_rename_tablespace(). Mariabackup passes the parameter log=false; InnoDB passes log=true. dict_mem_table_create(): Take fil_space_t* instead of space_id as parameter. dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter 'status' with 'bool cached'. dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name. fil_ibd_open(): Return the tablespace. fil_space_t::set_imported(): Replaces fil_space_set_imported(). truncate_t: Change many member function parameters to fil_space_t*, and remove page_size parameters. row_truncate_prepare(): Merge to its only caller. row_drop_table_from_cache(): Assert that the table is persistent. dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL if the tablespace has been discarded. row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
fil_space_t* set_named_space_id(ulint space_id)
{
ut_ad(!m_user_space_id);
ut_d(m_user_space_id = static_cast<uint32_t>(space_id));
2018-02-06 14:50:50 +01:00
if (!space_id) {
return fil_system.sys_space;
} else {
2019-11-12 16:30:57 +02:00
ut_ad(m_user_space_id == space_id);
ut_ad(!m_user_space);
m_user_space = fil_space_get(space_id);
ut_ad(m_user_space);
return m_user_space;
}
}
/** Set the tablespace associated with the mini-transaction
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
(needed for generating a FILE_MODIFY record)
@param[in] space user or system tablespace */
void set_named_space(fil_space_t* space)
{
2019-11-12 16:30:57 +02:00
ut_ad(!m_user_space_id);
ut_d(m_user_space_id = static_cast<uint32_t>(space->id));
if (space->id) {
2019-11-12 16:30:57 +02:00
m_user_space = space;
}
}
#ifdef UNIV_DEBUG
/** Check the tablespace associated with the mini-transaction
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
(needed for generating a FILE_MODIFY record)
@param[in] space tablespace
@return whether the mini-transaction is associated with the space */
bool is_named_space(ulint space) const;
MDEV-12266: Change dict_table_t::space to fil_space_t* InnoDB always keeps all tablespaces in the fil_system cache. The fil_system.LRU is only for closing file handles; the fil_space_t and fil_node_t for all data files will remain in main memory. Between startup to shutdown, they can only be created and removed by DDL statements. Therefore, we can let dict_table_t::space point directly to the fil_space_t. dict_table_t::space_id: A numeric tablespace ID for the corner cases where we do not have a tablespace. The most prominent examples are ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file. There are a few functional differences; most notably: (1) DROP TABLE will delete matching .ibd and .cfg files, even if they were not attached to the data dictionary. (2) Some error messages will report file names instead of numeric IDs. There still are many functions that use numeric tablespace IDs instead of fil_space_t*, and many functions could be converted to fil_space_t member functions. Also, Tablespace and Datafile should be merged with fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use fil_space_t& instead of a numeric ID, and after moving to a single buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to fil_space_t::page_hash. FilSpace: Remove. Only few calls to fil_space_acquire() will remain, and gradually they should be removed. mtr_t::set_named_space_id(ulint): Renamed from set_named_space(), to prevent accidental calls to this slower function. Very few callers remain. fseg_create(), fsp_reserve_free_extents(): Take fil_space_t* as a parameter instead of a space_id. fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(), fil_name_write_rename(), fil_rename_tablespace(). Mariabackup passes the parameter log=false; InnoDB passes log=true. dict_mem_table_create(): Take fil_space_t* instead of space_id as parameter. dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter 'status' with 'bool cached'. dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name. fil_ibd_open(): Return the tablespace. fil_space_t::set_imported(): Replaces fil_space_set_imported(). truncate_t: Change many member function parameters to fil_space_t*, and remove page_size parameters. row_truncate_prepare(): Merge to its only caller. row_drop_table_from_cache(): Assert that the table is persistent. dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL if the tablespace has been discarded. row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
/** Check the tablespace associated with the mini-transaction
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
(needed for generating a FILE_MODIFY record)
MDEV-12266: Change dict_table_t::space to fil_space_t* InnoDB always keeps all tablespaces in the fil_system cache. The fil_system.LRU is only for closing file handles; the fil_space_t and fil_node_t for all data files will remain in main memory. Between startup to shutdown, they can only be created and removed by DDL statements. Therefore, we can let dict_table_t::space point directly to the fil_space_t. dict_table_t::space_id: A numeric tablespace ID for the corner cases where we do not have a tablespace. The most prominent examples are ALTER TABLE...DISCARD TABLESPACE or a missing or corrupted file. There are a few functional differences; most notably: (1) DROP TABLE will delete matching .ibd and .cfg files, even if they were not attached to the data dictionary. (2) Some error messages will report file names instead of numeric IDs. There still are many functions that use numeric tablespace IDs instead of fil_space_t*, and many functions could be converted to fil_space_t member functions. Also, Tablespace and Datafile should be merged with fil_space_t and fil_node_t. page_id_t and buf_page_get_gen() could use fil_space_t& instead of a numeric ID, and after moving to a single buffer pool (MDEV-15058), buf_pool_t::page_hash could be moved to fil_space_t::page_hash. FilSpace: Remove. Only few calls to fil_space_acquire() will remain, and gradually they should be removed. mtr_t::set_named_space_id(ulint): Renamed from set_named_space(), to prevent accidental calls to this slower function. Very few callers remain. fseg_create(), fsp_reserve_free_extents(): Take fil_space_t* as a parameter instead of a space_id. fil_space_t::rename(): Wrapper for fil_rename_tablespace_check(), fil_name_write_rename(), fil_rename_tablespace(). Mariabackup passes the parameter log=false; InnoDB passes log=true. dict_mem_table_create(): Take fil_space_t* instead of space_id as parameter. dict_process_sys_tables_rec_and_mtr_commit(): Replace the parameter 'status' with 'bool cached'. dict_get_and_save_data_dir_path(): Avoid copying the fil_node_t::name. fil_ibd_open(): Return the tablespace. fil_space_t::set_imported(): Replaces fil_space_set_imported(). truncate_t: Change many member function parameters to fil_space_t*, and remove page_size parameters. row_truncate_prepare(): Merge to its only caller. row_drop_table_from_cache(): Assert that the table is persistent. dict_create_sys_indexes_tuple(): Write SYS_INDEXES.SPACE=FIL_NULL if the tablespace has been discarded. row_import_update_discarded_flag(): Remove a constant parameter.
2018-03-27 16:31:10 +03:00
@param[in] space tablespace
@return whether the mini-transaction is associated with the space */
bool is_named_space(const fil_space_t* space) const;
#endif /* UNIV_DEBUG */
/** Acquire a tablespace X-latch.
@param[in] space_id tablespace ID
@return the tablespace object (never NULL) */
fil_space_t* x_lock_space(ulint space_id);
/** Acquire a shared rw-latch.
@param[in] lock rw-latch
@param[in] file file name from where called
@param[in] line line number in file */
void s_lock(rw_lock_t* lock, const char* file, unsigned line)
{
rw_lock_s_lock_inline(lock, 0, file, line);
memo_push(lock, MTR_MEMO_S_LOCK);
}
/** Acquire an exclusive rw-latch.
@param[in] lock rw-latch
@param[in] file file name from where called
@param[in] line line number in file */
void x_lock(rw_lock_t* lock, const char* file, unsigned line)
{
rw_lock_x_lock_inline(lock, 0, file, line);
memo_push(lock, MTR_MEMO_X_LOCK);
}
/** Acquire an shared/exclusive rw-latch.
@param[in] lock rw-latch
@param[in] file file name from where called
@param[in] line line number in file */
void sx_lock(rw_lock_t* lock, const char* file, unsigned line)
{
rw_lock_sx_lock_inline(lock, 0, file, line);
memo_push(lock, MTR_MEMO_SX_LOCK);
}
/** Acquire a tablespace S-latch.
@param[in] space tablespace */
void s_lock_space(fil_space_t* space)
{
ut_ad(space->purpose == FIL_TYPE_TEMPORARY
|| space->purpose == FIL_TYPE_IMPORT
|| space->purpose == FIL_TYPE_TABLESPACE);
memo_push(space, MTR_MEMO_SPACE_S_LOCK);
space->s_lock();
}
/** Acquire a tablespace X-latch.
@param[in] space tablespace */
void x_lock_space(fil_space_t* space);
/** Release an object in the memo stack.
@param object object
2019-11-15 14:55:38 +02:00
@param type object type
@return bool if lock released */
bool memo_release(const void* object, ulint type);
/** Release a page latch.
@param[in] ptr pointer to within a page frame
@param[in] type object type: MTR_MEMO_PAGE_X_FIX, ... */
void release_page(const void* ptr, mtr_memo_type_t type);
private:
/** Note that the mini-transaction will modify data. */
void flag_modified() { m_modifications = true; }
/** Mark the given latched page as modified.
@param block page that will be modified */
void modify(const buf_block_t& block);
public:
/** Note that the mini-transaction will modify a block. */
void set_modified(const buf_block_t &block)
{ flag_modified(); if (m_log_mode != MTR_LOG_NONE) modify(block); }
/** Set the state to not-modified. This will not log the changes.
This is only used during redo log apply, to avoid logging the changes. */
void discard_modifications() { m_modifications = false; }
/** Get the LSN of commit().
@return the commit LSN
@retval 0 if the transaction only modified temporary tablespaces */
lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; }
/** Note that we are inside the change buffer code. */
void enter_ibuf() { m_inside_ibuf= true; }
/** Note that we have exited from the change buffer code. */
void exit_ibuf() { m_inside_ibuf= false; }
/** @return true if we are inside the change buffer code */
bool is_inside_ibuf() const { return m_inside_ibuf; }
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
/** Note that system tablespace page has been freed. */
void freed_system_tablespace_page() { m_freed_in_system_tablespace= true; }
/** Note that pages has been trimed */
void set_trim_pages() { m_trim_pages= true; }
/** @return true if pages has been trimed */
bool is_trim_pages() { return m_trim_pages; }
/** @return whether a page_compressed table was modified */
bool is_page_compressed() const
{
#if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32)
return m_user_space && m_user_space->is_compressed();
#else
return false;
#endif
}
/** Check if we are holding tablespace latch
@param space tablespace to search for
@param shared whether to look for shared latch, instead of exclusive
@return whether space.latch is being held */
bool memo_contains(const fil_space_t& space, bool shared= false)
MY_ATTRIBUTE((warn_unused_result));
#ifdef UNIV_DEBUG
/** Check if we are holding an rw-latch in this mini-transaction
@param lock latch to search for
@param type held latch type
@return whether (lock,type) is contained */
bool memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
MY_ATTRIBUTE((warn_unused_result));
/** Check if memo contains the given item.
@param object object to search
@param flags specify types of object (can be ORred) of
MTR_MEMO_PAGE_S_FIX ... values
@return true if contains */
bool memo_contains_flagged(const void* ptr, ulint flags) const;
/** Check if memo contains the given page.
@param[in] ptr pointer to within buffer frame
@param[in] flags specify types of object with OR of
MTR_MEMO_PAGE_S_FIX... values
@return the block
@retval NULL if not found */
buf_block_t* memo_contains_page_flagged(
const byte* ptr,
ulint flags) const;
/** Print info of an mtr handle. */
void print() const;
/** @return true if mini-transaction contains modifications. */
bool has_modifications() const { return m_modifications; }
/** @return the memo stack */
const mtr_buf_t* get_memo() const { return &m_memo; }
/** @return the memo stack */
mtr_buf_t* get_memo() { return &m_memo; }
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
/** @return true if system tablespace page has been freed */
bool is_freed_system_tablespace_page()
{
return m_freed_in_system_tablespace;
}
#endif /* UNIV_DEBUG */
/** @return true if a record was added to the mini-transaction */
bool is_dirty() const { return m_made_dirty; }
/** Get the buffered redo log of this mini-transaction.
@return redo log */
const mtr_buf_t* get_log() const { return &m_log; }
/** Get the buffered redo log of this mini-transaction.
@return redo log */
mtr_buf_t* get_log() { return &m_log; }
/** Push an object to an mtr memo stack.
@param object object
@param type object type: MTR_MEMO_S_LOCK, ... */
inline void memo_push(void* object, mtr_memo_type_t type);
/** Check if this mini-transaction is dirtying a clean page.
@param block block being x-fixed
@return true if the mtr is dirtying a clean page. */
static inline bool is_block_dirtied(const buf_block_t* block)
MY_ATTRIBUTE((warn_unused_result));
/** Write request types */
enum write_type
{
/** the page is guaranteed to always change */
NORMAL= 0,
/** optional: the page contents might not change */
MAYBE_NOP,
/** force a write, even if the page contents is not changing */
FORCED
};
/** Write 1, 2, 4, or 8 bytes to a file page.
@param[in] block file page
@param[in,out] ptr pointer in file page
@param[in] val value to write
@tparam l number of bytes to write
@tparam w write request type
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@tparam V type of val
@return whether any log was written */
template<unsigned l,write_type w= NORMAL,typename V>
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
inline bool write(const buf_block_t &block, void *ptr, V val)
MY_ATTRIBUTE((nonnull));
/** Log a write of a byte string to a page.
@param[in] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write */
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
/** Write a byte string to a page.
@param[in,out] b buffer page
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param[in] dest destination within b.frame
@param[in] str the data to write
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param[in] len length of the data to write
@tparam w write request type */
template<write_type w= NORMAL>
inline void memcpy(const buf_block_t &b, void *dest, const void *str,
ulint len);
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
/** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
@param[in] b ROW_FORMAT=COMPRESSED index page
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param[in] offset byte offset from b.zip.data
@param[in] len length of the data to write */
inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
@param[in] b ROW_FORMAT=COMPRESSED index page
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param[in] dest destination within b.zip.data
@param[in] str the data to write
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param[in] len length of the data to write
@tparam w write request type */
template<write_type w= NORMAL>
inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
ulint len);
/** Log an initialization of a string of bytes.
@param[in] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write
@param[in] val the data byte to write */
inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
/** Initialize a string of bytes.
@param[in,out] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write
@param[in] val the data byte to write */
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
/** Log an initialization of a repeating string of bytes.
@param[in] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write, in bytes
@param[in] str the string to write
@param[in] size size of str, in bytes */
inline void memset(const buf_block_t &b, ulint ofs, size_t len,
const void *str, size_t size);
/** Initialize a repeating string of bytes.
@param[in,out] b buffer page
@param[in] ofs byte offset from b->frame
@param[in] len length of the data to write, in bytes
@param[in] str the string to write
@param[in] size size of str, in bytes */
inline void memset(const buf_block_t *b, ulint ofs, size_t len,
const void *str, size_t size);
/** Log that a string of bytes was copied from the same page.
@param[in] b buffer page
@param[in] d destination offset within the page
@param[in] s source offset within the page
@param[in] len length of the data to copy */
inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
/** Initialize an entire page.
@param[in,out] b buffer page */
void init(buf_block_t *b);
/** Free a page.
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
@param[in] space tablespace contains page to be freed
@param[in] offset page offset to be freed */
inline void free(fil_space_t &space, uint32_t offset);
/** Write log for partly initializing a B-tree or R-tree page.
@param block B-tree or R-tree page
@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
inline void page_create(const buf_block_t &block, bool comp);
MDEV-21724: Optimize page_cur_insert_low() redo logging Inserting a record into an index page involves updating multiple fields in the page header as well as updating the next-record links and potentially updating fields related to the sparse page directory. Let us cover the insert operations by higher-level log records, to avoid 'redundant' logging about the writes. The code for applying the high-level log records will check the consistency of the page thoroughly, to avoid crashes during recovery. We will refuse to replay the inserts if any inconsistency is detected. With innodb_force_recovery=1, recovery will continue, but the affected pages may be more inconsistent if some changes were omitted. mrec_ext_t: Introduce the EXTENDED record subtypes INSERT_HEAP_REDUNDANT, INSERT_REUSE_REDUNDANT, INSERT_HEAP_DYNAMIC, INSERT_REUSE_DYNAMIC. The record will explicitly identify the page type and whether the space will be allocated from PAGE_HEAP_TOP or reused from the PAGE_FREE list. It will also tell how many bytes to copy from the preceding record header and payload, and how to initialize the rest of the record header and payload. mtr_t::page_insert(): Write the high-level log records. log_phys_t::apply(): Parse the high-level log records. page_apply_insert_redundant(), page_apply_insert_dynamic(): Apply the high-level log records. page_dir_split_slot(): Introduce a variant that does not write log nor deal with ROW_FORMAT=COMPRESSED pages. page_mem_alloc_heap(): Remove the mtr_t parameter page_cur_insert_rec_low(): Write log only via mtr_t::page_insert().
2020-02-27 17:19:44 +02:00
/** Write log for inserting a B-tree or R-tree record in
ROW_FORMAT=REDUNDANT.
@param block B-tree or R-tree page
@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
@param prev_rec byte offset of the predecessor of the record to insert,
starting from PAGE_OLD_INFIMUM
@param info_bits info_bits of the record
@param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag()
@param hdr_c number of common record header bytes with prev_rec
@param data_c number of common data bytes with prev_rec
@param hdr record header bytes to copy to the log
@param hdr_l number of copied record header bytes
@param data record payload bytes to copy to the log
@param data_l number of copied record data bytes */
inline void page_insert(const buf_block_t &block, bool reuse,
ulint prev_rec, byte info_bits,
ulint n_fields_s, size_t hdr_c, size_t data_c,
const byte *hdr, size_t hdr_l,
const byte *data, size_t data_l);
/** Write log for inserting a B-tree or R-tree record in
ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC.
@param block B-tree or R-tree page
@param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
@param prev_rec byte offset of the predecessor of the record to insert,
starting from PAGE_NEW_INFIMUM
@param info_status rec_get_info_and_status_bits()
@param shift unless !reuse: number of bytes the PAGE_FREE is moving
@param hdr_c number of common record header bytes with prev_rec
@param data_c number of common data bytes with prev_rec
@param hdr record header bytes to copy to the log
@param hdr_l number of copied record header bytes
@param data record payload bytes to copy to the log
@param data_l number of copied record data bytes */
inline void page_insert(const buf_block_t &block, bool reuse,
ulint prev_rec, byte info_status,
ssize_t shift, size_t hdr_c, size_t data_c,
const byte *hdr, size_t hdr_l,
const byte *data, size_t data_l);
/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
@param block B-tree or R-tree page
@param prev_rec byte offset of the predecessor of the record to delete,
starting from PAGE_OLD_INFIMUM */
inline void page_delete(const buf_block_t &block, ulint prev_rec);
/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
@param block B-tree or R-tree page
@param prev_rec byte offset of the predecessor of the record to delete,
starting from PAGE_NEW_INFIMUM
@param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES
@param data_size data payload size, in bytes */
inline void page_delete(const buf_block_t &block, ulint prev_rec,
size_t hdr_size, size_t data_size);
/** Write log for initializing an undo log page.
@param block undo page */
inline void undo_create(const buf_block_t &block);
/** Write log for appending an undo log record.
@param block undo page
@param data record within the undo page
@param len length of the undo record, in bytes */
inline void undo_append(const buf_block_t &block,
const void *data, size_t len);
/** Trim the end of a tablespace.
@param id first page identifier that will not be in the file */
inline void trim_pages(const page_id_t id);
/** Write a log record about a file operation.
@param type file operation
@param space_id tablespace identifier
@param path file path
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param new_path new file path for type=FILE_RENAME */
inline void log_file_op(mfile_type_t type, ulint space_id,
const char *path,
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
const char *new_path= nullptr);
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
/** Add freed page numbers to freed_pages */
void add_freed_offset(page_id_t id)
{
ut_ad(m_user_space == NULL || id.space() == m_user_space->id);
if (!m_freed_pages)
m_freed_pages= new range_set();
m_freed_pages->add_value(id.page_no());
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
}
2020-09-09 16:57:30 +03:00
/** Determine the added buffer fix count of a block.
@param block block to be checked
@return number of buffer count added by this mtr */
uint32_t get_fix_count(const buf_block_t *block) const;
private:
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
/** Log a write of a byte string to a page.
@param block buffer page
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param offset byte offset within page
@param data data to be written
@param len length of the data, in bytes */
inline void memcpy_low(const buf_block_t &block, uint16_t offset,
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
const void *data, size_t len);
/**
Write a log record.
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@tparam type redo log record type
@param id persistent page identifier
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
@param bpage buffer pool page, or nullptr
@param len number of additional bytes to write
@param alloc whether to allocate the additional bytes
@param offset byte offset, or 0 if the record type does not allow one
@return end of mini-transaction log, minus len */
template<byte type>
inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
size_t len= 0, bool alloc= false, size_t offset= 0);
/** Write an EXTENDED log record.
@param block buffer pool page
@param type extended record subtype; @see mrec_ext_t */
inline void log_write_extended(const buf_block_t &block, byte type);
/** Prepare to write the mini-transaction log to the redo log buffer.
@return number of bytes to write in finish_write() */
inline ulint prepare_write();
/** Append the redo log records to the redo log buffer.
@param len number of bytes to write
MDEV-23855: Improve InnoDB log checkpoint performance After MDEV-15053, MDEV-22871, MDEV-23399 shifted the scalability bottleneck, log checkpoints became a new bottleneck. If innodb_io_capacity is set low or innodb_max_dirty_pct_lwm is set high and the workload fits in the buffer pool, the page cleaner thread will perform very little flushing. When we reach the capacity of the circular redo log file ib_logfile0 and must initiate a checkpoint, some 'furious flushing' will be necessary. (If innodb_flush_sync=OFF, then flushing would continue at the innodb_io_capacity rate, and writers would be throttled.) We have the best chance of advancing the checkpoint LSN immediately after a page flush batch has been completed. Hence, it is best to perform checkpoints after every batch in the page cleaner thread, attempting to run once per second. By initiating high-priority flushing in the page cleaner as early as possible, we aim to make the throughput more stable. The function buf_flush_wait_flushed() used to sleep for 10ms, hoping that the page cleaner thread would do something during that time. The observed end result was that a large number of threads that call log_free_check() would end up sleeping while nothing useful is happening. We will revise the design so that in the default innodb_flush_sync=ON mode, buf_flush_wait_flushed() will wake up the page cleaner thread to perform the necessary flushing, and it will wait for a signal from the page cleaner thread. If innodb_io_capacity is set to a low value (causing the page cleaner to throttle its work), a write workload would initially perform well, until the capacity of the circular ib_logfile0 is reached and log_free_check() will trigger checkpoints. At that point, the extra waiting in buf_flush_wait_flushed() will start reducing throughput. The page cleaner thread will also initiate log checkpoints after each buf_flush_lists() call, because that is the best point of time for the checkpoint LSN to advance by the maximum amount. Even in 'furious flushing' mode we invoke buf_flush_lists() with innodb_io_capacity_max pages at a time, and at the start of each batch (in the log_flush() callback function that runs in a separate task) we will invoke os_aio_wait_until_no_pending_writes(). This tweak allows the checkpoint to advance in smaller steps and significantly reduces the maximum latency. On an Intel Optane 960 NVMe SSD on Linux, it reduced from 4.6 seconds to 74 milliseconds. On Microsoft Windows with a slower SSD, it reduced from more than 180 seconds to 0.6 seconds. We will make innodb_adaptive_flushing=OFF simply flush innodb_io_capacity per second whenever the dirty proportion of buffer pool pages exceeds innodb_max_dirty_pages_pct_lwm. For innodb_adaptive_flushing=ON we try to make page_cleaner_flush_pages_recommendation() more consistent and predictable: if we are below innodb_adaptive_flushing_lwm, let us flush pages according to the return value of af_get_pct_for_dirty(). innodb_max_dirty_pages_pct_lwm: Revert the change of the default value that was made in MDEV-23399. The value innodb_max_dirty_pages_pct_lwm=0 guarantees that a shutdown of an idle server will be fast. Users might be surprised if normal shutdown suddenly became slower when upgrading within a GA release series. innodb_checkpoint_usec: Remove. The master task will no longer perform periodic log checkpoints. It is the duty of the page cleaner thread. log_sys.max_modified_age: Remove. The current span of the buf_pool.flush_list expressed in LSN only matters for adaptive flushing (outside the 'furious flushing' condition). For the correctness of checkpoints, the only thing that matters is the checkpoint age (log_sys.lsn - log_sys.last_checkpoint_lsn). This run-time constant was also reported as log_max_modified_age_sync. log_sys.max_checkpoint_age_async: Remove. This does not serve any purpose, because the checkpoints will now be triggered by the page cleaner thread. We will retain the log_sys.max_checkpoint_age limit for engaging 'furious flushing'. page_cleaner.slot: Remove. It turns out that page_cleaner_slot.flush_list_time was duplicating page_cleaner.slot.flush_time and page_cleaner.slot.flush_list_pass was duplicating page_cleaner.flush_pass. Likewise, there were some redundant monitor counters, because the page cleaner thread no longer performs any buf_pool.LRU flushing, and because there only is one buf_flush_page_cleaner thread. buf_flush_sync_lsn: Protect writes by buf_pool.flush_list_mutex. buf_pool_t::get_oldest_modification(): Add a parameter to specify the return value when no persistent data pages are dirty. Require the caller to hold buf_pool.flush_list_mutex. log_buf_pool_get_oldest_modification(): Take the fall-back LSN as a parameter. All callers will also invoke log_sys.get_lsn(). log_preflush_pool_modified_pages(): Replaced with buf_flush_wait_flushed(). buf_flush_wait_flushed(): Implement two limits. If not enough buffer pool has been flushed, signal the page cleaner (unless innodb_flush_sync=OFF) and wait for the page cleaner to complete. If the page cleaner thread is not running (which can be the case durign shutdown), initiate the flush and wait for it directly. buf_flush_ahead(): If innodb_flush_sync=ON (the default), submit a new buf_flush_sync_lsn target for the page cleaner but do not wait for the flushing to finish. log_get_capacity(), log_get_max_modified_age_async(): Remove, to make it easier to see that af_get_pct_for_lsn() is not acquiring any mutexes. page_cleaner_flush_pages_recommendation(): Protect all access to buf_pool.flush_list with buf_pool.flush_list_mutex. Previously there were some race conditions in the calculation. buf_flush_sync_for_checkpoint(): New function to process buf_flush_sync_lsn in the page cleaner thread. At the end of each batch, we try to wake up any blocked buf_flush_wait_flushed(). If everything up to buf_flush_sync_lsn has been flushed, we will reset buf_flush_sync_lsn=0. The page cleaner thread will keep 'furious flushing' until the limit is reached. Any threads that are waiting in buf_flush_wait_flushed() will be able to resume as soon as their own limit has been satisfied. buf_flush_page_cleaner: Prioritize buf_flush_sync_lsn and do not sleep as long as it is set. Do not update any page_cleaner statistics for this special mode of operation. In the normal mode (buf_flush_sync_lsn is not set for innodb_flush_sync=ON), try to wake up once per second. No longer check whether srv_inc_activity_count() has been called. After each batch, try to perform a log checkpoint, because the best chances for the checkpoint LSN to advance by the maximum amount are upon completing a flushing batch. log_t: Move buf_free, max_buf_free possibly to the same cache line with log_sys.mutex. log_margin_checkpoint_age(): Simplify the logic, and replace a 0.1-second sleep with a call to buf_flush_wait_flushed() to initiate flushing. Moved to the same compilation unit with the only caller. log_close(): Clean up the calculations. (Should be no functional change.) Return whether flush-ahead is needed. Moved to the same compilation unit with the only caller. mtr_t::finish_write(): Return whether flush-ahead is needed. mtr_t::commit(): Invoke buf_flush_ahead() when needed. Let us avoid external calls in mtr_t::commit() and make the logic easier to follow by having related code in a single compilation unit. Also, we will invoke srv_stats.log_write_requests.inc() only once per mini-transaction commit, while not holding mutexes. log_checkpoint_margin(): Only care about log_sys.max_checkpoint_age. Upon reaching log_sys.max_checkpoint_age where we must wait to prevent the log from getting corrupted, let us wait for at most 1MiB of LSN at a time, before rechecking the condition. This should allow writers to proceed even if the redo log capacity has been reached and 'furious flushing' is in progress. We no longer care about log_sys.max_modified_age_sync or log_sys.max_modified_age_async. The log_sys.max_modified_age_sync could be a relic from the time when there was a srv_master_thread that wrote dirty pages to data files. Also, we no longer have any log_sys.max_checkpoint_age_async limit, because log checkpoints will now be triggered by the page cleaner thread upon completing buf_flush_lists(). log_set_capacity(): Simplify the calculations of the limit (no functional change). log_checkpoint_low(): Split from log_checkpoint(). Moved to the same compilation unit with the caller. log_make_checkpoint(): Only wait for everything to be flushed until the current LSN. create_log_file(): After checkpoint, invoke log_write_up_to() to ensure that the FILE_CHECKPOINT record has been written. This avoids ut_ad(!srv_log_file_created) in create_log_file_rename(). srv_start(): Do not call recv_recovery_from_checkpoint_start() if the log has just been created. Set fil_system.space_id_reuse_warned before dict_boot() has been executed, and clear it after recovery has finished. dict_boot(): Initialize fil_system.max_assigned_id. srv_check_activity(): Remove. The activity count is counting transaction commits and therefore mostly interesting for the purge of history. BtrBulk::insert(): Do not explicitly wake up the page cleaner, but do invoke srv_inc_activity_count(), because that counter is still being used in buf_load_throttle_if_needed() for some heuristics. (It might be cleaner to execute buf_load() in the page cleaner thread!) Reviewed by: Vladislav Vaintroub
2020-10-26 16:35:47 +02:00
@return {start_lsn,flush_ahead} */
inline std::pair<lsn_t,bool> finish_write(ulint len);
/** Release the resources */
inline void release_resources();
#ifdef UNIV_DEBUG
public:
/** @return whether the mini-transaction is active */
bool is_active() const
{ ut_ad(!m_commit || m_start); return m_start && !m_commit; }
/** @return whether the mini-transaction has been committed */
bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
private:
/** whether start() has been called */
bool m_start= false;
/** whether commit() has been called */
bool m_commit= false;
#endif
MDEV-12353: Change the redo log encoding log_t::FORMAT_10_5: physical redo log format tag log_phys_t: Buffered records in the physical format. The log record bytes will follow the last data field, making use of alignment padding that would otherwise be wasted. If there are multiple records for the same page, also those may be appended to an existing log_phys_t object if the memory is available. In the physical format, the first byte of a record identifies the record and its length (up to 15 bytes). For longer records, the immediately following bytes will encode the remaining length in a variable-length encoding. Usually, a variable-length-encoded page identifier will follow, followed by optional payload, whose length is included in the initially encoded total record length. When a mini-transaction is updating multiple fields in a page, it can avoid repeating the tablespace identifier and page number by setting the same_page flag (most significant bit) in the first byte of the log record. The byte offset of the record will be relative to where the previous record for that page ended. Until MDEV-14425 introduces a separate file-level log for redo log checkpoints and file operations, we will write the file-level records in the page-level redo log file. The record FILE_CHECKPOINT (which replaces MLOG_CHECKPOINT) will be removed in MDEV-14425, and one sequential scan of the page recovery log will suffice. Compared to MLOG_FILE_CREATE2, FILE_CREATE will not include any flags. If the information is needed, it can be parsed from WRITE records that modify FSP_SPACE_FLAGS. MLOG_ZIP_WRITE_STRING: Remove. The record was only introduced temporarily as part of this work, before being replaced with WRITE (along with MLOG_WRITE_STRING, MLOG_1BYTE, MLOG_nBYTES). mtr_buf_t::empty(): Check if the buffer is empty. mtr_t::m_n_log_recs: Remove. It suffices to check if m_log is empty. mtr_t::m_last, mtr_t::m_last_offset: End of the latest m_log record, for the same_page encoding. page_recv_t::last_offset: Reflects mtr_t::m_last_offset. Valid values for last_offset during recovery should be 0 or above 8. (The first 8 bytes of a page are the checksum and the page number, and neither are ever updated directly by log records.) Internally, the special value 1 indicates that the same_page form will not be allowed for the subsequent record. mtr_t::page_create(): Take the block descriptor as parameter, so that it can be compared to mtr_t::m_last. The INIT_INDEX_PAGE record will always followed by a subtype byte, because same_page records must be longer than 1 byte. trx_undo_page_init(): Combine the writes in WRITE record. trx_undo_header_create(): Write 4 bytes using a special MEMSET record that includes 1 bytes of length and 2 bytes of payload. flst_write_addr(): Define as a static function. Combine the writes. flst_zero_both(): Replaces two flst_zero_addr() calls. flst_init(): Do not inline the function. fsp_free_seg_inode(): Zerofill the whole inode. fsp_apply_init_file_page(): Initialize FIL_PAGE_PREV,FIL_PAGE_NEXT to FIL_NULL when using the physical format. btr_create(): Assert !page_has_siblings() because fsp_apply_init_file_page() must have been invoked. fil_ibd_create(): Do not write FILE_MODIFY after FILE_CREATE. fil_names_dirty_and_write(): Remove the parameter mtr. Write the records using a separate mini-transaction object, because any FILE_ records must be at the start of a mini-transaction log. recv_recover_page(): Add a fil_space_t* parameter. After applying log to the a ROW_FORMAT=COMPRESSED page, invoke buf_zip_decompress() to restore the uncompressed page. buf_page_io_complete(): Remove the temporary hack to discard the uncompressed page of a ROW_FORMAT=COMPRESSED page. page_zip_write_header(): Remove. Use mtr_t::write() or mtr_t::memset() instead, and update the compressed page frame separately. trx_undo_header_add_space_for_xid(): Remove. trx_undo_seg_create(): Perform the changes that were previously made by trx_undo_header_add_space_for_xid(). btr_reset_instant(): New function: Reset the table to MariaDB 10.2 or 10.3 format when rolling back an instant ALTER TABLE operation. page_rec_find_owner_rec(): Merge with the only callers. page_cur_insert_rec_low(): Combine writes by using a local buffer. MEMMOVE data from the preceding record whenever feasible (copying at least 3 bytes). page_cur_insert_rec_zip(): Combine writes to page header fields. PageBulk::insertPage(): Issue MEMMOVE records to copy a matching part from the preceding record. PageBulk::finishPage(): Combine the writes to the page header and to the sparse page directory slots. mtr_t::write(): Only log the least significant (last) bytes of multi-byte fields that actually differ. For updating FSP_SIZE, we must always write all 4 bytes to the redo log, so that the fil_space_set_recv_size() logic in recv_sys_t::parse() will work. mtr_t::memcpy(), mtr_t::zmemcpy(): Take a pointer argument instead of a numeric offset to the page frame. Only log the last bytes of multi-byte fields that actually differ. In fil_space_crypt_t::write_page0(), we must log also any unchanged bytes, so that recovery will recognize the record and invoke fil_crypt_parse(). Future work: MDEV-21724 Optimize page_cur_insert_rec_low() redo logging MDEV-21725 Optimize btr_page_reorganize_low() redo logging MDEV-21727 Optimize redo logging for ROW_FORMAT=COMPRESSED
2020-02-13 19:12:17 +02:00
/** The page of the most recent m_log record written, or NULL */
const buf_page_t* m_last;
/** The current byte offset in m_last, or 0 */
uint16_t m_last_offset;
/** specifies which operations should be logged; default MTR_LOG_ALL */
uint16_t m_log_mode:2;
/** whether at least one buffer pool page was written to */
uint16_t m_modifications:1;
/** whether at least one previously clean buffer pool page was written to */
uint16_t m_made_dirty:1;
/** whether change buffer is latched; only needed in non-debug builds
to suppress some read-ahead operations, @see ibuf_inside() */
uint16_t m_inside_ibuf:1;
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
/** whether the page has been freed in system tablespace */
uint16_t m_freed_in_system_tablespace:1;
/** whether the pages has been trimmed */
uint16_t m_trim_pages:1;
#ifdef UNIV_DEBUG
/** Persistent user tablespace associated with the
mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
uint32_t m_user_space_id;
#endif /* UNIV_DEBUG */
/** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
mtr_buf_t m_memo;
/** mini-transaction log */
mtr_buf_t m_log;
/** user tablespace that is being modified by the mini-transaction */
fil_space_t* m_user_space;
/** LSN at commit time */
lsn_t m_commit_lsn;
MDEV-8139 Fix Scrubbing fil_space_t::freed_ranges: Store ranges of freed page numbers. fil_space_t::last_freed_lsn: Store the most recent LSN of freeing a page. fil_space_t::freed_mutex: Protects freed_ranges, last_freed_lsn. fil_space_create(): Initialize the freed_range mutex. fil_space_free_low(): Frees the freed_range mutex. range_set: Ranges of page numbers. buf_page_create(): Removes the page from freed_ranges when page is being reused. btr_free_root(): Remove the PAGE_INDEX_ID invalidation. Because btr_free_root() and dict_drop_index_tree() are executed in the same atomic mini-transaction, there is no need to invalidate the root page. buf_release_freed_page(): Split from buf_flush_freed_page(). Skip any I/O buf_flush_freed_pages(): Get the freed ranges from tablespace and Write punch-hole or zeroes of the freed ranges. buf_flush_try_neighbors(): Handles the flushing of freed ranges. mtr_t::freed_pages: Variable to store the list of freed pages. mtr_t::add_freed_pages(): To add freed pages. mtr_t::clear_freed_pages(): To clear the freed pages. mtr_t::m_freed_in_system_tablespace: Variable to indicate whether page has been freed in system tablespace. mtr_t::m_trim_pages: Variable to indicate whether the space has been trimmed. mtr_t::commit(): Add the freed page and update the last freed lsn in the tablespace and clear the tablespace freed range if space is trimmed. file_name_t::freed_pages: Store the freed pages during recovery. file_name_t::add_freed_page(), file_name_t::remove_freed_page(): To add and remove freed page during recovery. store_freed_or_init_rec(): Store or remove the freed pages while encountering FREE_PAGE or INIT_PAGE redo log record. recv_init_crash_recovery_spaces(): Add the freed page encountered during recovery to respective tablespace.
2020-06-11 22:52:47 +05:30
/** set of freed page ids */
range_set *m_freed_pages= nullptr;
};
#include "mtr0mtr.ic"
#endif /* mtr0mtr_h */