From 1122ac978e2e709ae17a19335cbf0e4e5b53ad01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 4 Apr 2024 08:12:54 +0300 Subject: [PATCH] MDEV-33545: Improve innodb_doublewrite to cover NO_FSYNC In commit 24648768b443f6adeb8a0f4302958bfb300d536f (MDEV-30136) the parameter innodb_flush_method was deprecated, with no direct replacement for innodb_flush_method=O_DIRECT_NO_FSYNC. Let us change innodb_doublewrite from Boolean to ENUM that can be changed while the server is running: OFF: Assume that writes of innodb_page_size are atomic ON: Prevent torn writes (the default) fast: Like ON, but avoid synchronizing writes to data files The deprecated start-up parameter innodb_flush_method=O_DIRECT_NO_FSYNC will cause innodb_doublewrite=ON to be changed to innodb_doublewrite=fast, which will prevent InnoDB from making any durable writes to data files. Such durable writes would normally be made right before the log checkpoint LSN is updated. Depending on the file systems being used and their configuration, this may or may not be safe. The value innodb_doublewrite=fast differs from the previous combination of innodb_doublewrite=ON and innodb_flush_method=O_DIRECT_NO_FSYNC by always invoking os_file_flush() on the doublewrite buffer itself in buf_dblwr_t::flush_buffered_writes_completed(). This should be safer when there are multiple doublewrite batches between checkpoints. Typically, once per second, buf_flush_page_cleaner() would write out up to innodb_io_capacity pages and advance the log checkpoint. Also typically, innodb_io_capacity>128, which is the size of the doublewrite buffer in pages. Should os_file_flush_func() not be invoked between doublewrite batches, writes could be reordered in an unsafe way. The setting innodb_doublewrite=fast could be safe when the doublewrite buffer (the first file of the system tablespace) and the data files reside in the same file system. This was tested by running "./mtr --rr innodb.alter_kill". 
On the first server startup, with innodb_doublewrite=fast, os_file_flush_func() would only be invoked on the ibdata1 file and possibly ib_logfile0. On subsequent startups with innodb_doublewrite=OFF, os_file_flush_func() will be invoked on the individual data files during log_checkpoint(). Note: The setting debug_no_sync (in the code, my_disable_sync) would disable all durable writes to InnoDB files, which would be much less safe. IORequest::Type: Introduce special values WRITE_DBL and PUNCH_DBL for asynchronous writes that are submitted via the doublewrite buffer. In this way, fil_space_t::use_doublewrite() or buf_dblwr.in_use() will only be consulted during buf_page_t::flush() and the doublewrite buffer can be enabled or disabled without any fear of inconsistency. buf_dblwr_t::block_size: Replaces block_size(). buf_dblwr_t::flush_buffered_writes(): If !in_use() and the doublewrite buffer is empty, just invoke fil_flush_file_spaces() and return. The doublewrite buffer could have been disabled while a batch was in progress. innodb_init_params(): If innodb_flush_method=O_DIRECT_NO_FSYNC and innodb_doublewrite is not OFF, set innodb_doublewrite=fast. Thanks to Mark Callaghan for reporting this, and Vladislav Vaintroub for feedback. 
--- extra/mariabackup/xtrabackup.cc | 6 +- mysql-test/suite/innodb/r/alter_kill.result | 9 +- .../suite/innodb/t/alter_kill-master.opt | 2 +- mysql-test/suite/innodb/t/alter_kill.test | 6 ++ .../r/innodb_doublewrite_basic.result | 47 +++++------ .../suite/sys_vars/r/sysvars_innodb.result | 10 +-- .../sys_vars/t/innodb_doublewrite_basic.opt | 1 + .../sys_vars/t/innodb_doublewrite_basic.test | 82 +++---------------- storage/innobase/buf/buf0dblwr.cc | 36 ++++---- storage/innobase/buf/buf0flu.cc | 4 +- storage/innobase/fil/fil0fil.cc | 35 ++++---- storage/innobase/handler/ha_innodb.cc | 37 +++++++-- storage/innobase/include/buf0dblwr.h | 40 +++++++-- storage/innobase/include/fil0fil.h | 33 +++++--- storage/innobase/include/os0file.h | 12 +++ storage/innobase/include/srv0srv.h | 1 - storage/innobase/srv/srv0srv.cc | 2 - storage/innobase/srv/srv0start.cc | 2 +- 18 files changed, 193 insertions(+), 172 deletions(-) create mode 100644 mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.opt diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 5facdefb193..b7ccdc784a2 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -380,8 +380,8 @@ static my_bool opt_check_privileges; extern const char *innodb_checksum_algorithm_names[]; extern TYPELIB innodb_checksum_algorithm_typelib; -extern const char *innodb_flush_method_names[]; extern TYPELIB innodb_flush_method_typelib; +extern TYPELIB innodb_doublewrite_typelib; /** Ignored option */ static ulong innodb_flush_method; @@ -1859,8 +1859,8 @@ struct my_option xb_server_options[] = &innobase_data_home_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, {"innodb_doublewrite", OPT_INNODB_DOUBLEWRITE, "Enable InnoDB doublewrite buffer during --prepare.", - (G_PTR*) &srv_use_doublewrite_buf, - (G_PTR*) &srv_use_doublewrite_buf, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, + (G_PTR*) &buf_dblwr.use, (G_PTR*) &buf_dblwr.use, + &innodb_doublewrite_typelib, GET_ENUM, OPT_ARG, 0, 0, 
0, 0, 0, 0}, {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY, "Number of IOPs the server can do. Tunes the background IO rate", (G_PTR*) &srv_io_capacity, (G_PTR*) &srv_io_capacity, diff --git a/mysql-test/suite/innodb/r/alter_kill.result b/mysql-test/suite/innodb/r/alter_kill.result index c4469a8c322..af23efe9790 100644 --- a/mysql-test/suite/innodb/r/alter_kill.result +++ b/mysql-test/suite/innodb/r/alter_kill.result @@ -1,3 +1,7 @@ +SELECT @@innodb_doublewrite; +@@innodb_doublewrite +OFF +SET GLOBAL innodb_doublewrite=fast; # # Bug#16720368 INNODB CRASHES ON BROKEN #SQL*.IBD FILE AT STARTUP # @@ -12,7 +16,10 @@ connection default; disconnect con1; # Corrupt FIL_PAGE_TYPE in bug16720368.ibd, # and recompute innodb_checksum_algorithm=crc32 -# restart +# restart: --innodb-flush-method=O_DIRECT +SELECT @@innodb_doublewrite; +@@innodb_doublewrite +OFF SELECT COUNT(*) FROM bug16720368; ERROR HY000: Table `test`.`bug16720368` is corrupted. Please drop the table and recreate. INSERT INTO bug16720368 VALUES(1); diff --git a/mysql-test/suite/innodb/t/alter_kill-master.opt b/mysql-test/suite/innodb/t/alter_kill-master.opt index e472160c2b7..9eb72834ef6 100644 --- a/mysql-test/suite/innodb/t/alter_kill-master.opt +++ b/mysql-test/suite/innodb/t/alter_kill-master.opt @@ -1 +1 @@ ---innodb-doublewrite=false +--innodb-flush-method=O_DIRECT_NO_FSYNC --skip-innodb-doublewrite diff --git a/mysql-test/suite/innodb/t/alter_kill.test b/mysql-test/suite/innodb/t/alter_kill.test index 798f9af00db..3936b3fd9e4 100644 --- a/mysql-test/suite/innodb/t/alter_kill.test +++ b/mysql-test/suite/innodb/t/alter_kill.test @@ -7,6 +7,9 @@ let MYSQLD_DATADIR=`select @@datadir`; let PAGE_SIZE=`select @@innodb_page_size`; +SELECT @@innodb_doublewrite; +SET GLOBAL innodb_doublewrite=fast; + -- disable_query_log call mtr.add_suppression("InnoDB: innodb_force_recovery is on."); call mtr.add_suppression("InnoDB: Ignoring tablespace for.*bug16720368"); @@ -73,8 +76,11 @@ syswrite(FILE, $page, $ps)==$ps || 
die "Unable to write $file\n"; close(FILE) || die "Unable to close $file"; EOF +-- let $restart_parameters=--innodb-flush-method=O_DIRECT -- source include/start_mysqld.inc +-- let $restart_parameters= +SELECT @@innodb_doublewrite; --error ER_TABLE_CORRUPT SELECT COUNT(*) FROM bug16720368; --error ER_TABLE_CORRUPT diff --git a/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result b/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result index 4a5baf0aeda..9e93d943c9f 100644 --- a/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result +++ b/mysql-test/suite/sys_vars/r/innodb_doublewrite_basic.result @@ -1,33 +1,25 @@ -'#---------------------BS_STVARS_026_01----------------------#' -SELECT COUNT(@@GLOBAL.innodb_doublewrite); -COUNT(@@GLOBAL.innodb_doublewrite) -1 -1 Expected -'#---------------------BS_STVARS_026_02----------------------#' +SELECT @@GLOBAL.innodb_doublewrite; +@@GLOBAL.innodb_doublewrite +ON +SET @@GLOBAL.innodb_doublewrite=0; +SELECT @@GLOBAL.innodb_doublewrite; +@@GLOBAL.innodb_doublewrite +OFF +SET @@GLOBAL.innodb_doublewrite=2; +SET @@GLOBAL.innodb_doublewrite=3; +ERROR 42000: Variable 'innodb_doublewrite' can't be set to the value of '3' +SELECT @@GLOBAL.innodb_doublewrite; +@@GLOBAL.innodb_doublewrite +fast SET @@GLOBAL.innodb_doublewrite=1; -ERROR HY000: Variable 'innodb_doublewrite' is a read only variable -Expected error 'Read only variable' -SELECT COUNT(@@GLOBAL.innodb_doublewrite); -COUNT(@@GLOBAL.innodb_doublewrite) -1 -1 Expected -'#---------------------BS_STVARS_026_03----------------------#' -SELECT IF(@@GLOBAL.innodb_doublewrite, "ON", "OFF") = VARIABLE_VALUE -FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES -WHERE VARIABLE_NAME='innodb_doublewrite'; -IF(@@GLOBAL.innodb_doublewrite, "ON", "OFF") = VARIABLE_VALUE -1 -1 Expected -SELECT COUNT(@@GLOBAL.innodb_doublewrite); -COUNT(@@GLOBAL.innodb_doublewrite) -1 -1 Expected +SELECT @@GLOBAL.innodb_doublewrite; +@@GLOBAL.innodb_doublewrite +ON SELECT COUNT(VARIABLE_VALUE) -FROM 
INFORMATION_SCHEMA.GLOBAL_VARIABLES +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME='innodb_doublewrite'; COUNT(VARIABLE_VALUE) 1 -1 Expected '#---------------------BS_STVARS_026_04----------------------#' SELECT @@innodb_doublewrite = @@GLOBAL.innodb_doublewrite; @@innodb_doublewrite = @@GLOBAL.innodb_doublewrite @@ -48,6 +40,5 @@ SELECT COUNT(@@GLOBAL.innodb_doublewrite); COUNT(@@GLOBAL.innodb_doublewrite) 1 1 Expected -SELECT innodb_doublewrite = @@SESSION.innodb_doublewrite; -ERROR 42S22: Unknown column 'innodb_doublewrite' in 'field list' -Expected error 'Readonly variable' +SELECT @@innodb_doublewrite = @@SESSION.innodb_doublewrite; +ERROR HY000: Variable 'innodb_doublewrite' is a GLOBAL variable diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 83e137a946d..f486271cbcc 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -503,14 +503,14 @@ VARIABLE_NAME INNODB_DOUBLEWRITE SESSION_VALUE NULL DEFAULT_VALUE ON VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BOOLEAN -VARIABLE_COMMENT Enable InnoDB doublewrite buffer (enabled by default). Disable with --skip-innodb-doublewrite. +VARIABLE_TYPE ENUM +VARIABLE_COMMENT Whether and how to use the doublewrite buffer. 
OFF=Assume that writes of innodb_page_size are atomic; ON=Prevent torn writes (the default); fast=Like ON, but do not synchronize writes to data files NUMERIC_MIN_VALUE NULL NUMERIC_MAX_VALUE NULL NUMERIC_BLOCK_SIZE NULL -ENUM_VALUE_LIST OFF,ON -READ_ONLY YES -COMMAND_LINE_ARGUMENT NONE +ENUM_VALUE_LIST OFF,ON,fast +READ_ONLY NO +COMMAND_LINE_ARGUMENT OPTIONAL VARIABLE_NAME INNODB_ENCRYPTION_ROTATE_KEY_AGE SESSION_VALUE NULL DEFAULT_VALUE 1 diff --git a/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.opt b/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.opt new file mode 100644 index 00000000000..2bea5a22cde --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.opt @@ -0,0 +1 @@ +--innodb-doublewrite diff --git a/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test b/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test index 1ae10d0f7cf..4e76c0ac33d 100644 --- a/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test +++ b/mysql-test/suite/sys_vars/t/innodb_doublewrite_basic.test @@ -1,75 +1,20 @@ - - -################## mysql-test\t\innodb_doublewrite_basic.test ################# -# # -# Variable Name: innodb_doublewrite # -# Scope: Global # -# Access Type: Static # -# Data Type: boolean # -# # -# # -# Creation Date: 2008-02-07 # -# Author : Sharique Abdullah # -# # -# # -# Description:Test Cases of Dynamic System Variable innodb_doublewrite # -# that checks the behavior of this variable in the following ways # -# * Value Check # -# * Scope Check # -# # -# Reference: http://dev.mysql.com/doc/refman/5.1/en/ # -# server-system-variables.html # -# # -############################################################################### - --source include/have_innodb.inc ---echo '#---------------------BS_STVARS_026_01----------------------#' -#################################################################### -# Displaying default value # -#################################################################### -SELECT 
COUNT(@@GLOBAL.innodb_doublewrite); ---echo 1 Expected +SELECT @@GLOBAL.innodb_doublewrite; +SET @@GLOBAL.innodb_doublewrite=0; +SELECT @@GLOBAL.innodb_doublewrite; ---echo '#---------------------BS_STVARS_026_02----------------------#' -#################################################################### -# Check if Value can set # -#################################################################### - ---error ER_INCORRECT_GLOBAL_LOCAL_VAR +SET @@GLOBAL.innodb_doublewrite=2; +--error ER_WRONG_VALUE_FOR_VAR +SET @@GLOBAL.innodb_doublewrite=3; +SELECT @@GLOBAL.innodb_doublewrite; SET @@GLOBAL.innodb_doublewrite=1; ---echo Expected error 'Read only variable' +SELECT @@GLOBAL.innodb_doublewrite; -SELECT COUNT(@@GLOBAL.innodb_doublewrite); ---echo 1 Expected - - - - ---echo '#---------------------BS_STVARS_026_03----------------------#' -################################################################# -# Check if the value in GLOBAL Table matches value in variable # -################################################################# - ---disable_warnings -SELECT IF(@@GLOBAL.innodb_doublewrite, "ON", "OFF") = VARIABLE_VALUE +SELECT COUNT(VARIABLE_VALUE) FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME='innodb_doublewrite'; ---enable_warnings ---echo 1 Expected - -SELECT COUNT(@@GLOBAL.innodb_doublewrite); ---echo 1 Expected - ---disable_warnings -SELECT COUNT(VARIABLE_VALUE) -FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES -WHERE VARIABLE_NAME='innodb_doublewrite'; ---enable_warnings ---echo 1 Expected - - --echo '#---------------------BS_STVARS_026_04----------------------#' ################################################################################ @@ -78,8 +23,6 @@ WHERE VARIABLE_NAME='innodb_doublewrite'; SELECT @@innodb_doublewrite = @@GLOBAL.innodb_doublewrite; --echo 1 Expected - - --echo '#---------------------BS_STVARS_026_05----------------------#' ################################################################################ # Check if 
innodb_doublewrite can be accessed with and without @@ sign # @@ -99,8 +42,5 @@ SELECT COUNT(@@SESSION.innodb_doublewrite); SELECT COUNT(@@GLOBAL.innodb_doublewrite); --echo 1 Expected ---Error ER_BAD_FIELD_ERROR -SELECT innodb_doublewrite = @@SESSION.innodb_doublewrite; ---echo Expected error 'Readonly variable' - - +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +SELECT @@innodb_doublewrite = @@SESSION.innodb_doublewrite; diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index e2702adc880..ec64d8d46ff 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -53,6 +53,7 @@ void buf_dblwr_t::init() active_slot= &slots[0]; mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr); pthread_cond_init(&cond, nullptr); + block_size= FSP_EXTENT_SIZE; } } @@ -67,7 +68,7 @@ inline void buf_dblwr_t::init(const byte *header) block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1)); block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2)); - const uint32_t buf_size= 2 * block_size(); + const uint32_t buf_size= 2 * block_size; for (int i= 0; i < 2; i++) { slots[i].write_buf= static_cast @@ -86,7 +87,7 @@ bool buf_dblwr_t::create() return true; mtr_t mtr; - const ulint size= block_size(); + const ulint size= block_size; start_again: mtr.start(); @@ -251,7 +252,7 @@ loads the pages from double write buffer into memory. dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path) { ut_ad(this == &buf_dblwr); - const uint32_t size= block_size(); + const uint32_t size= block_size; /* We do the file i/o past the buffer pool */ byte *read_buf= static_cast(aligned_malloc(srv_page_size, @@ -488,7 +489,6 @@ void buf_dblwr_t::write_completed() mysql_mutex_lock(&mutex); ut_ad(is_created()); - ut_ad(srv_use_doublewrite_buf); ut_ad(batch_running); slot *flush_slot= active_slot == &slots[0] ? 
&slots[1] : &slots[0]; ut_ad(flush_slot->reserved); @@ -574,7 +574,7 @@ static void buf_dblwr_check_block(const buf_page_t *bpage) bool buf_dblwr_t::flush_buffered_writes(const ulint size) { mysql_mutex_assert_owner(&mutex); - ut_ad(size == block_size()); + ut_ad(size == block_size); for (;;) { @@ -647,7 +647,6 @@ static void *get_frame(const IORequest &request) void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) { ut_ad(this == &buf_dblwr); - ut_ad(srv_use_doublewrite_buf); ut_ad(is_created()); ut_ad(!srv_read_only_mode); ut_ad(!request.bpage); @@ -670,8 +669,14 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) pages_written+= flush_slot->first_free; mysql_mutex_unlock(&mutex); - /* Now flush the doublewrite buffer data to disk */ - fil_system.sys_space->flush(); + /* Make the doublewrite durable. Note: The doublewrite buffer is + always in the first file of the system tablespace. We will not + bother about fil_system.unflushed_spaces, which can result in a + redundant call during fil_flush_file_spaces() in + log_checkpoint(). Writes to the system tablespace should be rare, + except when executing DDL or using the non-default settings + innodb_file_per_table=OFF or innodb_undo_tablespaces=0. */ + os_file_flush(request.node->handle); /* The writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer blocks. Next, write the data pages. */ @@ -714,17 +719,18 @@ posted, and also when we may have to wait for a page latch! Otherwise a deadlock of threads can occur. 
*/ void buf_dblwr_t::flush_buffered_writes() { - if (!is_created() || !srv_use_doublewrite_buf) + mysql_mutex_lock(&mutex); + + if (!in_use() && active_slot->first_free == 0) { + mysql_mutex_unlock(&mutex); fil_flush_file_spaces(); return; } ut_ad(!srv_read_only_mode); - const ulint size= block_size(); - mysql_mutex_lock(&mutex); - if (!flush_buffered_writes(size)) + if (!flush_buffered_writes(block_size)) mysql_mutex_unlock(&mutex); } @@ -734,8 +740,6 @@ flush_buffered_writes() will be invoked to make space. @param size payload size in bytes */ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) { - ut_ad(request.is_async()); - ut_ad(request.is_write()); ut_ad(request.bpage); ut_ad(request.bpage->in_file()); ut_ad(request.node); @@ -744,7 +748,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) ut_ad(request.node->space->referenced()); ut_ad(!srv_read_only_mode); - const ulint buf_size= 2 * block_size(); + const ulint buf_size= 2 * block_size; mysql_mutex_lock(&mutex); @@ -773,7 +777,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) ut_ad(active_slot->reserved == active_slot->first_free); ut_ad(active_slot->reserved < buf_size); new (active_slot->buf_block_arr + active_slot->first_free++) - element{request, size}; + element{request.doublewritten(), size}; active_slot->reserved= active_slot->first_free; if (active_slot->first_free != buf_size || diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 4e54c7055ca..ed1ef22bb4e 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -350,9 +350,9 @@ void buf_page_write_complete(const IORequest &request, bool error) else { bpage->write_complete(persistent, error, state); - if (state < buf_page_t::WRITE_FIX_REINIT && - request.node->space->use_doublewrite()) + if (request.is_doublewritten()) { + ut_ad(state < buf_page_t::WRITE_FIX_REINIT); ut_ad(persistent); buf_dblwr.write_completed(); } diff 
--git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index ac491a958d6..0acc04f25d0 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1231,9 +1231,6 @@ void fil_system_t::create(ulint hash_size) ut_ad(!is_initialised()); ut_ad(!(srv_page_size % FSP_EXTENT_SIZE)); ut_ad(srv_page_size); - ut_ad(!spaces.array); - - m_initialised = true; compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX)); compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN)); @@ -1244,6 +1241,8 @@ void fil_system_t::create(ulint hash_size) spaces.create(hash_size); + need_unflushed_spaces = !write_through && buf_dblwr.need_fsync(); + fil_space_crypt_init(); #ifdef __linux__ ssd.clear(); @@ -1317,13 +1316,12 @@ void fil_system_t::close() if (is_initialised()) { - m_initialised= false; spaces.free(); mysql_mutex_destroy(&mutex); fil_space_crypt_cleanup(); } - ut_ad(!spaces.array); + ut_ad(!is_initialised()); #ifdef __linux__ ssd.clear(); @@ -1464,6 +1462,7 @@ void fil_system_t::set_write_through(bool write_through) { this->write_through= write_through; fil_space_t::reopen_all(); + need_unflushed_spaces = !write_through && buf_dblwr.need_fsync(); } mysql_mutex_unlock(&mutex); @@ -2833,19 +2832,18 @@ static void fil_invalid_page_access_msg(const char *name, } /** Update the data structures on write completion */ -inline void fil_node_t::complete_write() +void fil_space_t::complete_write() { mysql_mutex_assert_not_owner(&fil_system.mutex); - if (space->purpose != FIL_TYPE_TEMPORARY && - (!fil_system.is_write_through() && !my_disable_sync) && - space->set_needs_flush()) + if (purpose != FIL_TYPE_TEMPORARY && + fil_system.use_unflushed_spaces() && set_needs_flush()) { mysql_mutex_lock(&fil_system.mutex); - if (!space->is_in_unflushed_spaces) + if (!is_in_unflushed_spaces) { - space->is_in_unflushed_spaces= true; - fil_system.unflushed_spaces.push_front(*space); + is_in_unflushed_spaces= true; + 
fil_system.unflushed_spaces.push_front(*this); } mysql_mutex_unlock(&fil_system.mutex); } @@ -2945,7 +2943,7 @@ io_error: if (!type.is_async()) { if (type.is_write()) { release_sync_write: - node->complete_write(); + complete_write(); release: release(); goto func_exit; @@ -2965,21 +2963,28 @@ void IORequest::write_complete(int io_error) const { ut_ad(fil_validate_skip()); ut_ad(node); + fil_space_t *space= node->space; ut_ad(is_write()); - node->complete_write(); if (!bpage) { ut_ad(!srv_read_only_mode); if (type == IORequest::DBLWR_BATCH) + { buf_dblwr.flush_buffered_writes_completed(*this); + /* Above, we already invoked os_file_flush() on the + doublewrite buffer if needed. */ + goto func_exit; + } else ut_ad(type == IORequest::WRITE_ASYNC); } else buf_page_write_complete(*this, io_error); - node->space->release(); + space->complete_write(); + func_exit: + space->release(); } void IORequest::read_complete(int io_error) const diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 21c33ddd337..85fb7d765f9 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -352,7 +352,7 @@ static TYPELIB innodb_default_row_format_typelib = { }; /** Names of allowed values of innodb_flush_method */ -const char* innodb_flush_method_names[] = { +static const char* innodb_flush_method_names[] = { "fsync", "O_DSYNC", "littlesync", @@ -380,6 +380,18 @@ TYPELIB innodb_flush_method_typelib = { /** Deprecated parameter */ static ulong innodb_flush_method; +/** Names of allowed values of innodb_doublewrite */ +static const char *innodb_doublewrite_names[]= + {"OFF", "ON", "fast", nullptr}; + +/** Enumeration of innodb_doublewrite */ +TYPELIB innodb_doublewrite_typelib= { + array_elements(innodb_doublewrite_names) - 1, + "innodb_doublewrite_typelib", + innodb_doublewrite_names, + nullptr +}; + /** Names of allowed values of innodb_deadlock_report */ static const char *innodb_deadlock_report_names[]= { 
"off", /* Do not report any details of deadlocks */ @@ -3982,6 +3994,10 @@ static int innodb_init_params() } else if (innodb_flush_method >= 4 /* O_DIRECT */ IF_WIN(&& innodb_flush_method < 8 /* normal */,)) { /* O_DIRECT and similar settings do nothing */ + if (innodb_flush_method == 5 /* O_DIRECT_NO_FSYNC */ + && buf_dblwr.use) { + buf_dblwr.use = buf_dblwr.USE_FAST; + } #ifdef O_DIRECT } else if (srv_use_atomic_writes && my_may_have_atomic_write) { /* If atomic writes are enabled, do the same as with @@ -18442,6 +18458,12 @@ static void innodb_data_file_write_through_update(THD *, st_mysql_sys_var*, mysql_mutex_lock(&LOCK_global_system_variables); } +static void innodb_doublewrite_update(THD *, st_mysql_sys_var*, + void *, const void *save) +{ + fil_system.set_use_doublewrite(*static_cast(save)); +} + static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, void *var, const void *save) { @@ -18775,11 +18797,14 @@ static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir, "The common part for InnoDB table spaces.", NULL, NULL, NULL); -static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf, - PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Enable InnoDB doublewrite buffer (enabled by default)." - " Disable with --skip-innodb-doublewrite.", - NULL, NULL, TRUE); +static MYSQL_SYSVAR_ENUM(doublewrite, buf_dblwr.use, + PLUGIN_VAR_OPCMDARG, + "Whether and how to use the doublewrite buffer. 
" + "OFF=Assume that writes of innodb_page_size are atomic; " + "ON=Prevent torn writes (the default); " + "fast=Like ON, but do not synchronize writes to data files", + nullptr, innodb_doublewrite_update, true, + &innodb_doublewrite_typelib); static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 6e7662d9b81..f912775de59 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -53,9 +53,9 @@ class buf_dblwr_t element* buf_block_arr; }; - /** the page number of the first doublewrite block (block_size() pages) */ + /** the page number of the first doublewrite block (block_size pages) */ page_id_t block1{0, 0}; - /** the page number of the second doublewrite block (block_size() pages) */ + /** the page number of the second doublewrite block (block_size pages) */ page_id_t block2{0, 0}; /** mutex protecting the data members below */ @@ -74,6 +74,22 @@ class buf_dblwr_t slot slots[2]; slot *active_slot; + /** Size of the doublewrite block in pages */ + uint32_t block_size; + +public: + /** Values of use */ + enum usage { + /** Assume that writes are atomic */ + USE_NO= 0, + /** Use the doublewrite buffer with full durability */ + USE_YES, + /** Durable writes to the doublewrite buffer, not to data files */ + USE_FAST + }; + /** The value of innodb_doublewrite */ + ulong use; +private: /** Initialise the persistent storage of the doublewrite buffer. @param header doublewrite page header in the TRX_SYS page */ inline void init(const byte *header); @@ -126,9 +142,6 @@ public: @param request the completed batch write request */ void flush_buffered_writes_completed(const IORequest &request); - /** Size of the doublewrite block in pages */ - uint32_t block_size() const { return FSP_EXTENT_SIZE; } - /** Schedule a page write. 
If the doublewrite memory buffer is full, flush_buffered_writes() will be invoked to make space. @param request asynchronous write request @@ -139,6 +152,19 @@ public: bool is_created() const { return UNIV_LIKELY(block1 != page_id_t(0, 0)); } + /** @return whether the doublewrite buffer is in use */ + bool in_use() const { return is_created() && use; } + /** @return whether fsync() is needed on non-doublewrite pages */ + bool need_fsync() const { return use < USE_FAST; } + + void set_use(ulong use) + { + ut_ad(use <= USE_FAST); + mysql_mutex_lock(&mutex); + this->use= use; + mysql_mutex_unlock(&mutex); + } + /** @return whether a page identifier is part of the doublewrite buffer */ bool is_inside(const page_id_t id) const { @@ -147,8 +173,8 @@ public: ut_ad(block1 < block2); if (id < block1) return false; - const uint32_t size= block_size(); - return id < block1 + size || (id >= block2 && id < block2 + size); + return id < block1 + block_size || + (id >= block2 && id < block2 + block_size); } /** Wait for flush_buffered_writes() to be fully completed */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 1f9b329efa3..94de29f494c 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -48,9 +48,6 @@ struct named_spaces_tag_t; using space_list_t= ilist; -// Forward declaration -extern my_bool srv_use_doublewrite_buf; - /** Undo tablespaces starts with space_id. */ extern uint32_t srv_undo_space_id_start; /** The number of UNDO tablespaces that are open and ready to use. 
*/ @@ -1008,6 +1005,9 @@ public: /** @return the tablespace name (databasename/tablename) */ name_type name() const; + /** Update the data structures on write completion */ + void complete_write(); + private: /** @return whether the file is usable for io() */ ATTRIBUTE_COLD bool prepare_acquired(); @@ -1080,9 +1080,6 @@ struct fil_node_t final @return detached handle or OS_FILE_CLOSED */ inline pfs_os_file_t close_to_free(bool detach_handle= false); - /** Update the data structures on write completion */ - inline void complete_write(); - private: /** Does stuff common for close() and detach() */ void prepare_to_close_or_detach(); @@ -1090,8 +1087,7 @@ private: inline bool fil_space_t::use_doublewrite() const { - return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf && - buf_dblwr.is_created(); + return !UT_LIST_GET_FIRST(chain)->atomic_write && buf_dblwr.in_use(); } inline void fil_space_t::set_imported() @@ -1352,9 +1348,9 @@ struct fil_system_t Some members may require late initialisation, thus we just mark object as uninitialised. Real initialisation happens in create(). */ - fil_system_t() : m_initialised(false) {} + fil_system_t() {} - bool is_initialised() const { return m_initialised; } + bool is_initialised() const { return spaces.array; } /** Create the file system interface at database start. @@ -1367,8 +1363,6 @@ struct fil_system_t void close(); private: - bool m_initialised; - /** Points to the last opened space in space_list. Protected with fil_system.mutex. 
*/ fil_space_t *space_list_last_opened= nullptr; @@ -1404,19 +1398,32 @@ public: /** Map of fil_space_t::id to fil_space_t* */ hash_table_t spaces; - /** whether each write to data files is durable (O_DSYNC) */ + /** false=invoke fsync() or fdatasync() on data files before checkpoint; + true=each write is durable (O_DSYNC) */ my_bool write_through; /** whether data files are buffered (not O_DIRECT) */ my_bool buffered; + /** whether fdatasync() is needed on data files */ + Atomic_relaxed need_unflushed_spaces; /** Try to enable or disable write-through of data files */ void set_write_through(bool write_through); + /** Update innodb_doublewrite */ + void set_use_doublewrite(ulong use) + { + buf_dblwr.set_use(use); + need_unflushed_spaces= !write_through && buf_dblwr.need_fsync(); + } + /** Try to enable or disable file system caching of data files */ void set_buffered(bool buffered); TPOOL_SUPPRESS_TSAN bool is_write_through() const { return write_through; } TPOOL_SUPPRESS_TSAN bool is_buffered() const { return buffered; } + /** @return whether to update unflushed_spaces */ + bool use_unflushed_spaces() const { return need_unflushed_spaces; } + /** tablespaces for which fil_space_t::needs_flush() holds */ sized_ilist unflushed_spaces; /** number of currently open files; protected by mutex */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index a5953dcfd51..317c1c132ee 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -185,10 +185,14 @@ public: WRITE_SYNC= 16, /** Asynchronous write */ WRITE_ASYNC= WRITE_SYNC | 1, + /** Asynchronous doublewritten page */ + WRITE_DBL= WRITE_ASYNC | 4, /** A doublewrite batch */ DBLWR_BATCH= WRITE_ASYNC | 8, /** Write data and punch hole for the rest */ PUNCH= WRITE_ASYNC | 16, + /** Write doublewritten data and punch hole for the rest */ + PUNCH_DBL= PUNCH | 4, /** Zero out a range of bytes in fil_space_t::io() */ PUNCH_RANGE= WRITE_SYNC | 32, }; @@ -204,6 
+208,14 @@ public: bool is_read() const { return (type & READ_SYNC) != 0; } bool is_write() const { return (type & WRITE_SYNC) != 0; } bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; } + bool is_doublewritten() const { return (type & 4) != 0; } + + /** Create a write request for the doublewrite buffer. */ + IORequest doublewritten() const + { + ut_ad(type == WRITE_ASYNC || type == PUNCH); + return IORequest{bpage, slot, node, Type(type | 4)}; + } void write_complete(int io_error) const; void read_complete(int io_error) const; diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 46fa1ca95e2..3f35bb78017 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -303,7 +303,6 @@ extern my_bool srv_stats_include_delete_marked; extern unsigned long long srv_stats_modified_counter; extern my_bool srv_stats_sample_traditional; -extern my_bool srv_use_doublewrite_buf; extern ulong srv_checksum_algorithm; extern my_bool srv_force_primary_key; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 3d24b97a21a..84b065b413b 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -305,8 +305,6 @@ unsigned long long srv_stats_modified_counter; based on number of configured pages */ my_bool srv_stats_sample_traditional; -my_bool srv_use_doublewrite_buf; - /** innodb_sync_spin_loops */ ulong srv_n_spin_wait_rounds; /** innodb_spin_wait_delay */ diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 3c90186374c..875dbbe8d57 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1193,7 +1193,7 @@ dberr_t srv_start(bool create_new_db) if (srv_read_only_mode) { sql_print_information("InnoDB: Started in read only mode"); - srv_use_doublewrite_buf = false; + buf_dblwr.use = buf_dblwr.USE_NO; } high_level_read_only = srv_read_only_mode