mirror of
https://github.com/MariaDB/server.git
synced 2025-05-17 19:29:54 +02:00

In commit 24648768b4
(MDEV-30136)
the parameter innodb_flush_method was deprecated, with no direct
replacement for innodb_flush_method=O_DIRECT_NO_FSYNC.
Let us change innodb_doublewrite from Boolean to ENUM that can
be changed while the server is running:
OFF: Assume that writes of innodb_page_size are atomic
ON: Prevent torn writes (the default)
fast: Like ON, but avoid synchronizing writes to data files
The deprecated start-up parameter innodb_flush_method=O_DIRECT_NO_FSYNC will cause
innodb_doublewrite=ON to be changed to innodb_doublewrite=fast,
which will prevent InnoDB from making any durable writes to data files.
This would normally be done right before the log checkpoint LSN is updated.
Depending on the file systems being used and their configuration,
this may or may not be safe.
The value innodb_doublewrite=fast differs from the previous combination of
innodb_doublewrite=ON and innodb_flush_method=O_DIRECT_NO_FSYNC by always
invoking os_file_flush() on the doublewrite buffer itself
in buf_dblwr_t::flush_buffered_writes_completed(). This should be safer
when there are multiple doublewrite batches between checkpoints.
Typically, once per second, buf_flush_page_cleaner() would write out
up to innodb_io_capacity pages and advance the log checkpoint.
Also typically, innodb_io_capacity>128, which is the size of the
doublewrite buffer in pages. Should os_file_flush_func() not be invoked
between doublewrite batches, writes could be reordered in an unsafe way.
The setting innodb_doublewrite=fast could be safe when the doublewrite
buffer (the first file of the system tablespace) and the data files
reside in the same file system.
This was tested by running "./mtr --rr innodb.alter_kill". On the first
server startup, with innodb_doublewrite=fast, os_file_flush_func()
would only be invoked on the ibdata1 file and possibly ib_logfile0.
On subsequent startups with innodb_doublewrite=OFF, os_file_flush_func()
will be invoked on the individual data files during log_checkpoint().
Note: The setting debug_no_sync (in the code, my_disable_sync) would
disable all durable writes to InnoDB files, which would be much less safe.
IORequest::Type: Introduce special values WRITE_DBL and PUNCH_DBL
for asynchronous writes that are submitted via the doublewrite buffer.
In this way, fil_space_t::use_doublewrite() or buf_dblwr.in_use()
will only be consulted during buf_page_t::flush() and the doublewrite
buffer can be enabled or disabled without any fear of inconsistency.
buf_dblwr_t::block_size: Replaces block_size().
buf_dblwr_t::flush_buffered_writes(): If !in_use() and the doublewrite
buffer is empty, just invoke fil_flush_file_spaces() and return. The
doublewrite buffer could have been disabled while a batch was in
progress.
innodb_init_params(): If innodb_flush_method=O_DIRECT_NO_FSYNC,
set innodb_doublewrite=fast or innodb_doublewrite=fearless.
Thanks to Mark Callaghan for reporting this, and Vladislav Vaintroub
for feedback.
191 lines
6.4 KiB
C++
191 lines
6.4 KiB
C++
/*****************************************************************************
|
|
|
|
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
|
|
Copyright (c) 2017, 2022, MariaDB Corporation.
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
Foundation; version 2 of the License.
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
|
|
|
*****************************************************************************/
|
|
|
|
/**************************************************//**
|
|
@file include/buf0dblwr.h
|
|
Doublewrite buffer module
|
|
|
|
Created 2011/12/19 Inaam Rana
|
|
*******************************************************/
|
|
|
|
#pragma once
|
|
|
|
#include "os0file.h"
|
|
#include "buf0types.h"
|
|
|
|
/** Doublewrite control struct */
|
|
class buf_dblwr_t
|
|
{
|
|
struct element
|
|
{
|
|
/** asynchronous write request */
|
|
IORequest request;
|
|
/** payload size in bytes */
|
|
size_t size;
|
|
};
|
|
|
|
struct slot
|
|
{
|
|
/** first free position in write_buf measured in units of
|
|
* srv_page_size */
|
|
ulint first_free;
|
|
/** number of slots reserved for the current write batch */
|
|
ulint reserved;
|
|
/** the doublewrite buffer, aligned to srv_page_size */
|
|
byte* write_buf;
|
|
/** buffer blocks to be written via write_buf */
|
|
element* buf_block_arr;
|
|
};
|
|
|
|
/** the page number of the first doublewrite block (block_size pages) */
|
|
page_id_t block1{0, 0};
|
|
/** the page number of the second doublewrite block (block_size pages) */
|
|
page_id_t block2{0, 0};
|
|
|
|
/** mutex protecting the data members below */
|
|
mysql_mutex_t mutex;
|
|
/** condition variable for !batch_running */
|
|
pthread_cond_t cond;
|
|
/** whether a batch is being written from the doublewrite buffer */
|
|
bool batch_running;
|
|
/** number of expected flush_buffered_writes_completed() calls */
|
|
unsigned flushing_buffered_writes;
|
|
/** number of flush_buffered_writes_completed() calls */
|
|
ulint writes_completed;
|
|
/** number of pages written by flush_buffered_writes_completed() */
|
|
ulint pages_written;
|
|
|
|
slot slots[2];
|
|
slot *active_slot;
|
|
|
|
/** Size of the doublewrite block in pages */
|
|
uint32_t block_size;
|
|
|
|
public:
|
|
/** Values of use */
|
|
enum usage {
|
|
/** Assume that writes are atomic */
|
|
USE_NO= 0,
|
|
/** Use the doublewrite buffer with full durability */
|
|
USE_YES,
|
|
/** Durable writes to the doublewrite buffer, not to data files */
|
|
USE_FAST
|
|
};
|
|
/** The value of innodb_doublewrite */
|
|
ulong use;
|
|
private:
|
|
/** Initialise the persistent storage of the doublewrite buffer.
|
|
@param header doublewrite page header in the TRX_SYS page */
|
|
inline void init(const byte *header);
|
|
|
|
/** Flush possible buffered writes to persistent storage. */
|
|
bool flush_buffered_writes(const ulint size);
|
|
|
|
public:
|
|
/** Initialise the doublewrite buffer data structures. */
|
|
void init();
|
|
/** Create or restore the doublewrite buffer in the TRX_SYS page.
|
|
@return whether the operation succeeded */
|
|
bool create();
|
|
/** Free the doublewrite buffer. */
|
|
void close();
|
|
|
|
/** Acquire the mutex */
|
|
void lock() { mysql_mutex_lock(&mutex); }
|
|
/** @return the number of completed batches */
|
|
ulint batches() const
|
|
{ mysql_mutex_assert_owner(&mutex); return writes_completed; }
|
|
/** @return the number of final pages written */
|
|
ulint written() const
|
|
{ mysql_mutex_assert_owner(&mutex); return pages_written; }
|
|
/** Release the mutex */
|
|
void unlock() { mysql_mutex_unlock(&mutex); }
|
|
|
|
/** Initialize the doublewrite buffer memory structure on recovery.
|
|
If we are upgrading from a version before MySQL 4.1, then this
|
|
function performs the necessary update operations to support
|
|
innodb_file_per_table. If we are in a crash recovery, this function
|
|
loads the pages from double write buffer which are not older than
|
|
the checkpoint into memory.
|
|
@param file File handle
|
|
@param path Path name of file
|
|
@return DB_SUCCESS or error code */
|
|
dberr_t init_or_load_pages(pfs_os_file_t file, const char *path);
|
|
|
|
/** Process and remove the double write buffer pages for all tablespaces. */
|
|
void recover();
|
|
|
|
/** Update the doublewrite buffer on data page write completion. */
|
|
void write_completed();
|
|
/** Flush possible buffered writes to persistent storage.
|
|
It is very important to call this function after a batch of writes has been
|
|
posted, and also when we may have to wait for a page latch!
|
|
Otherwise a deadlock of threads can occur. */
|
|
void flush_buffered_writes();
|
|
/** Update the doublewrite buffer on write batch completion
|
|
@param request the completed batch write request */
|
|
void flush_buffered_writes_completed(const IORequest &request);
|
|
|
|
/** Schedule a page write. If the doublewrite memory buffer is full,
|
|
flush_buffered_writes() will be invoked to make space.
|
|
@param request asynchronous write request
|
|
@param size payload size in bytes */
|
|
void add_to_batch(const IORequest &request, size_t size);
|
|
|
|
/** Determine whether the doublewrite buffer has been created */
|
|
bool is_created() const
|
|
{ return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
|
|
|
|
/** @return whether the doublewrite buffer is in use */
|
|
bool in_use() const { return is_created() && use; }
|
|
/** @return whether fsync() is needed on non-doublewrite pages */
|
|
bool need_fsync() const { return use < USE_FAST; }
|
|
|
|
void set_use(ulong use)
|
|
{
|
|
ut_ad(use <= USE_FAST);
|
|
mysql_mutex_lock(&mutex);
|
|
this->use= use;
|
|
mysql_mutex_unlock(&mutex);
|
|
}
|
|
|
|
/** @return whether a page identifier is part of the doublewrite buffer */
|
|
bool is_inside(const page_id_t id) const
|
|
{
|
|
if (!is_created())
|
|
return false;
|
|
ut_ad(block1 < block2);
|
|
if (id < block1)
|
|
return false;
|
|
return id < block1 + block_size ||
|
|
(id >= block2 && id < block2 + block_size);
|
|
}
|
|
|
|
/** Wait for flush_buffered_writes() to be fully completed */
|
|
void wait_flush_buffered_writes()
|
|
{
|
|
mysql_mutex_lock(&mutex);
|
|
while (batch_running)
|
|
my_cond_wait(&cond, &mutex.m_mutex);
|
|
mysql_mutex_unlock(&mutex);
|
|
}
|
|
};
|
|
|
|
/** The doublewrite buffer */
|
|
/* Declaration only: the global doublewrite buffer object is defined
outside this header. */
extern buf_dblwr_t buf_dblwr;
|