mirror of
https://github.com/MariaDB/server.git
synced 2025-10-24 16:38:14 +02:00
864 lines
29 KiB
C++
864 lines
29 KiB
C++
/*****************************************************************************
|
|
|
|
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
|
|
Copyright (c) 2013, 2022, MariaDB Corporation.
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
Foundation; version 2 of the License.
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
|
|
|
*****************************************************************************/
|
|
|
|
/**************************************************//**
|
|
@file buf/buf0dblwr.cc
|
|
Doublewrite buffer module
|
|
|
|
Created 2011/12/19
|
|
*******************************************************/
|
|
|
|
#include "buf0dblwr.h"
|
|
#include "buf0flu.h"
|
|
#include "buf0checksum.h"
|
|
#include "srv0start.h"
|
|
#include "srv0srv.h"
|
|
#include "page0zip.h"
|
|
#include "trx0sys.h"
|
|
#include "fil0crypt.h"
|
|
#include "fil0pagecompress.h"
|
|
#include "log.h"
|
|
|
|
using st_::span;
|
|
|
|
/** The doublewrite buffer: the global instance that the buf_dblwr_t
member functions in this file operate on (several of them assert
this == &buf_dblwr). */
|
|
buf_dblwr_t buf_dblwr;
|
|
|
|
/** @return the TRX_SYS page */
|
|
inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr) noexcept
|
|
{
|
|
return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
|
|
0, RW_X_LATCH, mtr);
|
|
}
|
|
|
|
void buf_dblwr_t::init() noexcept
|
|
{
|
|
if (!active_slot)
|
|
{
|
|
active_slot= &slots[0];
|
|
mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
|
|
pthread_cond_init(&cond, nullptr);
|
|
block_size= FSP_EXTENT_SIZE;
|
|
}
|
|
}
|
|
|
|
/** Initialise the persistent storage of the doublewrite buffer.
|
|
@param header doublewrite page header in the TRX_SYS page */
|
|
inline void buf_dblwr_t::init(const byte *header) noexcept
|
|
{
|
|
ut_ad(!active_slot->first_free);
|
|
ut_ad(!active_slot->reserved);
|
|
ut_ad(!batch_running);
|
|
|
|
block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
|
|
block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
|
|
|
|
const uint32_t buf_size= 2 * block_size;
|
|
for (int i= 0; i < 2; i++)
|
|
{
|
|
slots[i].write_buf= static_cast<byte*>
|
|
(aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
|
|
slots[i].buf_block_arr= static_cast<element*>
|
|
(ut_zalloc_nokey(buf_size * sizeof(element)));
|
|
}
|
|
active_slot= &slots[0];
|
|
}
|
|
|
|
/** Create or restore the doublewrite buffer in the TRX_SYS page.
|
|
@return whether the operation succeeded */
|
|
bool buf_dblwr_t::create() noexcept
|
|
{
|
|
if (is_created())
|
|
return true;
|
|
|
|
mtr_t mtr;
|
|
const ulint size= block_size;
|
|
|
|
start_again:
|
|
mtr.start();
|
|
|
|
dberr_t err;
|
|
buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
|
|
if (!trx_sys_block)
|
|
{
|
|
mtr.commit();
|
|
return false;
|
|
}
|
|
|
|
if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
|
|
trx_sys_block->page.frame) ==
|
|
TRX_SYS_DOUBLEWRITE_MAGIC_N)
|
|
{
|
|
/* The doublewrite buffer has already been created: just read in
|
|
some numbers */
|
|
init(TRX_SYS_DOUBLEWRITE + trx_sys_block->page.frame);
|
|
mtr.commit();
|
|
return true;
|
|
}
|
|
|
|
if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
|
|
{
|
|
sql_print_error("InnoDB: Cannot create doublewrite buffer: "
|
|
"the first file in innodb_data_file_path must be at least "
|
|
"%zuM.", 3 * (size >> (20U - srv_page_size_shift)));
|
|
fail:
|
|
mtr.commit();
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
buf_block_t *b= fseg_create(fil_system.sys_space,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
|
|
&mtr, &err, false, trx_sys_block);
|
|
if (!b)
|
|
{
|
|
sql_print_error("InnoDB: Cannot create doublewrite buffer: %s",
|
|
ut_strerr(err));
|
|
goto fail;
|
|
}
|
|
|
|
sql_print_information("InnoDB: Doublewrite buffer not found:"
|
|
" creating new");
|
|
|
|
/* FIXME: After this point, the doublewrite buffer creation
|
|
is not atomic. The doublewrite buffer should not exist in
|
|
the InnoDB system tablespace file in the first place.
|
|
It could be located in separate optional file(s) in a
|
|
user-specified location. */
|
|
}
|
|
|
|
byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
|
|
trx_sys_block->page.frame;
|
|
for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
|
|
i < 2 * size + extent_size / 2; i++)
|
|
{
|
|
buf_block_t *new_block=
|
|
fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP,
|
|
false, &mtr, &mtr, &err);
|
|
if (!new_block)
|
|
{
|
|
sql_print_error("InnoDB: Cannot create doublewrite buffer: "
|
|
" you must increase your tablespace size."
|
|
" Cannot continue operation.");
|
|
/* This may essentially corrupt the doublewrite
|
|
buffer. However, usually the doublewrite buffer
|
|
is created at database initialization, and it
|
|
should not matter (just remove all newly created
|
|
InnoDB files and restart). */
|
|
mtr.commit();
|
|
return false;
|
|
}
|
|
|
|
/* We read the allocated pages to the buffer pool; when they are
|
|
written to disk in a flush, the space id and page number fields
|
|
are also written to the pages. When we at database startup read
|
|
pages from the doublewrite buffer, we know that if the space id
|
|
and page number in them are the same as the page position in the
|
|
tablespace, then the page has not been written to in
|
|
doublewrite. */
|
|
|
|
ut_ad(new_block->page.lock.not_recursive());
|
|
const page_id_t id= new_block->page.id();
|
|
/* We only do this in the debug build, to ensure that the check in
|
|
buf_flush_init_for_writing() will see a valid page type. The
|
|
flushes of new_block are actually unnecessary here. */
|
|
ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame,
|
|
FIL_PAGE_TYPE_SYS));
|
|
|
|
if (i == size / 2)
|
|
{
|
|
ut_a(id.page_no() == size);
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
|
|
trx_sys_block->page.frame, id.page_no());
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
|
|
TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame,
|
|
id.page_no());
|
|
}
|
|
else if (i == size / 2 + size)
|
|
{
|
|
ut_a(id.page_no() == 2 * size);
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
|
|
trx_sys_block->page.frame, id.page_no());
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
|
|
TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame,
|
|
id.page_no());
|
|
}
|
|
else if (i > size / 2)
|
|
ut_a(id.page_no() == prev_page_no + 1);
|
|
|
|
if (((i + 1) & 15) == 0) {
|
|
/* rw_locks can only be recursively x-locked 2048 times. (on 32
|
|
bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a
|
|
negative number, and thus lock_word becomes like a shared lock).
|
|
For 4k page size this loop will lock the fseg header too many
|
|
times. Since this code is not done while any other threads are
|
|
active, restart the MTR occasionally. */
|
|
mtr.commit();
|
|
mtr.start();
|
|
trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
|
|
fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
|
|
trx_sys_block->page.frame;
|
|
}
|
|
|
|
prev_page_no= id.page_no();
|
|
}
|
|
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
|
|
trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
|
|
TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame,
|
|
TRX_SYS_DOUBLEWRITE_MAGIC_N);
|
|
|
|
mtr.write<4>(*trx_sys_block,
|
|
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
|
|
trx_sys_block->page.frame,
|
|
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
|
|
mtr.commit();
|
|
|
|
buf_flush_wait_flushed(mtr.commit_lsn());
|
|
|
|
/* Remove doublewrite pages from LRU */
|
|
buf_pool_invalidate();
|
|
|
|
sql_print_information("InnoDB: Doublewrite buffer created");
|
|
goto start_again;
|
|
}
|
|
|
|
/** Initialize the doublewrite buffer memory structure on recovery.
|
|
If we are upgrading from a version before MySQL 4.1, then this
|
|
function performs the necessary update operations to support
|
|
innodb_file_per_table. If we are in a crash recovery, this function
|
|
loads the pages from double write buffer into memory.
|
|
@param file File handle
|
|
@param path Path name of file
|
|
@return DB_SUCCESS or error code */
|
|
dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
|
|
noexcept
|
|
{
|
|
ut_ad(this == &buf_dblwr);
|
|
const uint32_t size= block_size;
|
|
|
|
/* We do the file i/o past the buffer pool */
|
|
byte *read_buf= static_cast<byte*>(aligned_malloc(srv_page_size,
|
|
srv_page_size));
|
|
/* Read the TRX_SYS header to check if we are using the doublewrite buffer */
|
|
dberr_t err= os_file_read(IORequestRead, file, read_buf,
|
|
TRX_SYS_PAGE_NO << srv_page_size_shift,
|
|
srv_page_size, nullptr);
|
|
|
|
if (err != DB_SUCCESS)
|
|
{
|
|
sql_print_error("InnoDB: Failed to read the system tablespace"
|
|
" header page");
|
|
func_exit:
|
|
aligned_free(read_buf);
|
|
return err;
|
|
}
|
|
|
|
/* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */
|
|
if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE +
|
|
read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N)
|
|
{
|
|
/* There is no doublewrite buffer initialized in the TRX_SYS page.
|
|
This should normally not be possible; the doublewrite buffer should
|
|
be initialized when creating the database. */
|
|
err= DB_SUCCESS;
|
|
goto func_exit;
|
|
}
|
|
|
|
init(TRX_SYS_DOUBLEWRITE + read_buf);
|
|
|
|
const bool upgrade_to_innodb_file_per_table=
|
|
!srv_read_only_mode &&
|
|
mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
|
|
TRX_SYS_DOUBLEWRITE + read_buf) !=
|
|
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N;
|
|
|
|
auto write_buf= active_slot->write_buf;
|
|
/* Read the pages from the doublewrite buffer to memory */
|
|
err= os_file_read(IORequestRead, file, write_buf,
|
|
block1.page_no() << srv_page_size_shift,
|
|
size << srv_page_size_shift, nullptr);
|
|
|
|
if (err != DB_SUCCESS)
|
|
{
|
|
sql_print_error("InnoDB: Failed to read"
|
|
" the first double write buffer extent");
|
|
goto func_exit;
|
|
}
|
|
|
|
err= os_file_read(IORequestRead, file,
|
|
write_buf + (size << srv_page_size_shift),
|
|
block2.page_no() << srv_page_size_shift,
|
|
size << srv_page_size_shift, nullptr);
|
|
if (err != DB_SUCCESS)
|
|
{
|
|
sql_print_error("InnoDB: Failed to read"
|
|
" the second double write buffer extent");
|
|
goto func_exit;
|
|
}
|
|
|
|
byte *page= write_buf;
|
|
|
|
if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table))
|
|
{
|
|
sql_print_information("InnoDB: Resetting space id's in "
|
|
"the doublewrite buffer");
|
|
|
|
for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
|
|
{
|
|
memset(page + FIL_PAGE_SPACE_ID, 0, 4);
|
|
/* For pre-MySQL-4.1 innodb_checksum_algorithm=innodb, we do not need to
|
|
calculate new checksums for the pages because the field
|
|
.._SPACE_ID does not affect them. Write the page back to where
|
|
we read it from. */
|
|
const ulint source_page_no= i < size
|
|
? block1.page_no() + i
|
|
: block2.page_no() + i - size;
|
|
err= os_file_write(IORequestWrite, path, file, page,
|
|
source_page_no << srv_page_size_shift, srv_page_size);
|
|
if (err != DB_SUCCESS)
|
|
{
|
|
sql_print_error("InnoDB: Failed to upgrade the double write buffer");
|
|
goto func_exit;
|
|
}
|
|
}
|
|
os_file_flush(file);
|
|
}
|
|
else
|
|
{
|
|
alignas(8) char checkpoint[8];
|
|
mach_write_to_8(checkpoint, log_sys.next_checkpoint_lsn);
|
|
for (auto i= size * 2; i--; page += srv_page_size)
|
|
if (memcmp_aligned<8>(page + FIL_PAGE_LSN, checkpoint, 8) >= 0)
|
|
/* Valid pages are not older than the log checkpoint. */
|
|
recv_sys.dblwr.add(page);
|
|
}
|
|
err= DB_SUCCESS;
|
|
goto func_exit;
|
|
}
|
|
|
|
/** Process and remove the double write buffer pages for all tablespaces. */
|
|
void buf_dblwr_t::recover() noexcept
|
|
{
|
|
ut_ad(log_sys.last_checkpoint_lsn);
|
|
if (!is_created())
|
|
return;
|
|
const lsn_t max_lsn{log_sys.get_lsn()};
|
|
ut_ad(recv_sys.scanned_lsn == max_lsn);
|
|
ut_ad(recv_sys.scanned_lsn >= recv_sys.lsn);
|
|
|
|
uint32_t page_no_dblwr= 0;
|
|
byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
|
|
srv_page_size));
|
|
byte *const buf= read_buf + srv_page_size;
|
|
|
|
std::deque<byte*> deferred_pages;
|
|
for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
|
|
i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
|
|
{
|
|
const page_t *const page= *i;
|
|
const uint32_t page_no= page_get_page_no(page);
|
|
const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
|
|
if (log_sys.last_checkpoint_lsn > lsn || lsn > recv_sys.lsn)
|
|
/* Pages written before or after the recovery range are not usable. */
|
|
continue;
|
|
const uint32_t space_id= page_get_space_id(page);
|
|
const page_id_t page_id(space_id, page_no);
|
|
|
|
fil_space_t *space= fil_space_t::get(space_id);
|
|
|
|
if (!space)
|
|
{
|
|
/* These pages does not appear to belong to any tablespace.
|
|
There is a possibility that this page could be
|
|
encrypted/compressed using full_crc32 format.
|
|
If innodb encounters any corrupted encrypted/compressed
|
|
page during recovery then InnoDB should use this page to
|
|
find the valid page.
|
|
See find_encrypted_page()/find_page_compressed() */
|
|
deferred_pages.push_back(*i);
|
|
continue;
|
|
}
|
|
|
|
if (UNIV_UNLIKELY(page_no >= space->get_size()))
|
|
{
|
|
/* Do not report the warning for undo tablespaces, because they
|
|
can be truncated in place. */
|
|
if (!srv_is_undo_tablespace(space_id))
|
|
sql_print_warning("InnoDB: A copy of page "
|
|
"[page id: space=" UINT32PF
|
|
", page number=" UINT32PF "]"
|
|
" in the doublewrite buffer slot " UINT32PF
|
|
" is beyond the end of %s (" UINT32PF " pages)",
|
|
page_id.space(), page_id.page_no(),
|
|
page_no_dblwr, space->chain.start->name,
|
|
space->size);
|
|
next_page:
|
|
space->release();
|
|
continue;
|
|
}
|
|
|
|
const ulint physical_size= space->physical_size();
|
|
ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
|
|
|
|
/* We want to ensure that for partial reads the unread portion of
|
|
the page is NUL. */
|
|
memset(read_buf, 0x0, physical_size);
|
|
|
|
/* Read in the actual page from the file */
|
|
fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
|
|
os_offset_t{page_no} * physical_size,
|
|
physical_size, read_buf);
|
|
|
|
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
|
|
sql_print_warning("InnoDB: Double write buffer recovery: "
|
|
"[page id: space=" UINT32PF
|
|
", page number=" UINT32PF "]"
|
|
" ('%s') read failed with error: %s",
|
|
page_id.space(), page_id.page_no(), fio.node->name,
|
|
ut_strerr(fio.err));
|
|
else if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
|
|
{
|
|
/* We will check if the copy in the doublewrite buffer is
|
|
valid. If not, we will ignore this page (there should be redo
|
|
log records to initialize it). */
|
|
}
|
|
else if (recv_sys.dblwr.validate_page(page_id, max_lsn, space,
|
|
read_buf, buf))
|
|
goto next_page;
|
|
else
|
|
/* We intentionally skip this message for all-zero pages. */
|
|
sql_print_information("InnoDB: Trying to recover page "
|
|
"[page id: space=" UINT32PF
|
|
", page number=" UINT32PF "]"
|
|
" from the doublewrite buffer.",
|
|
page_id.space(), page_id.page_no());
|
|
|
|
if (const byte *page=
|
|
recv_sys.dblwr.find_page(page_id, max_lsn, space, buf))
|
|
{
|
|
/* Write the good page from the doublewrite buffer to the intended
|
|
position. */
|
|
space->reacquire();
|
|
fio= space->io(IORequestWrite,
|
|
os_offset_t{page_id.page_no()} * physical_size,
|
|
physical_size, const_cast<byte*>(page));
|
|
|
|
if (fio.err == DB_SUCCESS)
|
|
sql_print_information("InnoDB: Recovered page "
|
|
"[page id: space=" UINT32PF
|
|
", page number=" UINT32PF "]"
|
|
" to '%s' from the doublewrite buffer.",
|
|
page_id.space(), page_id.page_no(),
|
|
fio.node->name);
|
|
}
|
|
|
|
goto next_page;
|
|
}
|
|
|
|
recv_sys.dblwr.pages.clear();
|
|
for (byte *page : deferred_pages)
|
|
recv_sys.dblwr.pages.push_back(page);
|
|
fil_flush_file_spaces();
|
|
aligned_free(read_buf);
|
|
}
|
|
|
|
/** Free the doublewrite buffer. */
|
|
void buf_dblwr_t::close() noexcept
|
|
{
|
|
if (!active_slot)
|
|
return;
|
|
|
|
ut_ad(!active_slot->reserved);
|
|
ut_ad(!active_slot->first_free);
|
|
ut_ad(!batch_running);
|
|
|
|
pthread_cond_destroy(&cond);
|
|
for (int i= 0; i < 2; i++)
|
|
{
|
|
aligned_free(slots[i].write_buf);
|
|
ut_free(slots[i].buf_block_arr);
|
|
}
|
|
mysql_mutex_destroy(&mutex);
|
|
|
|
memset((void*) this, 0, sizeof *this);
|
|
}
|
|
|
|
/** Update the doublewrite buffer on write completion. */
|
|
void buf_dblwr_t::write_completed() noexcept
|
|
{
|
|
ut_ad(this == &buf_dblwr);
|
|
ut_ad(!srv_read_only_mode);
|
|
|
|
mysql_mutex_lock(&mutex);
|
|
|
|
ut_ad(is_created());
|
|
ut_ad(batch_running);
|
|
slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
|
|
ut_ad(flush_slot->reserved);
|
|
ut_ad(flush_slot->reserved <= flush_slot->first_free);
|
|
|
|
if (!--flush_slot->reserved)
|
|
{
|
|
mysql_mutex_unlock(&mutex);
|
|
/* This will finish the batch. Sync data files to the disk. */
|
|
fil_flush_file_spaces();
|
|
mysql_mutex_lock(&mutex);
|
|
|
|
/* We can now reuse the doublewrite memory buffer: */
|
|
flush_slot->first_free= 0;
|
|
batch_running= false;
|
|
pthread_cond_broadcast(&cond);
|
|
}
|
|
|
|
mysql_mutex_unlock(&mutex);
|
|
}
|
|
|
|
#ifdef UNIV_DEBUG
|
|
/** Check the LSN values on the page.
|
|
@param[in] page page to check
|
|
@param[in] s tablespace */
|
|
static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
|
|
noexcept
|
|
{
|
|
/* Ignore page_compressed or encrypted pages */
|
|
if (s.is_compressed() || buf_page_get_key_version(page, s.flags))
|
|
return;
|
|
const byte* lsn_start= FIL_PAGE_LSN + 4 + page;
|
|
const byte* lsn_end= page + srv_page_size -
|
|
(s.full_crc32()
|
|
? FIL_PAGE_FCRC32_END_LSN
|
|
: FIL_PAGE_END_LSN_OLD_CHKSUM - 4);
|
|
static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
|
|
static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
|
|
ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4));
|
|
}
|
|
|
|
static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
|
|
noexcept
|
|
{
|
|
if (fil_space_t *space= fil_space_t::get_for_write(b.id().space()))
|
|
{
|
|
buf_dblwr_check_page_lsn(page, *space);
|
|
space->release();
|
|
}
|
|
}
|
|
|
|
/** Check the LSN values on the page with which this block is associated. */
|
|
static void buf_dblwr_check_block(const buf_page_t *bpage) noexcept
|
|
{
|
|
ut_ad(bpage->in_file());
|
|
const page_t *page= bpage->frame;
|
|
ut_ad(page);
|
|
|
|
switch (fil_page_get_type(page)) {
|
|
case FIL_PAGE_INDEX:
|
|
case FIL_PAGE_TYPE_INSTANT:
|
|
case FIL_PAGE_RTREE:
|
|
if (page_is_comp(page))
|
|
{
|
|
if (page_simple_validate_new(page))
|
|
return;
|
|
}
|
|
else if (page_simple_validate_old(page))
|
|
return;
|
|
/* While it is possible that this is not an index page but just
|
|
happens to have wrongly set FIL_PAGE_TYPE, such pages should never
|
|
be modified to without also adjusting the page type during page
|
|
allocation or buf_flush_init_for_writing() or
|
|
fil_block_reset_type(). */
|
|
buf_page_print(page);
|
|
|
|
ib::fatal() << "Apparent corruption of an index page " << bpage->id()
|
|
<< " to be written to data file. We intentionally crash"
|
|
" the server to prevent corrupt data from ending up in"
|
|
" data files.";
|
|
}
|
|
}
|
|
#endif /* UNIV_DEBUG */
|
|
|
|
/** Write the state of the doublewrite buffer (whether a batch is
running, and the first_free and reserved counters of both slots) to the
server log. The caller must hold the mutex. */
ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept
{
  mysql_mutex_assert_owner(&mutex);

  /* The slot being flushed is the one that is not active. */
  const slot *flush_slot= &slots[0];
  if (active_slot == flush_slot)
    flush_slot= &slots[1];

  const char *const running= batch_running ? "true" : "false";

  sql_print_information("InnoDB: Double Write State\n"
                        "-------------------\n"
                        "Batch running : %s\n"
                        "Active Slot - first_free: %zu reserved: %zu\n"
                        "Flush Slot - first_free: %zu reserved: %zu\n"
                        "-------------------",
                        running,
                        active_slot->first_free, active_slot->reserved,
                        flush_slot->first_free, flush_slot->reserved);
}
|
|
|
|
bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept
|
|
{
|
|
mysql_mutex_assert_owner(&mutex);
|
|
ut_ad(size == block_size);
|
|
|
|
const size_t max_count= 60 * 60;
|
|
const size_t first_log_count= 30;
|
|
const size_t fatal_threshold=
|
|
static_cast<size_t>(srv_fatal_semaphore_wait_threshold);
|
|
size_t log_count= first_log_count;
|
|
|
|
for (size_t count= 0;;)
|
|
{
|
|
if (!active_slot->first_free)
|
|
return false;
|
|
if (!batch_running)
|
|
break;
|
|
|
|
timespec abstime;
|
|
set_timespec(abstime, 1);
|
|
my_cond_timedwait(&cond, &mutex.m_mutex, &abstime);
|
|
|
|
if (count > fatal_threshold)
|
|
{
|
|
buf_pool.print_flush_info();
|
|
print_info();
|
|
ib::fatal() << "InnoDB: Long wait (" << count
|
|
<< " seconds) for double-write buffer flush.";
|
|
}
|
|
else if (++count < first_log_count && !(count % 5))
|
|
{
|
|
sql_print_information("InnoDB: Long wait (%zu seconds) for double-write"
|
|
" buffer flush.", count);
|
|
buf_pool.print_flush_info();
|
|
print_info();
|
|
}
|
|
else if (!(count % log_count))
|
|
{
|
|
sql_print_warning("InnoDB: Long wait (%zu seconds) for double-write"
|
|
" buffer flush.", count);
|
|
buf_pool.print_flush_info();
|
|
print_info();
|
|
log_count= log_count >= max_count ? max_count : log_count * 2;
|
|
}
|
|
}
|
|
ut_ad(active_slot->reserved == active_slot->first_free);
|
|
ut_ad(!flushing_buffered_writes);
|
|
|
|
/* Disallow anyone else to start another batch of flushing. */
|
|
slot *flush_slot= active_slot;
|
|
/* Switch the active slot */
|
|
active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
|
|
ut_a(active_slot->first_free == 0);
|
|
batch_running= true;
|
|
const ulint old_first_free= flush_slot->first_free;
|
|
auto write_buf= flush_slot->write_buf;
|
|
const bool multi_batch= block1 + static_cast<uint32_t>(size) != block2 &&
|
|
old_first_free > size;
|
|
flushing_buffered_writes= 1 + multi_batch;
|
|
/* Now safe to release the mutex. */
|
|
mysql_mutex_unlock(&mutex);
|
|
#ifdef UNIV_DEBUG
|
|
for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++)
|
|
{
|
|
buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage;
|
|
|
|
if (bpage->zip.data)
|
|
/* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */
|
|
continue;
|
|
|
|
/* Check that the actual page in the buffer pool is not corrupt
|
|
and the LSN values are sane. */
|
|
buf_dblwr_check_block(bpage);
|
|
ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
|
|
}
|
|
#endif /* UNIV_DEBUG */
|
|
const IORequest request{nullptr, nullptr, fil_system.sys_space->chain.start,
|
|
IORequest::DBLWR_BATCH};
|
|
ut_a(fil_system.sys_space->acquire());
|
|
if (multi_batch)
|
|
{
|
|
fil_system.sys_space->reacquire();
|
|
os_aio(request, write_buf,
|
|
os_offset_t{block1.page_no()} << srv_page_size_shift,
|
|
size << srv_page_size_shift);
|
|
os_aio(request, write_buf + (size << srv_page_size_shift),
|
|
os_offset_t{block2.page_no()} << srv_page_size_shift,
|
|
(old_first_free - size) << srv_page_size_shift);
|
|
}
|
|
else
|
|
os_aio(request, write_buf,
|
|
os_offset_t{block1.page_no()} << srv_page_size_shift,
|
|
old_first_free << srv_page_size_shift);
|
|
return true;
|
|
}
|
|
|
|
static void *get_frame(const IORequest &request) noexcept
|
|
{
|
|
if (request.slot)
|
|
return request.slot->out_buf;
|
|
const buf_page_t *bpage= request.bpage;
|
|
return bpage->zip.data ? bpage->zip.data : bpage->frame;
|
|
}
|
|
|
|
void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
|
|
noexcept
|
|
{
|
|
ut_ad(this == &buf_dblwr);
|
|
ut_ad(is_created());
|
|
ut_ad(!srv_read_only_mode);
|
|
ut_ad(!request.bpage);
|
|
ut_ad(request.node == fil_system.sys_space->chain.start);
|
|
ut_ad(request.type == IORequest::DBLWR_BATCH);
|
|
mysql_mutex_lock(&mutex);
|
|
ut_ad(batch_running);
|
|
ut_ad(flushing_buffered_writes);
|
|
ut_ad(flushing_buffered_writes <= 2);
|
|
writes_completed++;
|
|
if (UNIV_UNLIKELY(--flushing_buffered_writes))
|
|
{
|
|
mysql_mutex_unlock(&mutex);
|
|
return;
|
|
}
|
|
|
|
slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
|
|
ut_ad(flush_slot->reserved == flush_slot->first_free);
|
|
/* increment the doublewrite flushed pages counter */
|
|
pages_written+= flush_slot->first_free;
|
|
mysql_mutex_unlock(&mutex);
|
|
|
|
/* Make the doublewrite durable. Note: The doublewrite buffer is
|
|
always in the first file of the system tablespace. We will not
|
|
bother about fil_system.unflushed_spaces, which can result in a
|
|
redundant call during fil_flush_file_spaces() in
|
|
log_checkpoint(). Writes to the system tablespace should be rare,
|
|
except when executing DDL or using the non-default settings
|
|
innodb_file_per_table=OFF or innodb_undo_tablespaces=0. */
|
|
os_file_flush(request.node->handle);
|
|
|
|
/* The writes have been flushed to disk now and in recovery we will
|
|
find them in the doublewrite buffer blocks. Next, write the data pages. */
|
|
for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
|
|
{
|
|
auto e= flush_slot->buf_block_arr[i];
|
|
buf_page_t* bpage= e.request.bpage;
|
|
ut_ad(bpage->in_file());
|
|
|
|
void *frame= get_frame(e.request);
|
|
ut_ad(frame);
|
|
|
|
auto e_size= e.size;
|
|
|
|
if (UNIV_LIKELY_NULL(bpage->zip.data))
|
|
{
|
|
e_size= bpage->zip_size();
|
|
ut_ad(e_size);
|
|
}
|
|
else
|
|
{
|
|
ut_ad(!bpage->zip_size());
|
|
ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
|
|
}
|
|
|
|
const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
|
|
(FIL_PAGE_LSN +
|
|
static_cast<const byte*>(frame)));
|
|
ut_ad(lsn);
|
|
ut_ad(lsn >= bpage->oldest_modification());
|
|
log_write_up_to(lsn, true);
|
|
e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
|
|
frame, bpage);
|
|
}
|
|
}
|
|
|
|
/** Flush possible buffered writes to persistent storage.
|
|
It is very important to call this function after a batch of writes has been
|
|
posted, and also when we may have to wait for a page latch!
|
|
Otherwise a deadlock of threads can occur. */
|
|
void buf_dblwr_t::flush_buffered_writes() noexcept
|
|
{
|
|
mysql_mutex_lock(&mutex);
|
|
|
|
if (!in_use() && active_slot->first_free == 0)
|
|
{
|
|
mysql_mutex_unlock(&mutex);
|
|
fil_flush_file_spaces();
|
|
return;
|
|
}
|
|
|
|
ut_ad(!srv_read_only_mode);
|
|
|
|
if (!flush_buffered_writes(block_size))
|
|
mysql_mutex_unlock(&mutex);
|
|
}
|
|
|
|
/** Schedule a page write. If the doublewrite memory buffer is full,
|
|
flush_buffered_writes() will be invoked to make space.
|
|
@param request asynchronous write request
|
|
@param size payload size in bytes */
|
|
void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) noexcept
|
|
{
|
|
ut_ad(request.bpage);
|
|
ut_ad(request.bpage->in_file());
|
|
ut_ad(request.node);
|
|
ut_ad(!request.node->space->is_temporary());
|
|
ut_ad(!request.node->space->is_being_imported());
|
|
ut_ad(request.node->space->id == request.bpage->id().space());
|
|
ut_ad(request.node->space->referenced());
|
|
ut_ad(!srv_read_only_mode);
|
|
|
|
const ulint buf_size= 2 * block_size;
|
|
|
|
mysql_mutex_lock(&mutex);
|
|
|
|
for (;;)
|
|
{
|
|
ut_ad(active_slot->first_free <= buf_size);
|
|
if (active_slot->first_free != buf_size)
|
|
break;
|
|
|
|
if (flush_buffered_writes(buf_size / 2))
|
|
mysql_mutex_lock(&mutex);
|
|
}
|
|
|
|
byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
|
|
|
|
/* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
|
|
and at least srv_page_size (4096-byte) for everything else. */
|
|
memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, get_frame(request), size);
|
|
/* fil_page_compress() for page_compressed guarantees 256-byte alignment */
|
|
memset_aligned<256>(p + size, 0, srv_page_size - size);
|
|
/* FIXME: Inform the compiler that "size" and "srv_page_size - size"
|
|
are integer multiples of 256, so the above can translate into simple
|
|
SIMD instructions. Currently, we make no such assumptions about the
|
|
non-pointer parameters that are passed to the _aligned templates. */
|
|
ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
|
|
ut_ad(active_slot->reserved == active_slot->first_free);
|
|
ut_ad(active_slot->reserved < buf_size);
|
|
new (active_slot->buf_block_arr + active_slot->first_free++)
|
|
element{request.doublewritten(), size};
|
|
active_slot->reserved= active_slot->first_free;
|
|
|
|
if (active_slot->first_free != buf_size ||
|
|
!flush_buffered_writes(buf_size / 2))
|
|
mysql_mutex_unlock(&mutex);
|
|
}
|