MDEV-27416 InnoDB hang in buf_flush_wait_flushed(), on log checkpoint

InnoDB could sometimes hang when triggering a log checkpoint. This is
due to commit 7b1252c03d (MDEV-24278),
which introduced an untimed wait to buf_flush_page_cleaner().

The hang was noticed by occasional failures of IMPORT TABLESPACE tests,
such as innodb.innodb-wl5522, which would (unnecessarily) invoke
log_make_checkpoint() from row_import_cleanup().

The reason of the hang was that buf_flush_page_cleaner() would enter
untimed sleep despite buf_flush_sync_lsn being set. The exact failure
scenario is unclear, because buf_flush_sync_lsn should actually be
protected by buf_pool.flush_list_mutex. We prevent the hang by
invoking buf_pool.page_cleaner_set_idle(false) whenever we are
setting buf_flush_sync_lsn and signaling buf_pool.do_flush_list.

The bulk of these changes was originally developed as a preparation
for MDEV-26827, to invoke buf_flush_list() from fewer threads,
and tested on 10.6 by Matthias Leich.

This fix was tested by running 100 repetitions of 100 concurrent instances
of the test innodb.innodb-wl5522 on a RelWithDebInfo build, using ext4fs
and innodb_flush_method=O_DIRECT on a SATA SSD with 4096-byte block size.
During the test, the call to log_make_checkpoint() in row_import_cleanup()
was present.

buf_flush_list(): Make static.

buf_flush_wait(): Wait for buf_pool.get_oldest_modification()
to reach a target, by work done in the buf_flush_page_cleaner.
If buf_flush_sync_lsn is going to be set, we will invoke
buf_pool.page_cleaner_set_idle(false).

buf_flush_ahead(): If buf_flush_sync_lsn or buf_flush_async_lsn
is going to be set and the page cleaner woken up, we will invoke
buf_pool.page_cleaner_set_idle(false).

buf_flush_wait_flushed(): Invoke buf_flush_wait().

buf_flush_sync(): Invoke recv_sys.apply() at the start in case
crash recovery is active. Invoke buf_flush_wait().

buf_flush_sync_batch(): A lower-level variant of buf_flush_sync()
that is only called by recv_sys_t::apply().

buf_flush_sync_for_checkpoint(): Do not trigger log apply
or checkpoint during recovery.

buf_dblwr_t::create(): Only initiate a buffer pool flush, not
a checkpoint.

row_import_cleanup(): Do not unnecessarily invoke log_make_checkpoint().
Invoking buf_flush_list_space() before starting to generate redo log
for the imported tablespace should suffice.

srv_prepare_to_delete_redo_log_file():
Set recv_sys.recovery_on in order to prevent
buf_flush_sync_for_checkpoint() from initiating a checkpoint
while the log is inaccessible. Remove a wait loop that is already
part of buf_flush_sync().
Do not invoke fil_names_clear() if the log is being upgraded,
because the FILE_MODIFY record is specific to the latest format.

create_log_file(): Clear recv_sys.recovery_on only after calling
log_make_checkpoint(), to prevent buf_flush_page_cleaner from
invoking a checkpoint.

innodb_shutdown(): Simplify the logic in mariadb-backup --prepare.

os_aio_wait_until_no_pending_writes(): Update the function comment.
Apart from row_quiesce_table_start() during FLUSH TABLES...FOR EXPORT,
this is being called by buf_flush_list_space(), which is invoked
by ALTER TABLE...IMPORT TABLESPACE as well as some encryption operations.
This commit is contained in:
Marko Mäkelä 2022-01-04 07:40:31 +02:00
commit 4c3ad24413
11 changed files with 155 additions and 139 deletions

View file

@ -329,6 +329,13 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
log_sys.log.write_header_durable(lsn);
ut_ad(srv_startup_is_before_trx_rollback_phase);
if (create_new_db) {
srv_startup_is_before_trx_rollback_phase = false;
}
/* Enable checkpoints in buf_flush_page_cleaner(). */
recv_sys.recovery_on = false;
mysql_mutex_unlock(&log_sys.mutex);
log_make_checkpoint();
@ -895,91 +902,74 @@ buffer pools. Flush the redo log buffer to the redo log file.
@return lsn upto which data pages have been flushed. */
static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
{
DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
lsn_t flushed_lsn;
ulint count = 0;
/* Disable checkpoints in the page cleaner. */
ut_ad(!recv_sys.recovery_on);
recv_sys.recovery_on= true;
if (log_sys.log.subformat != 2) {
srv_log_file_size = 0;
}
/* Clean the buffer pool. */
buf_flush_sync();
for (;;) {
/* Clean the buffer pool. */
buf_flush_sync();
if (log_sys.log.subformat != 2)
srv_log_file_size= 0;
DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
mysql_mutex_lock(&log_sys.mutex);
mysql_mutex_lock(&log_sys.mutex);
const bool latest_format= (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) ==
log_t::FORMAT_10_5;
lsn_t flushed_lsn= log_sys.get_lsn();
fil_names_clear(log_sys.get_lsn(), false);
if (latest_format)
{
fil_names_clear(flushed_lsn, false);
flushed_lsn= log_sys.get_lsn();
}
flushed_lsn = log_sys.get_lsn();
{
const char *msg;
if (!latest_format || srv_log_file_size == 0)
{
msg= "Upgrading redo log: ";
same_size:
ib::info() << msg << srv_log_file_size_requested << " bytes; LSN="
<< flushed_lsn;
}
else if (old_exists && srv_log_file_size == srv_log_file_size_requested)
{
msg= srv_encrypt_log
? "Encrypting redo log: " : "Removing redo log encryption: ";
goto same_size;
}
else
{
if (srv_encrypt_log == (my_bool)log_sys.is_encrypted())
msg= srv_encrypt_log ? "Resizing encrypted" : "Resizing";
else
msg= srv_encrypt_log
? "Encrypting and resizing"
: "Removing encryption and resizing";
{
ib::info info;
if (srv_log_file_size == 0
|| (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
!= log_t::FORMAT_10_5) {
info << "Upgrading redo log: ";
} else if (!old_exists
|| srv_log_file_size
!= srv_log_file_size_requested) {
if (srv_encrypt_log
== (my_bool)log_sys.is_encrypted()) {
info << (srv_encrypt_log
? "Resizing encrypted"
: "Resizing");
} else if (srv_encrypt_log) {
info << "Encrypting and resizing";
} else {
info << "Removing encryption"
" and resizing";
}
ib::info() << msg << " redo log from " << srv_log_file_size << " to "
<< srv_log_file_size_requested
<< " bytes; LSN=" << flushed_lsn;
}
}
info << " redo log from " << srv_log_file_size
<< " to ";
} else if (srv_encrypt_log) {
info << "Encrypting redo log: ";
} else {
info << "Removing redo log encryption: ";
}
mysql_mutex_unlock(&log_sys.mutex);
info << srv_log_file_size_requested
<< " bytes; LSN=" << flushed_lsn;
}
if (flushed_lsn != log_sys.get_flushed_lsn())
{
log_write_up_to(flushed_lsn, false);
log_sys.log.flush();
}
mysql_mutex_unlock(&log_sys.mutex);
ut_ad(flushed_lsn == log_sys.get_lsn());
ut_ad(!buf_pool.any_io_pending());
if (flushed_lsn != log_sys.get_flushed_lsn()) {
log_write_up_to(flushed_lsn, false);
log_sys.log.flush();
}
ut_ad(flushed_lsn == log_sys.get_lsn());
/* Check if the buffer pools are clean. If not
retry till it is clean. */
if (ulint pending_io = buf_pool.io_pending()) {
count++;
/* Print a message every 60 seconds if we
are waiting to clean the buffer pools */
if (srv_print_verbose_log && count > 600) {
ib::info() << "Waiting for "
<< pending_io << " buffer "
<< "page I/Os to complete";
count = 0;
}
os_thread_sleep(100000);
continue;
}
break;
}
DBUG_RETURN(flushed_lsn);
DBUG_RETURN(flushed_lsn);
}
/** Tries to locate LOG_FILE_NAME and check it's size, etc
@ -1259,7 +1249,7 @@ dberr_t srv_start(bool create_new_db)
ut_ad(buf_page_cleaner_is_active);
}
srv_startup_is_before_trx_rollback_phase = !create_new_db;
srv_startup_is_before_trx_rollback_phase = true;
/* Check if undo tablespaces and redo log files exist before creating
a new system tablespace */
@ -1308,7 +1298,6 @@ dberr_t srv_start(bool create_new_db)
if (create_new_db) {
flushed_lsn = log_sys.get_lsn();
log_sys.set_flushed_lsn(flushed_lsn);
buf_flush_sync();
err = create_log_file(true, flushed_lsn, logfile0);
@ -1371,6 +1360,9 @@ dberr_t srv_start(bool create_new_db)
if (!log_set_capacity(srv_log_file_size_requested)) {
return(srv_init_abort(DB_ERROR));
}
/* Enable checkpoints in the page cleaner. */
recv_sys.recovery_on = false;
}
file_checked:
@ -2024,11 +2016,8 @@ void innodb_shutdown()
break;
case SRV_OPERATION_RESTORE:
case SRV_OPERATION_RESTORE_EXPORT:
srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
if (!buf_page_cleaner_is_active) {
break;
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
while (buf_page_cleaner_is_active) {
pthread_cond_signal(&buf_pool.do_flush_list);
my_cond_wait(&buf_pool.done_flush_list,