MDEV-24302 follow-up: RESET MASTER hangs

As pointed out by Andrei Elkin, the previous fix did not fix one
race condition that may have caused the observed hang.

innodb_log_flush_request(): If we are enqueueing the very first
request at the same time the log write is being completed,
we must ensure that a near-concurrent call to log_flush_notify()
will not result in a missed notification. We guarantee this by
release-acquire operations on log_requests.start and
log_sys.flushed_to_disk_lsn.

log_flush_notify_and_unlock(): Cleanup: Always release the mutex.

log_sys_t::get_flushed_lsn(): Use acquire memory order.

log_sys_t::set_flushed_lsn(): Use release memory order.

log_sys_t::set_lsn(): Use release memory order.

log_sys_t::get_lsn(): Use relaxed memory order by default, and
allow the caller to specify acquire memory order explicitly.
Whenever the log_sys.mutex is being held or when log writes are
prohibited during startup, we can use a relaxed load. Likewise,
in some assertions where reading a stale value of log_sys.lsn
should not matter, we can use a relaxed load.

This will cause some additional instructions to be emitted on
architectures that do not implement Total Store Ordering (TSO),
such as POWER, ARM, and RISC-V Weak Memory Ordering (RVWMO).
This commit is contained in:
Marko Mäkelä 2021-03-30 09:58:24 +03:00
parent 0df74a0197
commit 8c2e3259c1
7 changed files with 71 additions and 69 deletions

View file

@@ -1,4 +1,4 @@
/* Copyright (c) 2020, MariaDB
/* Copyright (c) 2020, 2021, MariaDB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -41,7 +41,9 @@ public:
Atomic_relaxed(Type val) : m(val) {}
Atomic_relaxed() {}
operator Type() const { return m.load(std::memory_order_relaxed); }
Type load(std::memory_order o= std::memory_order_relaxed) const
{ return m.load(o); }
operator Type() const { return m.load(); }
Type operator=(const Type val)
{ m.store(val, std::memory_order_relaxed); return val; }
Type operator=(const Atomic_relaxed<Type> &rhs) { return *this= Type{rhs}; }

View file

@@ -1559,7 +1559,7 @@ static void log_flush(void *)
fil_flush_file_spaces();
/* Guarantee progress for buf_flush_lists(). */
log_write_up_to(log_sys.get_lsn(), true);
log_buffer_flush_to_disk(true);
log_flush_pending.clear();
}
@@ -1577,15 +1577,17 @@ ulint buf_flush_lists(ulint max_n, lsn_t lsn)
if (n_flush)
return 0;
if (log_sys.get_lsn() > log_sys.get_flushed_lsn())
lsn_t flushed_lsn= log_sys.get_flushed_lsn();
if (log_sys.get_lsn() > flushed_lsn)
{
log_flush_task.wait();
if (log_sys.get_lsn() > log_sys.get_flushed_lsn() &&
flushed_lsn= log_sys.get_flushed_lsn();
if (log_sys.get_lsn() > flushed_lsn &&
!log_flush_pending.test_and_set())
srv_thread_pool->submit_task(&log_flush_task);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
if (UNIV_UNLIKELY(ibuf_debug))
log_write_up_to(log_sys.get_lsn(), true);
log_buffer_flush_to_disk(true);
#endif
}
@@ -1730,7 +1732,7 @@ static bool log_checkpoint()
/** Make a checkpoint. */
ATTRIBUTE_COLD void log_make_checkpoint()
{
buf_flush_wait_flushed(log_sys.get_lsn());
buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire));
while (!log_checkpoint());
}

View file

@@ -1182,7 +1182,7 @@ MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) static
struct
{
/** first request */
Atomic_relaxed<log_flush_request*> start;
std::atomic<log_flush_request*> start;
/** last request */
log_flush_request *end;
/** mutex protecting this object */
@@ -4460,13 +4460,11 @@ innobase_rollback_trx(
/** Invoke commit_checkpoint_notify_ha() on completed log flush requests.
@param pending log_requests.start
@param lsn log_sys.get_flushed_lsn()
@return whether something was notified (and log_requests.mutex released) */
static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
@param lsn log_sys.get_flushed_lsn() */
static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
{
mysql_mutex_assert_owner(&log_requests.mutex);
ut_ad(pending == log_requests.start);
ut_ad(pending == log_requests.start.load(std::memory_order_relaxed));
log_flush_request *entry= pending, *last= nullptr;
/* Process the first requests that have been completed. Since
the list is not necessarily in ascending order of LSN, we may
@@ -4478,12 +4476,15 @@ static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next);
if (!last)
return false;
{
mysql_mutex_unlock(&log_requests.mutex);
return;
}
/* Detach the head of the list that corresponds to persisted log writes. */
log_requests.start= entry;
if (!entry)
log_requests.end= nullptr;
log_requests.end= entry;
log_requests.start.store(entry, std::memory_order_relaxed);
mysql_mutex_unlock(&log_requests.mutex);
/* Now that we have released the mutex, notify the submitters
@@ -4496,21 +4497,16 @@ static bool log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
my_free(entry);
}
while (entry != last);
return true;
}
/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
log writes have been completed. */
void log_flush_notify(lsn_t flush_lsn)
{
if (log_requests.start)
if (auto pending= log_requests.start.load(std::memory_order_acquire))
{
mysql_mutex_lock(&log_requests.mutex);
if (log_flush_request *pending= log_requests.start)
if (log_flush_notify_and_unlock(pending, flush_lsn))
return;
mysql_mutex_unlock(&log_requests.mutex);
log_flush_notify_and_unlock(pending, flush_lsn);
}
}
@@ -4520,8 +4516,9 @@ checkpoint complete when we have flushed the redo log.
If we have already flushed all relevant redo log, we notify immediately.*/
static void innodb_log_flush_request(void *cookie)
{
const lsn_t lsn= log_sys.get_lsn();
lsn_t flush_lsn= log_sys.get_flushed_lsn();
/* Load lsn relaxed after flush_lsn was loaded from the same cache line */
const lsn_t lsn= log_sys.get_lsn();
if (flush_lsn >= lsn)
/* All log is already persistent. */;
@@ -4539,24 +4536,38 @@ static void innodb_log_flush_request(void *cookie)
req->cookie= cookie;
req->lsn= lsn;
log_flush_request *start= nullptr;
mysql_mutex_lock(&log_requests.mutex);
auto old_end= log_requests.end;
log_requests.end= req;
if (old_end)
/* In order to prevent a race condition where log_flush_notify()
would skip a notification, we must update log_requests.start from
nullptr (empty) to the first req using std::memory_order_release. */
if (log_requests.start.compare_exchange_strong(start, req,
std::memory_order_release,
std::memory_order_relaxed))
{
ut_ad(!log_requests.end);
start= req;
/* In case log_flush_notify() executed
log_requests.start.load(std::memory_order_acquire) right before
our successful compare_exchange, we must re-read flush_lsn to
ensure that our request will be notified immediately if applicable. */
flush_lsn= log_sys.get_flushed_lsn();
}
else
{
/* Append the entry to the list. Because we determined req->lsn before
acquiring the mutex, this list may not be ordered by req->lsn,
even though log_flush_notify_and_unlock() assumes so. */
old_end->next= req;
/* This hopefully addresses the hang that was reported in MDEV-24302.
Upon receiving a new request, we will notify old requests of
completion. */
if (log_flush_notify_and_unlock(log_requests.start, flush_lsn))
return;
log_requests.end->next= req;
}
else
log_requests.start= req;
mysql_mutex_unlock(&log_requests.mutex);
log_requests.end= req;
/* This hopefully addresses the hang that was reported in MDEV-24302.
Upon receiving a new request, we will notify old requests of
completion. */
log_flush_notify_and_unlock(start, flush_lsn);
return;
}
else
@@ -18345,16 +18356,17 @@ checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
if (*(my_bool*) save) {
mysql_mutex_unlock(&LOCK_global_system_variables);
while (log_sys.last_checkpoint_lsn
lsn_t lsn;
while (log_sys.last_checkpoint_lsn.load(
std::memory_order_acquire)
+ SIZE_OF_FILE_CHECKPOINT
< log_sys.get_lsn()) {
< (lsn= log_sys.get_lsn(std::memory_order_acquire))) {
log_make_checkpoint();
log_sys.log.flush();
}
dberr_t err = fil_write_flushed_lsn(log_sys.get_lsn());
if (err != DB_SUCCESS) {
if (dberr_t err = fil_write_flushed_lsn(lsn)) {
ib::warn() << "Checkpoint set failed " << err;
}

View file

@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2009, Google Inc.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -87,12 +87,6 @@ log_free_check(void);
@param[in] len requested minimum size in bytes */
void log_buffer_extend(ulong len);
/** Read the current LSN. */
#define log_get_lsn() log_sys.get_lsn()
/** Read the durable LSN */
#define log_get_flush_lsn() log_sys.get_flushed_lsn()
/** Calculate the recommended highest values for lsn - last_checkpoint_lsn
and lsn - buf_pool.get_oldest_modification().
@param[in] file_size requested innodb_log_file_size
@@ -646,13 +640,14 @@ public:
bool is_initialised() const { return m_initialised; }
lsn_t get_lsn() const { return lsn.load(std::memory_order_relaxed); }
void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_relaxed); }
lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const
{ return lsn.load(order); }
void set_lsn(lsn_t lsn) { this->lsn.store(lsn, std::memory_order_release); }
lsn_t get_flushed_lsn() const
{ return flushed_to_disk_lsn.load(std::memory_order_relaxed); }
{ return flushed_to_disk_lsn.load(std::memory_order_acquire); }
void set_flushed_lsn(lsn_t lsn)
{ flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); }
{ flushed_to_disk_lsn.store(lsn, std::memory_order_release); }
bool check_flush_or_checkpoint() const
{

View file

@@ -835,12 +835,10 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
/** write to the log file up to the last log entry.
@param[in] sync whether we want the written log
also to be flushed to disk. */
void
log_buffer_flush_to_disk(
bool sync)
void log_buffer_flush_to_disk(bool sync)
{
ut_ad(!srv_read_only_mode);
log_write_up_to(log_get_lsn(), sync);
ut_ad(!srv_read_only_mode);
log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), sync);
}
/********************************************************************

View file

@@ -1340,14 +1340,7 @@ void srv_monitor_task(void*)
where the lsn seems to decrease at times */
lsn_t new_lsn = log_sys.get_lsn();
if (new_lsn < old_lsn) {
ib::error() << "Old log sequence number " << old_lsn << " was"
<< " greater than the new log sequence number "
<< new_lsn << ". Please submit a bug report to"
" https://jira.mariadb.org/";
ut_ad(0);
}
ut_a(new_lsn >= old_lsn);
old_lsn = new_lsn;
/* Update the statistics collected for deciding LRU

View file

@@ -956,7 +956,7 @@ static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
log_sys.log.flush();
}
ut_ad(flushed_lsn == log_get_lsn());
ut_ad(flushed_lsn == log_sys.get_lsn());
/* Check if the buffer pools are clean. If not
retry till it is clean. */
@@ -1346,7 +1346,7 @@ dberr_t srv_start(bool create_new_db)
/* Suppress the message about
crash recovery. */
flushed_lsn = log_get_lsn();
flushed_lsn = log_sys.get_lsn();
goto file_checked;
}
@@ -1424,7 +1424,7 @@ file_checked:
buf_flush_sync();
flushed_lsn = log_get_lsn();
flushed_lsn = log_sys.get_lsn();
err = fil_write_flushed_lsn(flushed_lsn);
@@ -1600,7 +1600,7 @@ file_checked:
InnoDB files is needed. */
ut_ad(!srv_force_recovery);
ut_ad(recv_no_log_write);
err = fil_write_flushed_lsn(log_get_lsn());
err = fil_write_flushed_lsn(log_sys.get_lsn());
DBUG_ASSERT(!buf_pool.any_io_pending());
log_sys.log.close_file();
if (err == DB_SUCCESS) {