MDEV-35110 Deadlock on Replica during BACKUP STAGE BLOCK_COMMIT on XA transactions

This is an extension of MDEV-30423 "Deadlock on Replica during BACKUP
STAGE BLOCK_COMMIT on XA transactions"

The original commit in MDEV-30423 was not complete as some usage in XA of
MDL_BACKUP_COMMIT locks did not set thd->backup_commit_lock.
This is required to be set when using parallel replication.

Fixed by ensuring that all usage of BACKUP_COMMIT lock i XA is uniform and
all sets thd->backup_commit_lock. I also changed all locks to be
MDL_EXPLICIT to keep also that part uniform.

A regression test is added.
This commit is contained in:
Monty 2024-10-07 18:11:26 +03:00 committed by Andrei Elkin
parent 8c7786e7d5
commit 066f920484
4 changed files with 168 additions and 68 deletions

View file

@ -0,0 +1,42 @@
include/master-slave.inc
[connection master]
connection master;
CREATE TABLE t (a INT) ENGINE = innodb;
connection slave;
include/stop_slave.inc
SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads;
SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode;
SET @@global.slave_parallel_threads= 2;
SET @@global.slave_parallel_mode = 'optimistic';
connection master;
# MDEV-35110
SET @@gtid_seq_no=100;
insert into t set a=1;
xa start 'x';
insert into t set a=2;
xa end 'x';
xa prepare 'x';
connection slave;
SET @@global.debug_dbug="+d,hold_worker_on_schedule";
start slave;
connection slave1;
backup stage start;
backup stage block_commit;
connection slave;
SET debug_sync = 'now SIGNAL continue_worker';
SET debug_sync = RESET;
connection slave1;
backup stage end;
connection master;
xa rollback 'x';
connection slave;
# Clean up.
connection slave;
include/stop_slave.inc
SET @@global.debug_dbug="";
SET @@global.slave_parallel_threads= @old_parallel_threads;
SET @@global.slave_parallel_mode = @old_parallel_mode;
include/start_slave.inc
connection server_1;
DROP TABLE t;
include/rpl_end.inc

View file

@ -0,0 +1,64 @@
# Verify deadlock between XA-PREPARE and BACKUP on the optimistic slave
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/have_innodb.inc
# The test is not format specific, MIXED is required to optimize testing time
--source include/have_binlog_format_mixed.inc
--source include/master-slave.inc
--connection master
CREATE TABLE t (a INT) ENGINE = innodb;
--sync_slave_with_master
--source include/stop_slave.inc
SET @old_parallel_threads= @@GLOBAL.slave_parallel_threads;
SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode;
SET @@global.slave_parallel_threads= 2;
SET @@global.slave_parallel_mode = 'optimistic';
--connection master
--echo # MDEV-35110
SET @@gtid_seq_no=100;
insert into t set a=1;
xa start 'x';
insert into t set a=2;
xa end 'x';
xa prepare 'x';
--connection slave
SET @@global.debug_dbug="+d,hold_worker_on_schedule";
start slave;
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit"
--source include/wait_condition.inc
--connection slave1
backup stage start;
--send backup stage block_commit
--connection slave
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for backup lock"
SET debug_sync = 'now SIGNAL continue_worker';
SET debug_sync = RESET;
--connection slave1
reap;
backup stage end;
--connection master
xa rollback 'x';
--sync_slave_with_master
--echo # Clean up.
--connection slave
--source include/stop_slave.inc
SET @@global.debug_dbug="";
SET @@global.slave_parallel_threads= @old_parallel_threads;
SET @@global.slave_parallel_mode = @old_parallel_mode;
--source include/start_slave.inc
--connection server_1
DROP TABLE t;
--source include/rpl_end.inc

View file

@ -1781,7 +1781,13 @@ int ha_commit_trans(THD *thd, bool all)
DBUG_PRINT("info", ("is_real_trans: %d rw_trans: %d rw_ha_count: %d",
is_real_trans, rw_trans, rw_ha_count));
if (rw_trans)
/*
backup_commit_lock may have already been set.
This can happen in case of spider that does xa_commit() by
calling ha_commit_trans() from spader_commit().
*/
if (rw_trans && !thd->backup_commit_lock)
{
/*
Acquire a metadata lock which will ensure that COMMIT is blocked
@ -2052,8 +2058,8 @@ end:
not needed.
*/
thd->mdl_context.release_lock(mdl_backup.ticket);
thd->backup_commit_lock= 0;
}
thd->backup_commit_lock= 0;
#ifdef WITH_WSREP
if (wsrep_is_active(thd) && is_real_trans && !error &&
(rw_ha_count == 0 || all) &&

120
sql/xa.cc
View file

@ -507,6 +507,40 @@ bool trans_xa_end(THD *thd)
}
/*
Get the BACKUP_COMMIT lock for the duration of the XA.
The metadata lock which will ensure that COMMIT is blocked
by active FLUSH TABLES WITH READ LOCK (and vice versa COMMIT in
progress blocks FTWRL) and also by MDL_BACKUP_WAIT_COMMIT.
We allow FLUSHer to COMMIT; we assume FLUSHer knows what it does.
Note that the function sets thd->backup_lock on sucess. The caller needs
to reset thd->backup_commit_lock before returning!
*/
static bool trans_xa_get_backup_lock(THD *thd, MDL_request *mdl_request)
{
DBUG_ASSERT(thd->backup_commit_lock == 0);
MDL_REQUEST_INIT(mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
MDL_EXPLICIT);
if (thd->mdl_context.acquire_lock(mdl_request,
thd->variables.lock_wait_timeout))
return 1;
thd->backup_commit_lock= mdl_request;
return 0;
}
static inline void trans_xa_release_backup_lock(THD *thd)
{
if (thd->backup_commit_lock)
{
thd->mdl_context.release_lock(thd->backup_commit_lock->ticket);
thd->backup_commit_lock= 0;
}
}
/**
Put a XA transaction in the PREPARED state.
@ -529,22 +563,15 @@ bool trans_xa_prepare(THD *thd)
my_error(ER_XAER_NOTA, MYF(0));
else
{
/*
Acquire metadata lock which will ensure that COMMIT is blocked
by active FLUSH TABLES WITH READ LOCK (and vice versa COMMIT in
progress blocks FTWRL).
We allow FLUSHer to COMMIT; we assume FLUSHer knows what it does.
*/
MDL_request mdl_request;
MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
MDL_STATEMENT);
if (thd->mdl_context.acquire_lock(&mdl_request,
thd->variables.lock_wait_timeout) ||
if (trans_xa_get_backup_lock(thd, &mdl_request) ||
ha_prepare(thd))
{
if (!mdl_request.ticket)
{
/* Failed to get the backup lock */
ha_rollback_trans(thd, TRUE);
}
thd->variables.option_bits&= ~(OPTION_BEGIN | OPTION_KEEP_LOG);
thd->transaction->all.reset();
thd->server_status&=
@ -572,6 +599,7 @@ bool trans_xa_prepare(THD *thd)
res= thd->variables.pseudo_slave_mode || thd->slave_thread ?
slave_applier_reset_xa_trans(thd) : 0;
}
trans_xa_release_backup_lock(thd);
}
DBUG_RETURN(res);
@ -635,19 +663,8 @@ bool trans_xa_commit(THD *thd)
res= 1;
goto _end_external_xid;
}
res= xa_trans_rolled_back(xs);
/*
Acquire metadata lock which will ensure that COMMIT is blocked
by active FLUSH TABLES WITH READ LOCK (and vice versa COMMIT in
progress blocks FTWRL).
We allow FLUSHer to COMMIT; we assume FLUSHer knows what it does.
*/
MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
MDL_EXPLICIT);
if (thd->mdl_context.acquire_lock(&mdl_request,
thd->variables.lock_wait_timeout))
if (trans_xa_get_backup_lock(thd, &mdl_request))
{
/*
We can't rollback an XA transaction on lock failure due to
@ -659,14 +676,11 @@ bool trans_xa_commit(THD *thd)
res= true;
goto _end_external_xid;
}
else
{
thd->backup_commit_lock= &mdl_request;
}
DBUG_ASSERT(!xid_state.xid_cache_element);
xid_state.xid_cache_element= xs;
ha_commit_or_rollback_by_xid(thd->lex->xid, !res);
if (!res && thd->is_error())
{
// hton completion error retains xs/xid in the cache,
@ -682,11 +696,7 @@ bool trans_xa_commit(THD *thd)
res= res || thd->is_error();
if (!xid_deleted)
xs->acquired_to_recovered();
if (mdl_request.ticket)
{
thd->mdl_context.release_lock(mdl_request.ticket);
thd->backup_commit_lock= 0;
}
trans_xa_release_backup_lock(thd);
}
else
my_error(ER_XAER_NOTA, MYF(0));
@ -709,7 +719,8 @@ bool trans_xa_commit(THD *thd)
if ((res= MY_TEST(r)))
my_error(r == 1 ? ER_XA_RBROLLBACK : ER_XAER_RMERR, MYF(0));
}
else if (thd->transaction->xid_state.xid_cache_element->xa_state == XA_PREPARED)
else if (thd->transaction->xid_state.xid_cache_element->xa_state ==
XA_PREPARED)
{
MDL_request mdl_request;
if (thd->lex->xa_opt != XA_NONE)
@ -718,18 +729,7 @@ bool trans_xa_commit(THD *thd)
DBUG_RETURN(TRUE);
}
/*
Acquire metadata lock which will ensure that COMMIT is blocked
by active FLUSH TABLES WITH READ LOCK (and vice versa COMMIT in
progress blocks FTWRL).
We allow FLUSHer to COMMIT; we assume FLUSHer knows what it does.
*/
MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
MDL_TRANSACTION);
if (thd->mdl_context.acquire_lock(&mdl_request,
thd->variables.lock_wait_timeout))
if (trans_xa_get_backup_lock(thd, &mdl_request))
{
/*
We can't rollback an XA transaction on lock failure due to
@ -756,6 +756,7 @@ bool trans_xa_commit(THD *thd)
}
thd->m_transaction_psi= NULL;
trans_xa_release_backup_lock(thd);
}
}
else
@ -793,7 +794,8 @@ bool trans_xa_commit(THD *thd)
bool trans_xa_rollback(THD *thd)
{
XID_STATE &xid_state= thd->transaction->xid_state;
MDL_request mdl_request;
bool error;
DBUG_ENTER("trans_xa_rollback");
if (!xid_state.is_explicit_XA() ||
@ -814,7 +816,6 @@ bool trans_xa_rollback(THD *thd)
{
bool res;
bool xid_deleted= false;
MDL_request mdl_request;
bool rw_trans= (xs->rm_error != ER_XA_RBROLLBACK);
if (rw_trans && thd->is_read_only_ctx())
@ -824,10 +825,7 @@ bool trans_xa_rollback(THD *thd)
goto _end_external_xid;
}
MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
MDL_EXPLICIT);
if (thd->mdl_context.acquire_lock(&mdl_request,
thd->variables.lock_wait_timeout))
if (trans_xa_get_backup_lock(thd, &mdl_request))
{
/*
We can't rollback an XA transaction on lock failure due to
@ -838,10 +836,6 @@ bool trans_xa_rollback(THD *thd)
goto _end_external_xid;
}
else
{
thd->backup_commit_lock= &mdl_request;
}
res= xa_trans_rolled_back(xs);
DBUG_ASSERT(!xid_state.xid_cache_element);
@ -858,11 +852,7 @@ bool trans_xa_rollback(THD *thd)
xid_state.xid_cache_element= 0;
if (!xid_deleted)
xs->acquired_to_recovered();
if (mdl_request.ticket)
{
thd->mdl_context.release_lock(mdl_request.ticket);
thd->backup_commit_lock= 0;
}
trans_xa_release_backup_lock(thd);
}
else
my_error(ER_XAER_NOTA, MYF(0));
@ -879,11 +869,7 @@ bool trans_xa_rollback(THD *thd)
DBUG_RETURN(TRUE);
}
MDL_request mdl_request;
MDL_REQUEST_INIT(&mdl_request, MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT,
MDL_STATEMENT);
if (thd->mdl_context.acquire_lock(&mdl_request,
thd->variables.lock_wait_timeout))
if (trans_xa_get_backup_lock(thd, &mdl_request))
{
/*
We can't rollback an XA transaction on lock failure due to
@ -894,7 +880,9 @@ bool trans_xa_rollback(THD *thd)
DBUG_RETURN(true);
}
DBUG_RETURN(xa_trans_force_rollback(thd));
error= xa_trans_force_rollback(thd);
trans_xa_release_backup_lock(thd);
DBUG_RETURN(error);
}