MDEV-31038: Parallel Replication Breaks if XA PREPARE Fails Updating Slave GTID State

If a replica failed to update the GTID slave state when committing
an XA PREPARE, the replica would retry the transaction and get an
out-of-order GTID error. This is because the commit phase of an XA
PREPARE is bifurcated. That is, first, the prepare is handled by the
relevant storage engines. Then second, the GTID slave state is
updated as a separate autocommit transaction. If the second phase
fails, and the transaction is retried, then the same transaction is
attempted to be committed again, resulting in a GTID out-of-order
error.

This patch fixes this error by immediately stopping the slave and
reporting the appropriate error. That is, there was logic to bypass
the error when updating the GTID slave state table if the underlying
error is allowed for retry on a parallel slave. This patch adds a
parameter to disallow the error bypass, thereby forcing the error
state to still happen.

Reviewed By
============
Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
Brandon Nesterenko 2023-04-18 13:22:43 -06:00
parent 1d74927c58
commit 31f09e36c1
5 changed files with 208 additions and 22 deletions

View file

@ -0,0 +1,51 @@
include/master-slave.inc
[connection master]
connection slave;
include/stop_slave.inc
change master to master_use_gtid=slave_pos;
set @@global.slave_parallel_threads= 4;
set @@global.slave_parallel_mode= optimistic;
set @@global.gtid_strict_mode=ON;
set sql_log_bin= 0;
alter table mysql.gtid_slave_pos engine=innodb;
call mtr.add_suppression("Deadlock found.*");
set sql_log_bin= 1;
include/start_slave.inc
connection master;
create table t1 (a int primary key, b int) engine=innodb;
insert t1 values (1,1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @@global.innodb_lock_wait_timeout= 1;
connection master;
set @@session.gtid_seq_no=100;
xa start '1';
update t1 set b=b+10 where a=1;
xa end '1';
xa prepare '1';
xa commit '1';
include/save_master_gtid.inc
connection slave;
connection slave1;
BEGIN;
SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=100 FOR UPDATE;
domain_id sub_id server_id seq_no
connection slave;
include/start_slave.inc
include/wait_for_slave_sql_error.inc [errno=1942,1213]
connection slave1;
ROLLBACK;
# Cleanup
connection master;
drop table t1;
connection slave;
include/stop_slave.inc
set @@global.gtid_slave_pos= "0-1-100";
set @@global.slave_parallel_threads= 0;
set @@global.gtid_strict_mode= 0;
set @@global.innodb_lock_wait_timeout= 50;
include/start_slave.inc
include/rpl_end.inc
# End of rpl_xa_prepare_gtid_fail.test

View file

@ -0,0 +1,106 @@
#
# When handling the replication of an XA PREPARE, the commit phase is
# bifurcated. First, the prepare is handled by the relevant storage engines.
# Then second,the GTID slave state is updated as a separate autocommit
# transaction. If the second stage fails, i.e. we are unable to update the
# GTID slave state, then the slave should immediately quit in error, without
# retry.
#
# This tests validates the above behavior by simulating a deadlock on the
# GTID slave state table during the second part of XA PREPARE's commit, to
# ensure that the appropriate error is reported and the transaction was never
# retried.
#
#
# References
# MDEV-31038: Parallel Replication Breaks if XA PREPARE Fails Updating Slave
# GTID State
#
source include/master-slave.inc;
source include/have_binlog_format_row.inc;
source include/have_innodb.inc;
--connection slave
--source include/stop_slave.inc
--let $save_par_thds= `SELECT @@global.slave_parallel_threads`
--let $save_strict_mode= `SELECT @@global.gtid_strict_mode`
--let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout`
change master to master_use_gtid=slave_pos;
set @@global.slave_parallel_threads= 4;
set @@global.slave_parallel_mode= optimistic;
set @@global.gtid_strict_mode=ON;
set sql_log_bin= 0;
alter table mysql.gtid_slave_pos engine=innodb;
call mtr.add_suppression("Deadlock found.*");
set sql_log_bin= 1;
--source include/start_slave.inc
--connection master
let $datadir= `select @@datadir`;
create table t1 (a int primary key, b int) engine=innodb;
insert t1 values (1,1);
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
set @@global.innodb_lock_wait_timeout= 1;
--let $retried_tx_initial= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
--connection master
--let $gtid_domain_id=`SELECT @@GLOBAL.gtid_domain_id`
--let $gtid_server_id=`SELECT @@GLOBAL.server_id`
--let $xap_seq_no=100
--eval set @@session.gtid_seq_no=$xap_seq_no
xa start '1';
update t1 set b=b+10 where a=1;
xa end '1';
xa prepare '1';
--let $new_gtid= `SELECT @@global.gtid_binlog_pos`
xa commit '1';
--source include/save_master_gtid.inc
--connection slave
#--eval set statement sql_log_bin=0 for insert into mysql.gtid_slave_pos values ($gtid_domain_id, 5, $gtid_server_id, $xap_seq_no)
--connection slave1
BEGIN;
--eval SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=$xap_seq_no FOR UPDATE
--connection slave
--source include/start_slave.inc
--let $slave_sql_errno= 1942,1213
--source include/wait_for_slave_sql_error.inc
--let $retried_tx_test= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
if ($retried_tx_initial != $retried_tx_test)
{
--echo Transaction was retried when a failed XA PREPARE slave GTID update should lead to immediate slave stop without retry
--die Transaction was retried when a failed XA PREPARE slave GTID update should lead to immediate slave stop without retry
}
--connection slave1
ROLLBACK;
--echo # Cleanup
--connection master
drop table t1;
--connection slave
--source include/stop_slave.inc
--eval set @@global.gtid_slave_pos= "$new_gtid"
--eval set @@global.slave_parallel_threads= $save_par_thds
--eval set @@global.gtid_strict_mode= $save_strict_mode
--eval set @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout
--source include/start_slave.inc
--source include/rpl_end.inc
--echo # End of rpl_xa_prepare_gtid_fail.test

View file

@ -2057,8 +2057,11 @@ int ha_rollback_trans(THD *thd, bool all)
rollback without signalling following transactions. And in release
builds, we explicitly do the signalling before rolling back.
*/
DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit) ||
thd->transaction->xid_state.is_explicit_XA());
DBUG_ASSERT(
!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit) ||
(thd->transaction->xid_state.is_explicit_XA() ||
(thd->rgi_slave->gtid_ev_flags2 & Gtid_log_event::FL_PREPARED_XA)));
if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)
thd->rgi_slave->unmark_start_commit();
}

View file

@ -3042,7 +3042,7 @@ private:
virtual int do_commit()= 0;
virtual int do_apply_event(rpl_group_info *rgi);
int do_record_gtid(THD *thd, rpl_group_info *rgi, bool in_trans,
void **out_hton);
void **out_hton, bool force_err= false);
enum_skip_reason do_shall_skip(rpl_group_info *rgi);
virtual const char* get_query()= 0;
#endif

View file

@ -152,6 +152,30 @@ is_parallel_retry_error(rpl_group_info *rgi, int err)
return has_temporary_error(rgi->thd);
}
/**
Accumulate a Diagnostics_area's errors and warnings into an output buffer
@param errbuf The output buffer to write error messages
@param errbuf_size The size of the output buffer
@param da The Diagnostics_area to check for errors
*/
static void inline aggregate_da_errors(char *errbuf, size_t errbuf_size,
Diagnostics_area *da)
{
const char *errbuf_end= errbuf + errbuf_size;
char *slider;
Diagnostics_area::Sql_condition_iterator it= da->sql_conditions();
const Sql_condition *err;
size_t len;
for (err= it++, slider= errbuf; err && slider < errbuf_end - 1;
slider += len, err= it++)
{
len= my_snprintf(slider, errbuf_end - slider,
" %s, Error_code: %d;", err->get_message_text(),
err->get_sql_errno());
}
}
/**
Error reporting facility for Rows_log_event::do_apply_event
@ -172,13 +196,8 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error,
const char *log_name, my_off_t pos)
{
const char *handler_error= (ha_error ? HA_ERR(ha_error) : NULL);
char buff[MAX_SLAVE_ERRMSG], *slider;
const char *buff_end= buff + sizeof(buff);
size_t len;
Diagnostics_area::Sql_condition_iterator it=
thd->get_stmt_da()->sql_conditions();
char buff[MAX_SLAVE_ERRMSG];
Relay_log_info const *rli= rgi->rli;
const Sql_condition *err;
buff[0]= 0;
int errcode= thd->is_error() ? thd->get_stmt_da()->sql_errno() : 0;
@ -191,13 +210,7 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error,
if (is_parallel_retry_error(rgi, errcode))
return;
for (err= it++, slider= buff; err && slider < buff_end - 1;
slider += len, err= it++)
{
len= my_snprintf(slider, buff_end - slider,
" %s, Error_code: %d;", err->get_message_text(),
err->get_sql_errno());
}
aggregate_da_errors(buff, sizeof(buff), thd->get_stmt_da());
if (ha_error != 0)
rli->report(level, errcode, rgi->gtid_info(),
@ -3893,7 +3906,8 @@ bool slave_execute_deferred_events(THD *thd)
#if defined(HAVE_REPLICATION)
int Xid_apply_log_event::do_record_gtid(THD *thd, rpl_group_info *rgi,
bool in_trans, void **out_hton)
bool in_trans, void **out_hton,
bool force_err)
{
int err= 0;
Relay_log_info const *rli= rgi->rli;
@ -3908,14 +3922,26 @@ int Xid_apply_log_event::do_record_gtid(THD *thd, rpl_group_info *rgi,
int ec= thd->get_stmt_da()->sql_errno();
/*
Do not report an error if this is really a kill due to a deadlock.
In this case, the transaction will be re-tried instead.
In this case, the transaction will be re-tried instead. Unless force_err
is set, as in the case of XA PREPARE, as the GTID state is updated as a
separate transaction, and if that fails, we should not retry but exit in
error immediately.
*/
if (!is_parallel_retry_error(rgi, ec))
if (!is_parallel_retry_error(rgi, ec) || force_err)
{
char buff[MAX_SLAVE_ERRMSG];
buff[0]= 0;
aggregate_da_errors(buff, sizeof(buff), thd->get_stmt_da());
if (force_err)
thd->clear_error();
rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE, rgi->gtid_info(),
"Error during XID COMMIT: failed to update GTID state in "
"%s.%s: %d: %s",
"%s.%s: %d: %s the event's master log %s, end_log_pos %llu",
"mysql", rpl_gtid_slave_state_table_name.str, ec,
thd->get_stmt_da()->message());
buff, RPL_LOG_NAME, log_pos);
}
thd->is_slave_error= 1;
}
@ -3989,7 +4015,7 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi)
{
DBUG_ASSERT(!thd->transaction->xid_state.is_explicit_XA());
if ((err= do_record_gtid(thd, rgi, false, &hton)))
if ((err= do_record_gtid(thd, rgi, false, &hton, true)))
return err;
}