mirror of
https://github.com/MariaDB/server.git
synced 2025-01-16 12:02:42 +01:00
MDEV-8302: Duplicate key with parallel replication
This bug is essentially another variant of MDEV-7458. If a transaction conflict caused a deadlock kill of T2 in record_gtid() during commit, the code would do a rollback _before_ running rgi->unmark_start_commit(). This creates a race where following transactions could start too early (before T2 has completed its transaction retry). This in turn could lead to replication failure, if there was a conflict that caused eg. duplicate key error or similar. The fix is to remove these rollbacks (in Query_log_event::do_apply_event() and Xid_log_event::do_apply_event(). They seem out-of-place; code in log_event.cc generally does not roll back on error, this is handled higher up. In addition, because of the extreme difficulty of reproducing bugs like MDEV-7458 and MDEV-8302, this patch adds some extra precations to try to detect (in debug builds) or prevent (in release builds) similar bugs. ha_rollback_trans() will now call unmark_start_commit() if needed (and assert in debug build when a caller does rollback without unmark first). We also add an extra check for thd->killed() so that we avoid doing mark_start_commit() if we already have a pending deadlock kill. And we add a missing unmark_start_commit() call in the error case, found by the above assertion.
This commit is contained in:
parent
a6087e7dc1
commit
9b9c5e890c
5 changed files with 147 additions and 6 deletions
|
@ -1649,6 +1649,46 @@ include/stop_slave.inc
|
|||
SET GLOBAL debug_dbug= @old_debg;
|
||||
SET GLOBAL max_relay_log_size= @old_max;
|
||||
include/start_slave.inc
|
||||
*** MDEV-8302: Duplicate key with parallel replication ***
|
||||
include/stop_slave.inc
|
||||
/* Inject a small sleep which makes the race easier to hit. */
|
||||
SET @old_dbug=@@GLOBAL.debug_dbug;
|
||||
SET GLOBAL debug_dbug="+d,inject_mdev8302";
|
||||
INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5);
|
||||
SET @old_dbug= @@SESSION.debug_dbug;
|
||||
SET @commit_id= 20000;
|
||||
SET SESSION debug_dbug="+d,binlog_force_commit_id";
|
||||
SET SESSION debug_dbug=@old_dbug;
|
||||
SELECT * FROM t7 ORDER BY a;
|
||||
a b
|
||||
1 1
|
||||
2 2
|
||||
3 86
|
||||
4 4
|
||||
5 5
|
||||
100 5
|
||||
101 1
|
||||
102 2
|
||||
103 3
|
||||
104 4
|
||||
include/save_master_gtid.inc
|
||||
include/start_slave.inc
|
||||
include/sync_with_master_gtid.inc
|
||||
SELECT * FROM t7 ORDER BY a;
|
||||
a b
|
||||
1 1
|
||||
2 2
|
||||
3 86
|
||||
4 4
|
||||
5 5
|
||||
100 5
|
||||
101 1
|
||||
102 2
|
||||
103 3
|
||||
104 4
|
||||
include/stop_slave.inc
|
||||
SET GLOBAL debug_dbug=@old_dbug;
|
||||
include/start_slave.inc
|
||||
include/stop_slave.inc
|
||||
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
|
||||
include/start_slave.inc
|
||||
|
|
|
@ -2314,6 +2314,60 @@ SET GLOBAL max_relay_log_size= @old_max;
|
|||
--source include/start_slave.inc
|
||||
|
||||
|
||||
--echo *** MDEV-8302: Duplicate key with parallel replication ***
|
||||
|
||||
--connection server_2
|
||||
--source include/stop_slave.inc
|
||||
/* Inject a small sleep which makes the race easier to hit. */
|
||||
SET @old_dbug=@@GLOBAL.debug_dbug;
|
||||
SET GLOBAL debug_dbug="+d,inject_mdev8302";
|
||||
|
||||
|
||||
--connection server_1
|
||||
INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5);
|
||||
|
||||
# Artificially create a bunch of group commits with conflicting transactions.
|
||||
# The bug happened when T1 and T2 was in one group commit, and T3 was in the
|
||||
# following group commit. T2 is a DELETE of a row with same primary key as a
|
||||
# row that T3 inserts. T1 and T2 can conflict, causing T2 to be deadlock
|
||||
# killed after starting to commit. The bug was that T2 could roll back before
|
||||
# doing unmark_start_commit(); this could allow T3 to run before the retry
|
||||
# of T2, causing duplicate key violation.
|
||||
|
||||
SET @old_dbug= @@SESSION.debug_dbug;
|
||||
SET @commit_id= 20000;
|
||||
SET SESSION debug_dbug="+d,binlog_force_commit_id";
|
||||
|
||||
--let $n = 100
|
||||
--disable_query_log
|
||||
while ($n)
|
||||
{
|
||||
eval UPDATE t7 SET b=b+1 WHERE a=100+($n MOD 5);
|
||||
eval DELETE FROM t7 WHERE a=100+($n MOD 5);
|
||||
|
||||
SET @commit_id = @commit_id + 1;
|
||||
eval INSERT INTO t7 VALUES (100+($n MOD 5), $n);
|
||||
SET @commit_id = @commit_id + 1;
|
||||
dec $n;
|
||||
}
|
||||
--enable_query_log
|
||||
SET SESSION debug_dbug=@old_dbug;
|
||||
|
||||
|
||||
SELECT * FROM t7 ORDER BY a;
|
||||
--source include/save_master_gtid.inc
|
||||
|
||||
|
||||
--connection server_2
|
||||
--source include/start_slave.inc
|
||||
--source include/sync_with_master_gtid.inc
|
||||
SELECT * FROM t7 ORDER BY a;
|
||||
|
||||
--source include/stop_slave.inc
|
||||
SET GLOBAL debug_dbug=@old_dbug;
|
||||
--source include/start_slave.inc
|
||||
|
||||
|
||||
|
||||
# Clean up.
|
||||
--connection server_2
|
||||
|
|
|
@ -1582,6 +1582,24 @@ int ha_rollback_trans(THD *thd, bool all)
|
|||
DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
|
||||
trans == &thd->transaction.stmt);
|
||||
|
||||
if (is_real_trans)
|
||||
{
|
||||
/*
|
||||
In parallel replication, if we need to rollback during commit, we must
|
||||
first inform following transactions that we are going to abort our commit
|
||||
attempt. Otherwise those following transactions can run too early, and
|
||||
possibly cause replication to fail. See comments in retry_event_group().
|
||||
|
||||
There were several bugs with this in the past that were very hard to
|
||||
track down (MDEV-7458, MDEV-8302). So we add here an assertion for
|
||||
rollback without signalling following transactions. And in release
|
||||
builds, we explicitly do the signalling before rolling back.
|
||||
*/
|
||||
DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit));
|
||||
if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)
|
||||
thd->rgi_slave->unmark_start_commit();
|
||||
}
|
||||
|
||||
if (thd->in_sub_stmt)
|
||||
{
|
||||
DBUG_ASSERT(0);
|
||||
|
|
|
@ -4261,7 +4261,6 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
|
|||
"mysql", rpl_gtid_slave_state_table_name.str,
|
||||
errcode,
|
||||
thd->get_stmt_da()->message());
|
||||
trans_rollback(thd);
|
||||
sub_id= 0;
|
||||
thd->is_slave_error= 1;
|
||||
goto end;
|
||||
|
@ -7351,7 +7350,6 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi)
|
|||
"%s.%s: %d: %s",
|
||||
"mysql", rpl_gtid_slave_state_table_name.str, ec,
|
||||
thd->get_stmt_da()->message());
|
||||
trans_rollback(thd);
|
||||
thd->is_slave_error= 1;
|
||||
return err;
|
||||
}
|
||||
|
|
|
@ -226,6 +226,11 @@ static void
|
|||
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
|
||||
{
|
||||
rgi->worker_error= err;
|
||||
/*
|
||||
In case we get an error during commit, inform following transactions that
|
||||
we aborted our commit.
|
||||
*/
|
||||
rgi->unmark_start_commit();
|
||||
rgi->cleanup_context(thd, true);
|
||||
rgi->rli->abort_slave= true;
|
||||
rgi->rli->stop_for_until= false;
|
||||
|
@ -370,6 +375,7 @@ do_retry:
|
|||
transaction we deadlocked with will not signal that it started to commit
|
||||
until after the unmark.
|
||||
*/
|
||||
DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);});
|
||||
rgi->unmark_start_commit();
|
||||
DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark");
|
||||
|
||||
|
@ -884,9 +890,24 @@ handle_rpl_parallel_thread(void *arg)
|
|||
group_ending= is_group_ending(qev->ev, event_type);
|
||||
if (group_ending && likely(!rgi->worker_error))
|
||||
{
|
||||
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
|
||||
rgi->mark_start_commit();
|
||||
DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
|
||||
/*
|
||||
Do an extra check for (deadlock) kill here. This helps prevent a
|
||||
lingering deadlock kill that occured during normal DML processing to
|
||||
propagate past the mark_start_commit(). If we detect a deadlock only
|
||||
after mark_start_commit(), we have to unmark, which has at least a
|
||||
theoretical possibility of leaving a window where it looks like all
|
||||
transactions in a GCO have started committing, while in fact one
|
||||
will need to rollback and retry. This is not supposed to be possible
|
||||
(since there is a deadlock, at least one transaction should be
|
||||
blocked from reaching commit), but this seems a fragile ensurance,
|
||||
and there were historically a number of subtle bugs in this area.
|
||||
*/
|
||||
if (!thd->killed)
|
||||
{
|
||||
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
|
||||
rgi->mark_start_commit();
|
||||
DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -911,7 +932,17 @@ handle_rpl_parallel_thread(void *arg)
|
|||
});
|
||||
if (!err)
|
||||
#endif
|
||||
err= rpt_handle_event(qev, rpt);
|
||||
{
|
||||
if (thd->check_killed())
|
||||
{
|
||||
thd->clear_error();
|
||||
thd->get_stmt_da()->reset_diagnostics_area();
|
||||
thd->send_kill_message();
|
||||
err= 1;
|
||||
}
|
||||
else
|
||||
err= rpt_handle_event(qev, rpt);
|
||||
}
|
||||
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
|
||||
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",
|
||||
err= dbug_simulate_tmp_error(rgi, thd););
|
||||
|
|
Loading…
Reference in a new issue