MDEV-8302: Duplicate key with parallel replication

This bug is essentially another variant of MDEV-7458.

If a transaction conflict caused a deadlock kill of T2 in record_gtid()
during commit, the code would do a rollback _before_ running
rgi->unmark_start_commit(). This creates a race where following transactions
could start too early (before T2 has completed its transaction retry). This
in turn could lead to replication failure, if there was a conflict that
caused eg. duplicate key error or similar.

The fix is to remove these rollbacks (in Query_log_event::do_apply_event()
and Xid_log_event::do_apply_event(). They seem out-of-place; code in
log_event.cc generally does not roll back on error, this is handled higher
up.

In addition, because of the extreme difficulty of reproducing bugs like
MDEV-7458 and MDEV-8302, this patch adds some extra precations to try to
detect (in debug builds) or prevent (in release builds) similar bugs.
ha_rollback_trans() will now call unmark_start_commit() if needed (and
assert in debug build when a caller does rollback without unmark first).

We also add an extra check for thd->killed() so that we avoid doing
mark_start_commit() if we already have a pending deadlock kill.

And we add a missing unmark_start_commit() call in the error case, found by
the above assertion.
This commit is contained in:
Kristian Nielsen 2015-08-04 11:20:03 +02:00
parent a6087e7dc1
commit 9b9c5e890c
5 changed files with 147 additions and 6 deletions

View file

@ -1649,6 +1649,46 @@ include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debg;
SET GLOBAL max_relay_log_size= @old_max;
include/start_slave.inc
*** MDEV-8302: Duplicate key with parallel replication ***
include/stop_slave.inc
/* Inject a small sleep which makes the race easier to hit. */
SET @old_dbug=@@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,inject_mdev8302";
INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5);
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 20000;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t7 ORDER BY a;
a b
1 1
2 2
3 86
4 4
5 5
100 5
101 1
102 2
103 3
104 4
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t7 ORDER BY a;
a b
1 1
2 2
3 86
4 4
5 5
100 5
101 1
102 2
103 3
104 4
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
include/start_slave.inc
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc

View file

@ -2314,6 +2314,60 @@ SET GLOBAL max_relay_log_size= @old_max;
--source include/start_slave.inc
--echo *** MDEV-8302: Duplicate key with parallel replication ***
--connection server_2
--source include/stop_slave.inc
/* Inject a small sleep which makes the race easier to hit. */
SET @old_dbug=@@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,inject_mdev8302";
--connection server_1
INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5);
# Artificially create a bunch of group commits with conflicting transactions.
# The bug happened when T1 and T2 was in one group commit, and T3 was in the
# following group commit. T2 is a DELETE of a row with same primary key as a
# row that T3 inserts. T1 and T2 can conflict, causing T2 to be deadlock
# killed after starting to commit. The bug was that T2 could roll back before
# doing unmark_start_commit(); this could allow T3 to run before the retry
# of T2, causing duplicate key violation.
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 20000;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
--let $n = 100
--disable_query_log
while ($n)
{
eval UPDATE t7 SET b=b+1 WHERE a=100+($n MOD 5);
eval DELETE FROM t7 WHERE a=100+($n MOD 5);
SET @commit_id = @commit_id + 1;
eval INSERT INTO t7 VALUES (100+($n MOD 5), $n);
SET @commit_id = @commit_id + 1;
dec $n;
}
--enable_query_log
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t7 ORDER BY a;
--source include/save_master_gtid.inc
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t7 ORDER BY a;
--source include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
--source include/start_slave.inc
# Clean up.
--connection server_2

View file

@ -1582,6 +1582,24 @@ int ha_rollback_trans(THD *thd, bool all)
DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
trans == &thd->transaction.stmt);
if (is_real_trans)
{
/*
In parallel replication, if we need to rollback during commit, we must
first inform following transactions that we are going to abort our commit
attempt. Otherwise those following transactions can run too early, and
possibly cause replication to fail. See comments in retry_event_group().
There were several bugs with this in the past that were very hard to
track down (MDEV-7458, MDEV-8302). So we add here an assertion for
rollback without signalling following transactions. And in release
builds, we explicitly do the signalling before rolling back.
*/
DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit));
if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)
thd->rgi_slave->unmark_start_commit();
}
if (thd->in_sub_stmt)
{
DBUG_ASSERT(0);

View file

@ -4261,7 +4261,6 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
"mysql", rpl_gtid_slave_state_table_name.str,
errcode,
thd->get_stmt_da()->message());
trans_rollback(thd);
sub_id= 0;
thd->is_slave_error= 1;
goto end;
@ -7351,7 +7350,6 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi)
"%s.%s: %d: %s",
"mysql", rpl_gtid_slave_state_table_name.str, ec,
thd->get_stmt_da()->message());
trans_rollback(thd);
thd->is_slave_error= 1;
return err;
}

View file

@ -226,6 +226,11 @@ static void
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
{
rgi->worker_error= err;
/*
In case we get an error during commit, inform following transactions that
we aborted our commit.
*/
rgi->unmark_start_commit();
rgi->cleanup_context(thd, true);
rgi->rli->abort_slave= true;
rgi->rli->stop_for_until= false;
@ -370,6 +375,7 @@ do_retry:
transaction we deadlocked with will not signal that it started to commit
until after the unmark.
*/
DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);});
rgi->unmark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark");
@ -884,9 +890,24 @@ handle_rpl_parallel_thread(void *arg)
group_ending= is_group_ending(qev->ev, event_type);
if (group_ending && likely(!rgi->worker_error))
{
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
rgi->mark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
/*
Do an extra check for (deadlock) kill here. This helps prevent a
lingering deadlock kill that occured during normal DML processing to
propagate past the mark_start_commit(). If we detect a deadlock only
after mark_start_commit(), we have to unmark, which has at least a
theoretical possibility of leaving a window where it looks like all
transactions in a GCO have started committing, while in fact one
will need to rollback and retry. This is not supposed to be possible
(since there is a deadlock, at least one transaction should be
blocked from reaching commit), but this seems a fragile ensurance,
and there were historically a number of subtle bugs in this area.
*/
if (!thd->killed)
{
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
rgi->mark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
}
}
/*
@ -911,7 +932,17 @@ handle_rpl_parallel_thread(void *arg)
});
if (!err)
#endif
err= rpt_handle_event(qev, rpt);
{
if (thd->check_killed())
{
thd->clear_error();
thd->get_stmt_da()->reset_diagnostics_area();
thd->send_kill_message();
err= 1;
}
else
err= rpt_handle_event(qev, rpt);
}
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",
err= dbug_simulate_tmp_error(rgi, thd););