MDEV-14357: rpl.rpl_domain_id_filter_io_crash failed in buildbot with wrong result

A race condition with the SQL thread, where depending on if it was
killed before or after it had executed the fake/generated IGN_GTIDS
Gtid_list_log_event, may or may not update gtid_slave_pos with the
position of the ignored events. Then, the slave would be restarted
while resetting IGNORE_DOMAIN_IDS to be empty, which would result in
the slave requesting different starting locations, depending on
whether or not gtid_slave_pos was updated. And, because previously
ignored events could now be requested and executed (no longer
ignored), their presence would fail the test.

This patch fixes this in two ways. First, to use GTID positions for
synchronization rather than binlog file positions. Then second, to
synchronize the SQL thread’s gtid_slave_pos with the ignored events
before killing the SQL thread.

To consistently reproduce the test failure, the following patch can
be applied:

diff --git a/sql/log_event_server.cc b/sql/log_event_server.cc
index f51f5b7deec..de62233acff 100644
--- a/sql/log_event_server.cc
+++ b/sql/log_event_server.cc
@@ -3686,6 +3686,12 @@ Gtid_list_log_event::do_apply_event(rpl_group_info *rgi)
     void *hton= NULL;
     uint32 i;

+    sleep(1);
+    if (rli->sql_driver_thd->killed || rli->abort_slave)
+    {
+      return 0;
+    }
+

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
Brandon Nesterenko 2024-02-08 09:55:02 -07:00
parent 8ec12e0d6d
commit ee89558312
2 changed files with 75 additions and 18 deletions

View file

@ -11,6 +11,7 @@ SELECT * FROM t1;
i
1
connection slave;
include/save_master_gtid.inc
connection slave;
call mtr.add_suppression("Slave I/O: Relay log write failure: could not queue event from master.*");
# Case 0 : Start slave with IGNORE_DOMAIN_IDS=(), then restart
@ -24,6 +25,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) :
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) :
SET @saved_dbug = @@GLOBAL.debug_dbug;
@ -33,6 +35,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(2);
INSERT INTO t1 VALUES(3);
COMMIT;
include/save_master_gtid.inc
SELECT * FROM t1;
i
1
@ -46,6 +49,7 @@ i
SET @@global.debug_dbug=@saved_dbug;
START SLAVE io_thread;
include/wait_for_slave_io_to_start.inc
include/sync_with_master_gtid.inc
SELECT * FROM t1;
i
1
@ -59,6 +63,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) :
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) : 1
SET @@global.debug_dbug="d,kill_slave_io_before_commit";
@ -67,6 +72,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(4);
INSERT INTO t1 VALUES(5);
COMMIT;
include/save_master_gtid.inc
SELECT * FROM t1;
i
1
@ -84,6 +90,7 @@ i
SET @@global.debug_dbug=@saved_dbug;
START SLAVE io_thread;
include/wait_for_slave_io_to_start.inc
include/sync_with_master_gtid.inc
SELECT * FROM t1;
i
1
@ -97,6 +104,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) : 1
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) :
SET @@global.debug_dbug="d,kill_slave_io_before_commit";
@ -114,6 +122,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(10);
INSERT INTO t1 VALUES(11);
COMMIT;
include/save_master_gtid.inc
SELECT * FROM t1;
i
1
@ -140,6 +149,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) :
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) : 1
SELECT * FROM t1;
@ -157,6 +167,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) : 1
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) : 1
SET @@global.debug_dbug="d,kill_slave_io_before_commit";
@ -166,6 +177,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(12);
INSERT INTO t1 VALUES(13);
COMMIT;
include/save_master_gtid.inc
START TRANSACTION;
INSERT INTO t1 VALUES(14);
INSERT INTO t1 VALUES(15);
@ -204,11 +216,16 @@ i
10
11
SET @@global.debug_dbug=@saved_dbug;
include/sync_with_master_gtid.inc
include/stop_slave_sql.inc
DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) : 1
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
connection master;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) :
SELECT * FROM t1;
@ -230,6 +247,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) :
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) : 1
SET @@global.debug_dbug="d,kill_slave_io_after_2_events";
@ -239,6 +257,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(18);
INSERT INTO t1 VALUES(19);
COMMIT;
include/save_master_gtid.inc
START TRANSACTION;
INSERT INTO t1 VALUES(20);
INSERT INTO t1 VALUES(21);
@ -287,11 +306,16 @@ i
16
17
SET @@global.debug_dbug=@saved_dbug;
include/sync_with_master_gtid.inc
include/stop_slave_sql.inc
DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) : 1
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
connection master;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) :
SELECT * FROM t1;
@ -317,6 +341,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) :
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) :
SET @@global.debug_dbug="d,kill_slave_io_after_2_events";
@ -335,6 +360,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(28);
INSERT INTO t1 VALUES(29);
COMMIT;
include/save_master_gtid.inc
SELECT * FROM t1;
i
1
@ -389,6 +415,7 @@ DO_DOMAIN_IDS (BEFORE) :
IGNORE_DOMAIN_IDS (BEFORE) :
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
include/start_slave.inc
include/sync_with_master_gtid.inc
DO_DOMAIN_IDS (AFTER) :
IGNORE_DOMAIN_IDS (AFTER) : 1
SELECT * FROM t1;

View file

@ -9,6 +9,7 @@ CREATE TABLE t1(i INT) ENGINE=INNODB;
INSERT INTO t1 VALUES(1);
SELECT * FROM t1;
sync_slave_with_master;
--source include/save_master_gtid.inc
connection slave;
@ -28,7 +29,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -44,7 +45,7 @@ START TRANSACTION;
INSERT INTO t1 VALUES(2);
INSERT INTO t1 VALUES(3);
COMMIT;
save_master_pos;
--source include/save_master_gtid.inc
SELECT * FROM t1;
connection slave;
@ -55,7 +56,7 @@ SET @@global.debug_dbug=@saved_dbug;
START SLAVE io_thread;
--source include/wait_for_slave_io_to_start.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
SELECT * FROM t1;
--echo # Case 1 : Start slave with IGNORE_DOMAIN_IDS=(1), then restart
@ -70,7 +71,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -86,7 +87,7 @@ INSERT INTO t1 VALUES(4);
INSERT INTO t1 VALUES(5);
COMMIT;
save_master_pos;
--source include/save_master_gtid.inc
SELECT * FROM t1;
connection slave;
@ -97,7 +98,7 @@ SET @@global.debug_dbug=@saved_dbug;
START SLAVE io_thread;
--source include/wait_for_slave_io_to_start.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
SELECT * FROM t1;
--echo # Case 2 : Start slave with IGNORE_DOMAIN_IDS=(), then restart
@ -112,7 +113,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -140,7 +141,7 @@ INSERT INTO t1 VALUES(10);
INSERT INTO t1 VALUES(11);
COMMIT;
save_master_pos;
--source include/save_master_gtid.inc
SELECT * FROM t1;
connection slave;
@ -157,7 +158,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -178,7 +179,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -196,6 +197,15 @@ INSERT INTO t1 VALUES(12);
INSERT INTO t1 VALUES(13);
COMMIT; # IO thread gets killed here.
# MDEV-14357
# As the prior transaction will be ignored on slave because its domain id is
# ignored, the replica's gtid_slave_pos will be updated to have seen it,
# despite its eventual failure to queue the whole transaction to the relay log.
# So for test consistency, we need to synchronize the SQL thread with this
# position; otherwise, when restarting the server after resetting
# IGNORE_DOMAIN_IDS, we will re-fetch this event and execute it.
--source include/save_master_gtid.inc
START TRANSACTION;
INSERT INTO t1 VALUES(14);
INSERT INTO t1 VALUES(15);
@ -207,7 +217,6 @@ INSERT INTO t1 VALUES(16);
INSERT INTO t1 VALUES(17);
COMMIT;
save_master_pos;
SELECT * FROM t1;
connection slave;
@ -217,6 +226,11 @@ SELECT * FROM t1;
SET @@global.debug_dbug=@saved_dbug;
# MDEV-14357
# Ensure the SQL thread is updated with the GTID of the ignored transaction
# so we don't fetch it and execute it after restarting without any ignored
# domain ids.
--source include/sync_with_master_gtid.inc
--source include/stop_slave_sql.inc
let $do_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -224,8 +238,12 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
--echo IGNORE_DOMAIN_IDS (BEFORE) : $ignore_domain_ids_before
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
--connection master
--source include/save_master_gtid.inc
--connection slave
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -246,7 +264,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -264,6 +282,11 @@ INSERT INTO t1 VALUES(18);
INSERT INTO t1 VALUES(19); # IO thread gets killed here.
COMMIT;
# MDEV-14357
# Synchronize gtid_slave_pos with the ignored event. See prior comments about
# MDEV-14357 for details.
--source include/save_master_gtid.inc
START TRANSACTION;
INSERT INTO t1 VALUES(20);
INSERT INTO t1 VALUES(21);
@ -275,7 +298,6 @@ INSERT INTO t1 VALUES(22);
INSERT INTO t1 VALUES(23);
COMMIT;
save_master_pos;
SELECT * FROM t1;
connection slave;
@ -285,6 +307,10 @@ SELECT * FROM t1;
SET @@global.debug_dbug=@saved_dbug;
# MDEV-14357
# Synchronize gtid_slave_pos with the ignored event. See prior comments about
# MDEV-14357 for details.
--source include/sync_with_master_gtid.inc
--source include/stop_slave_sql.inc
let $do_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -292,8 +318,12 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
--echo IGNORE_DOMAIN_IDS (BEFORE) : $ignore_domain_ids_before
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
--connection master
--source include/save_master_gtid.inc
--connection slave
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -314,7 +344,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);
@ -343,7 +373,7 @@ INSERT INTO t1 VALUES(28);
INSERT INTO t1 VALUES(29);
COMMIT;
save_master_pos;
--source include/save_master_gtid.inc
SELECT * FROM t1;
connection slave;
@ -361,7 +391,7 @@ let $ignore_domain_ids_before= query_get_value(SHOW SLAVE STATUS, Replicate_Igno
CHANGE MASTER TO IGNORE_DOMAIN_IDS=(1), MASTER_USE_GTID=slave_pos;
--source include/start_slave.inc
sync_with_master;
--source include/sync_with_master_gtid.inc
let $do_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Do_Domain_Ids, 1);
let $ignore_domain_ids_after= query_get_value(SHOW SLAVE STATUS, Replicate_Ignore_Domain_Ids, 1);