mariadb/mysql-test/suite/rpl/t/rpl_parallel_sbm.test
Brandon Nesterenko 68938d2b42 MDEV-33500 (part 2): rpl.rpl_parallel_sbm can still fail
The failing test case validates Seconds_Behind_Master for a delayed
slave, while STOP SLAVE is executed during a delay. The test fixes
initially added to the test (commit b04c857596) added a table lock
to ensure a transaction could not finish before validating the
Seconds_Behind_Master field after SLAVE START, but did not address a
possibility that the transaction could finish before running the
STOP SLAVE command, which invalidates the validations for the rest
of the test case. Specifically, this would result in 1) a timeout in
“Waiting for table metadata lock” on the replica, which expects the
transaction to retry after slave restart and hit a lock conflict on
the locked tables (added in b04c857596), and 2) that
Seconds_Behind_Master should have increased, but did not.

The failure can be reproduced by synchronizing the slave to the master
before the MDEV-32265 echo statement (i.e. before the SLAVE STOP).

This patch fixes the test by adding a mechanism to use DEBUG_SYNC to
synchronize a MASTER_DELAY, rather than continually increase the
duration of the delay each time the test fails on buildbot. This is
to ensure that on slow machines, a delay does not pass before the
test gets a chance to validate results. Additionally, it decreases
overall test time because the test can continue immediately after
validation, thereby bypassing the remainder of a full delay for each
transaction.
2024-09-17 06:29:20 -06:00

266 lines
9.8 KiB
Text

#
# Ensure that Seconds_Behind_Master works correctly on the parallel replica.
#
--source include/have_log_bin.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/master-slave.inc
--echo #
--echo # MDEV-29639: Seconds_Behind_Master is incorrect for Delayed, Parallel Replicas
--echo #
# This test ensures that after a delayed parallel slave has idled, i.e.
# executed everything in its relay log, the next event group that the SQL
# thread reads from the relay log will immediately be used in the
# Seconds_Behind_Master. In particular, it ensures that the calculation for
# Seconds_Behind_Master is based on the timestamp of the new transaction,
# rather than the last committed transaction.
#
# Note that the test doesn't actually use the value of MASTER_DELAY, but
# rather uses debug_sync to coordinate the end of the delay. This is to ensure
# that on slow machines, a delay doesn't pass before the test gets a chance to
# validate results. Additionally, it lets us continue testing after validation
# so we don't have to wait out a full delay unnecessarily. The debug_sync point
# is enabled via sql_delay_by_debug_sync, which will delay transactions based
# only on GTID events, so only one synchronization is needed per transaction.
#
--connection slave
--source include/stop_slave.inc
--let $save_dbug= `SELECT @@GLOBAL.debug_dbug`
--let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode`
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
--let $master_delay= 1
--eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos
--source include/start_slave.inc
--connection master
--let insert_ctr= 0
create table t1 (a int);
create table t2 (a int);
--source include/save_master_gtid.inc
--connection slave
--let $slave_sync_method= gtid
--let $num_event_groups= 2
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo #
--echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
--connection master
--echo # Sleep 2 to allow a buffer between events for SBM check
sleep 2;
--let $ts_trx_before_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for transaction to arrive on slave and begin SQL Delay..
set debug_sync= "now WAIT_FOR at_sql_delay";
--echo # Validating SBM is updated on event arrival..
--let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $seconds_since_idling= `SELECT UNIX_TIMESTAMP() - $ts_trx_before_ins`
if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`)
{
--echo # SBM was $sbm_trx1_arrive yet shouldn't have been larger than $seconds_since_idling + 1 (for possible negative clock_diff_with_master)
--die Seconds_Behind_Master should reset after idling
}
--echo # ..done
--echo # MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently
--echo # delaying a transaction; then when the reciprocal START SLAVE occurs,
--echo # if the event is still to be delayed, SBM should resume accordingly
--connection server_2
--echo # Ensure the kill from STOP SLAVE will be received before continuing the
--echo # SQL thread
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
--send STOP SLAVE
--connection slave
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
set debug_sync= "now SIGNAL continue_sql_thread";
--connection server_2
--reap
--source include/wait_for_slave_to_stop.inc
set debug_sync="RESET";
--echo # Lock t1 on slave to ensure the event can't finish (and thereby update
--echo # Seconds_Behind_Master) so slow running servers don't accidentally
--echo # catch up to the master before checking SBM.
--connection server_2
LOCK TABLES t1 WRITE;
--source include/start_slave.inc
--connection slave
--echo # SQL delay has no impact for the rest of the test case, so ignore it
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Waiting for replica to get blocked by the table lock
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
--source include/wait_condition.inc
--echo # Sleeping 1s to increment SBM
sleep 1;
--echo # Ensuring Seconds_Behind_Master increases after sleeping..
--let $sbm_trx1_after_1s_sleep= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
if (`SELECT $sbm_trx1_after_1s_sleep <= $sbm_trx1_arrive`)
{
--echo # ..failed
--die Seconds_Behind_Master did not increase after sleeping, but should have
}
--echo # ..done
--connection server_2
UNLOCK TABLES;
--source include/sync_with_master_gtid.inc
--echo #
--echo # Pt 2) If the worker threads have not entered an idle state, ensure
--echo # following events do not update SBM
--connection slave
LOCK TABLES t1 WRITE;
--connection master
--echo # Sleep 2 to allow a buffer between events for SBM check
sleep 2;
--let $ts_trxpt2_before_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--echo # Sleep 3 to create gap between events
sleep 3;
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--let $ts_trx_after_ins= `SELECT UNIX_TIMESTAMP()`
--source include/save_master_pos.inc
--connection slave
--echo # Wait for first transaction to complete SQL delay and begin execution..
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker';
--source include/wait_condition.inc
--echo # Wait for second transaction to complete SQL delay..
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
--let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()`
if (`SELECT $sbm_after_trx_no_idle < $timestamp_trxpt2_arrive - $ts_trx_after_ins - 1`)
{
--let $cmpv= `SELECT $timestamp_trxpt2_arrive - $ts_trx_after_ins`
--echo # SBM $sbm_after_trx_no_idle was more recent than time since last transaction ($cmpv seconds)
--die Seconds_Behind_Master should not have used second transaction timestamp
}
--let $seconds_since_idling= `SELECT ($timestamp_trxpt2_arrive - $ts_trxpt2_before_ins)`
--echo # ..and that SBM wasn't calculated using prior committed transactions
if (`SELECT $sbm_after_trx_no_idle > ($seconds_since_idling + 1)`)
{
--echo # SBM was $sbm_after_trx_no_idle yet shouldn't have been larger than $seconds_since_idling + 1 (for possible negative clock_diff_with_master)
--die Seconds_Behind_Master calculation should not have used prior committed transaction
}
--echo # ..done
--connection slave
UNLOCK TABLES;
--source include/sync_with_master.inc
--echo # Cleanup
--source include/stop_slave.inc
set debug_sync= "RESET";
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
--eval CHANGE MASTER TO master_delay=0
--source include/start_slave.inc
--echo #
--echo # MDEV-30619: Parallel Slave SQL Thread Can Update Seconds_Behind_Master with Active Workers
--echo #
# This test ensures that a parallel slave will not update
# Seconds_Behind_Master after the SQL Thread has idled if the worker threads
# are still executing events. To test this, two events are executed on the
# primary with $sleep seconds in-between them. Once the second event begins
# execution on the replica, Seconds_Behind_Master is queried to ensure it
# reflects the value of the first transaction, rather than the second.
--connection slave
--echo # Ensure the replica is fully idle before starting transactions
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Slave has read all relay log%';
--source include/wait_condition.inc
--let $wait_condition= SELECT count(*)=2 FROM information_schema.processlist WHERE state LIKE 'Waiting for work from SQL thread';
--source include/wait_condition.inc
--echo # Lock t1 on slave so the first received transaction does not complete/commit
LOCK TABLES t1 WRITE;
--connection master
--let $ts_t1_before_master_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for first transaction to begin..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
--source include/wait_condition.inc
--let $sbm_1= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--connection master
--let $sleep = 2
--echo # Sleep $sleep sec to create a gap between events
sleep $sleep;
INSERT INTO t2 VALUES (1);
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for second transaction to begin..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%';
--source include/wait_condition.inc
--let $sbm_2= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
if (`SELECT $sbm_1 + $sleep > $sbm_2`)
{
--echo # Seconds_Behind_Masters: $sbm_1 $sbm_2_0
--die Two successive Seconds_Behind_Master timestamps must be separated by the sleep parameter value or greater
}
--connection slave
UNLOCK TABLES;
--source include/sync_with_master_gtid.inc
--echo #
--echo # Cleanup
--connection master
DROP TABLE t1, t2;
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
--eval set @@GLOBAL.debug_dbug= "$save_dbug"
--evalp set @@GLOBAL.slave_parallel_mode= "$save_parallel_mode"
--source include/start_slave.inc
--source include/rpl_end.inc
--echo # End of rpl_parallel_sbm.test