mariadb/mysql-test/suite/rpl/t/rpl_parallel_sbm.test
Brandon Nesterenko 01fafd45eb MDEV-35939: rpl.rpl_parallel_sbm: "Slave_last_event_time is not equal to Master_last_event_time"
The test rpl.rpl_parallel_sbm would fail because
Slave_last_event_time would not match Master_last_event_time after
syncing with the master (i.e. via sync_with_master_gtid.inc). This
happens because the timing statistics are updated after updating
gtid_slave_pos. That is, gtid_slave_pos is updated with the
transaction commit, whereas the timing statistics are updated when
the relay log position is updated (i.e. after the commit). This is
by design. For the test, this means there is a small amount of time
after sync_with_master_gtid.inc where the SHOW SLAVE STATUS output
lags behind the data state of the slave. This would cause the test
to fail if comparing Slave_last_event_time to Master_last_event_time
during this period, because Slave_last_event_time would not yet
reflect the latest committed transaction.

The fix is to add a wait_condition before the comparison to wait for
the SHOW SLAVE STATUS statistics to be updated. The actual check is
to wait for Seconds_Behind_Master == 0, because the slave should be
idle at this point, which forces the value to be 0.

The test failure can be reproduced with the following patch:

diff --git a/sql/rpl_gtid.cc b/sql/rpl_gtid.cc
index db5c26b6237..b346a32b573 100644
--- a/sql/rpl_gtid.cc
+++ b/sql/rpl_gtid.cc
@@ -300,6 +300,7 @@ rpl_slave_state::update(uint32 domain_id, uint32 server_id, uint64 sub_id,
   mysql_mutex_lock(&LOCK_slave_state);
   res= update_nolock(domain_id, server_id, sub_id, seq_no, hton, rgi);
   mysql_mutex_unlock(&LOCK_slave_state);
+  sleep(1);
   return res;
 }
2025-01-26 09:17:23 -07:00

317 lines
12 KiB
Text

#
# Ensure that Seconds_Behind_Master works correctly on the parallel replica.
#
--source include/have_log_bin.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/master-slave.inc
--echo #
--echo # MDEV-29639: Seconds_Behind_Master is incorrect for Delayed, Parallel Replicas
--echo #
# This test ensures that after a delayed parallel slave has idled, i.e.
# executed everything in its relay log, the next event group that the SQL
# thread reads from the relay log will immediately be used in the
# Seconds_Behind_Master. In particular, it ensures that the calculation for
# Seconds_Behind_Master is based on the timestamp of the new transaction,
# rather than the last committed transaction.
#
# Note that the test doesn't actually use the value of MASTER_DELAY, but
# rather uses debug_sync to coordinate the end of the delay. This is to ensure
# that on slow machines, a delay doesn't pass before the test gets a chance to
# validate results. Additionally, it lets us continue testing after validation
# so we don't have to wait out a full delay unnecessarily. The debug_sync point
# is enabled via sql_delay_by_debug_sync, which will delay transactions based
# only on GTID events, so only one synchronization is needed per transaction.
#
--connection slave
--source include/stop_slave.inc
--let $save_dbug= `SELECT @@GLOBAL.debug_dbug`
--let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode`
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
--let $master_delay= 1
--eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos
--source include/start_slave.inc
--connection master
--let insert_ctr= 0
create table t1 (a int);
create table t2 (a int);
--source include/save_master_gtid.inc
--connection slave
--let $slave_sync_method= gtid
--let $num_event_groups= 2
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo #
--echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
--connection master
--echo # Sleep 2 to allow a buffer between events for SBM check
sleep 2;
--let $ts_trx_before_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for transaction to arrive on slave and begin SQL Delay..
set debug_sync= "now WAIT_FOR at_sql_delay";
--echo # Validating SBM is updated on event arrival..
--let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $seconds_since_idling= `SELECT UNIX_TIMESTAMP() - $ts_trx_before_ins`
if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`)
{
--echo # SBM was $sbm_trx1_arrive yet shouldn't have been larger than $seconds_since_idling + 1 (for possible negative clock_diff_with_master)
--die Seconds_Behind_Master should reset after idling
}
--echo # ..done
--echo # MDEV-33856: New definition for Seconds_Behind_Master
--echo # Validating Master_last_event_time is updated on event arrival..
--let $mle_time_trx1_arrive= query_get_value(SHOW ALL SLAVES STATUS, Master_last_event_time, 1)
--let $mle_time_trx1_arrive_unix= `SELECT truncate(unix_timestamp("$mle_time_trx1_arrive"), 0)`
if (`SELECT $mle_time_trx1_arrive_unix < ($ts_trx_before_ins - 1)`)
{
--echo # Master_last_event_time: $mle_time_trx1_arrive_unix ($mle_time_trx1_arrive)
--die Master_last_event_time was not updated for delayed replica at event arrival time
}
--echo # ..done
--echo # Validating Slave_last_event_time is still from the last transaction..
# Note we infer Slave_last_event_time via Master_Slave_time_diff
--let $time_diff_trx1_arrive= query_get_value(SHOW ALL SLAVES STATUS, Master_Slave_time_diff, 1)
if ($time_diff_trx1_arrive < 2)
{
--let $slave_time_trx1_arrive= query_get_value(SHOW ALL SLAVES STATUS, Slave_last_event_time, 1)
--let $slave_time_trx1_arrive_unix= `SELECT truncate(unix_timestamp("$slave_time_trx1_arrive"), 0)`
--echo # Slave_last_event_time: $slave_time_trx1_arrive_unix ($slave_time_trx1_arrive)
--echo # Master_Slave_time_diff: $time_diff_trx1_arrive
--die Slave_last_event_time is too recent, should not be less than 2 seconds, ie. from sleeping
}
--echo # ..done
--echo # MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently
--echo # delaying a transaction; then when the reciprocal START SLAVE occurs,
--echo # if the event is still to be delayed, SBM should resume accordingly
--connection server_2
--echo # Ensure the kill from STOP SLAVE will be received before continuing the
--echo # SQL thread
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
--send STOP SLAVE
--connection slave
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
set debug_sync= "now SIGNAL continue_sql_thread";
--connection server_2
--reap
--source include/wait_for_slave_to_stop.inc
set debug_sync="RESET";
--echo # Lock t1 on slave to ensure the event can't finish (and thereby update
--echo # Seconds_Behind_Master) so slow running servers don't accidentally
--echo # catch up to the master before checking SBM.
--connection server_2
LOCK TABLES t1 WRITE;
--source include/start_slave.inc
--connection slave
--echo # SQL delay has no impact for the rest of the test case, so ignore it
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Waiting for replica to get blocked by the table lock
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
--source include/wait_condition.inc
--echo # Sleeping 1s to increment SBM
sleep 1;
--echo # Ensuring Seconds_Behind_Master increases after sleeping..
--let $sbm_trx1_after_1s_sleep= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
if (`SELECT $sbm_trx1_after_1s_sleep <= $sbm_trx1_arrive`)
{
--echo # ..failed
--die Seconds_Behind_Master did not increase after sleeping, but should have
}
--echo # ..done
--connection server_2
UNLOCK TABLES;
--source include/sync_with_master_gtid.inc
--echo # MDEV-33856: New definition for Seconds_Behind_Master
# The update of slave timing statistics (Seconds_Behind_Master,
# Slave_last_event_time, etc) lags slightly behind the update of
# gtid_slave_pos (we just called sync_with_master_gtid), so wait for SBM==0 to
# indicate the statistics have been updated
--echo # Waiting for slave timing statistics to update ..
--let $wait_condition= SELECT count(*) FROM information_schema.slave_status WHERE Seconds_Behind_Master=0
--source include/wait_condition.inc
--echo # Ensuring Slave_last_event_time is now up-to-date once event is executed
--let $slave_time_trx1_commit= query_get_value(SHOW ALL SLAVES STATUS, Slave_last_event_time, 1)
--let $slave_time_trx1_commit_unix= `SELECT truncate(unix_timestamp("$slave_time_trx1_commit"),0)`
if ($slave_time_trx1_commit_unix != $mle_time_trx1_arrive_unix)
{
--echo # Slave_last_event_time: $slave_time_trx1_commit_unix ($slave_time_trx1_commit)
--echo # Master_last_event_time: $mle_time_trx1_arrive_unix ($mle_time_trx1_arrive)
--die Slave_last_event_time is not equal to Master_last_event_time despite being up-to-date
}
--let $time_diff_trx1_commit= query_get_value(SHOW ALL SLAVES STATUS, Master_Slave_time_diff, 1)
if ($time_diff_trx1_commit != 0)
{
--echo # Master_Slave_time_diff: $time_diff_trx1_commit
--die Master_Slave_time_diff should be 0, as slave is up-to-date
}
--echo # ..done
--echo #
--echo # Pt 2) If the worker threads have not entered an idle state, ensure
--echo # following events do not update SBM
--connection slave
LOCK TABLES t1 WRITE;
--connection master
--echo # Sleep 2 to allow a buffer between events for SBM check
sleep 2;
--let $ts_trxpt2_before_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--echo # Sleep 3 to create gap between events
sleep 3;
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--let $ts_trx_after_ins= `SELECT UNIX_TIMESTAMP()`
--source include/save_master_pos.inc
--connection slave
--echo # Wait for first transaction to complete SQL delay and begin execution..
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker';
--source include/wait_condition.inc
--echo # Wait for second transaction to complete SQL delay..
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
--let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()`
if (`SELECT $sbm_after_trx_no_idle < $timestamp_trxpt2_arrive - $ts_trx_after_ins - 1`)
{
--let $cmpv= `SELECT $timestamp_trxpt2_arrive - $ts_trx_after_ins`
--echo # SBM $sbm_after_trx_no_idle was more recent than time since last transaction ($cmpv seconds)
--die Seconds_Behind_Master should not have used second transaction timestamp
}
--let $seconds_since_idling= `SELECT ($timestamp_trxpt2_arrive - $ts_trxpt2_before_ins)`
--echo # ..and that SBM wasn't calculated using prior committed transactions
if (`SELECT $sbm_after_trx_no_idle > ($seconds_since_idling + 1)`)
{
--echo # SBM was $sbm_after_trx_no_idle yet shouldn't have been larger than $seconds_since_idling + 1 (for possible negative clock_diff_with_master)
--die Seconds_Behind_Master calculation should not have used prior committed transaction
}
--echo # ..done
--connection slave
UNLOCK TABLES;
--source include/sync_with_master.inc
--echo # Cleanup
--source include/stop_slave.inc
set debug_sync= "RESET";
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
--eval CHANGE MASTER TO master_delay=0
--source include/start_slave.inc
--echo #
--echo # MDEV-30619: Parallel Slave SQL Thread Can Update Seconds_Behind_Master with Active Workers
--echo #
# This test ensures that a parallel slave will not update
# Seconds_Behind_Master after the SQL Thread has idled if the worker threads
# are still executing events. To test this, two events are executed on the
# primary with $sleep seconds in-between them. Once the second event begins
# execution on the replica, Seconds_Behind_Master is queried to ensure it
# reflects the value of the first transaction, rather than the second.
--connection slave
--echo # Ensure the replica is fully idle before starting transactions
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Slave has read all relay log%';
--source include/wait_condition.inc
--let $wait_condition= SELECT count(*)=2 FROM information_schema.processlist WHERE state LIKE 'Waiting for work from SQL thread';
--source include/wait_condition.inc
--echo # Lock t1 on slave so the first received transaction does not complete/commit
LOCK TABLES t1 WRITE;
--connection master
--let $ts_t1_before_master_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for first transaction to begin..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
--source include/wait_condition.inc
--let $sbm_1= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--connection master
--let $sleep = 2
--echo # Sleep $sleep sec to create a gap between events
sleep $sleep;
INSERT INTO t2 VALUES (1);
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for second transaction to begin..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%';
--source include/wait_condition.inc
--let $sbm_2= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
if (`SELECT $sbm_1 + $sleep > $sbm_2`)
{
--echo # Seconds_Behind_Masters: $sbm_1 $sbm_2_0
--die Two successive Seconds_Behind_Master timestamps must be separated by the sleep parameter value or greater
}
--connection slave
UNLOCK TABLES;
--source include/sync_with_master_gtid.inc
--echo #
--echo # Cleanup
--connection master
DROP TABLE t1, t2;
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
--eval set @@GLOBAL.debug_dbug= "$save_dbug"
--evalp set @@GLOBAL.slave_parallel_mode= "$save_parallel_mode"
--source include/start_slave.inc
--source include/rpl_end.inc
--echo # End of rpl_parallel_sbm.test