From 080d92a6217a757300b629360cacf625f3c7ea60 Mon Sep 17 00:00:00 2001 From: Pekka Lampio Date: Thu, 27 Nov 2025 16:38:11 +0200 Subject: [PATCH] MDEV-31517 Wrong variable name in the configuration leads Galera to think SST/IST failed, at next restart will request a full SST This patch fixes an unwanted behavior of a Galera cluster node when Server startup fails because of an error in configuration file: after the failure full SST is requested at the next Server startup even though full SST is not needed (MDEV-31517). If Server startup fails because of a configuration error, this patch ensures that Galera state of the failing node remains unchanged. This avoids full SST at the next Server restart. This fix consists of three patches for the following components: 1) Server, 2) WSREP library, 3) Galera. --- .../suite/galera/r/galera_mdev_31517.result | 24 ++++ .../suite/galera/t/galera_mdev_31517.test | 132 ++++++++++++++++++ sql/mysqld.cc | 8 +- wsrep-lib | 2 +- 4 files changed, 164 insertions(+), 2 deletions(-) create mode 100644 mysql-test/suite/galera/r/galera_mdev_31517.result create mode 100644 mysql-test/suite/galera/t/galera_mdev_31517.test diff --git a/mysql-test/suite/galera/r/galera_mdev_31517.result b/mysql-test/suite/galera/r/galera_mdev_31517.result new file mode 100644 index 00000000000..efb3f985622 --- /dev/null +++ b/mysql-test/suite/galera/r/galera_mdev_31517.result @@ -0,0 +1,24 @@ +connection node_2; +connection node_1; +connection node_2; +CALL mtr.add_suppression("unknown variable 'non_existing_variable=ON'"); +CALL mtr.add_suppression("Aborting"); +CALL mtr.add_suppression("sst_received failed: State wait was interrupted"); +CALL mtr.add_suppression("State transfer interrupted, shutting down gracefully"); +connection node_1; +CREATE TABLE t(i INT NOT NULL PRIMARY KEY) ENGINE INNODB; +INSERT INTO t VALUES(1); +connection node_2; +connection node_1; +connection node_2; +connection node_1; +connection node_2; +connection node_1; +connection node_2; 
+Starting server ... +Starting server ... +SET GLOBAL wsrep_mode = DEFAULT; +connection node_1; +DROP TABLE t; +disconnect node_2; +disconnect node_1; diff --git a/mysql-test/suite/galera/t/galera_mdev_31517.test b/mysql-test/suite/galera/t/galera_mdev_31517.test new file mode 100644 index 00000000000..39ceb2db5a1 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_mdev_31517.test @@ -0,0 +1,132 @@ +# +# Test for MDEV-31517: Wrong variable name in the configuration leads +# Galera to think SST/IST failed, at next restart will request a full +# SST. +# +# To reproduce: +# +# 1. Start Galera cluster +# 2. Stop a Node +# 3. Start the node +# 4. Stop a Node +# 5. Add non_existing_variable=ON in the config +# 6. Start the node, this will fail +# 7. Remove non_existing_variable=ON from the config file +# 8. Restart the server +# 9. Observe a full SST happening +# +# This test checks that an IST takes place at Step 9 instead of a full SST. + +# Step 1: Start Galera cluster +--source include/galera_cluster.inc +--source include/have_mariabackup.inc +--echo # Make sure that the test is operating on the right version of galera library. +--let $galera_version=26.4.25 +source ../wsrep/include/check_galera_version.inc; + +# Suppress expected errors and warnings: +--connection node_2 +CALL mtr.add_suppression("unknown variable 'non_existing_variable=ON'"); +CALL mtr.add_suppression("Aborting"); +CALL mtr.add_suppression("sst_received failed: State wait was interrupted"); +CALL mtr.add_suppression("State transfer interrupted, shutting down gracefully"); + +# Count the number of "SST completed" messages in the log file before +# and after testing. To do this we need to save original log file +# before testing: +# +--let TEST_LOG=$MYSQLTEST_VARDIR/log/mysqld.2.err +--perl + use strict; + my $test_log=$ENV{'TEST_LOG'} or die "TEST_LOG not set"; + my $test_log_copy=$test_log . 
'.copy'; + if (-e $test_log_copy) { + unlink $test_log_copy; + } +EOF +--copy_file $TEST_LOG $TEST_LOG.copy + +--connection node_1 +CREATE TABLE t(i INT NOT NULL PRIMARY KEY) ENGINE INNODB; +INSERT INTO t VALUES(1); + +# Step 2: Stop node 2 +--connection node_2 +--source include/shutdown_mysqld.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--source include/wait_condition.inc + +# Step 3: Start node 2 +--connection node_2 +--source include/start_mysqld.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--source include/wait_condition.inc + +# Step 4: Stop node 2 +--connection node_2 +let $MYSQLD_DATADIR= `SELECT @@datadir`; +--source include/shutdown_mysqld.inc + +--connection node_1 +--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size' +--source include/wait_condition.inc + +# Step 5: Add non_existing_variable=ON in the config +--exec cp $MYSQLTEST_VARDIR/my.cnf $MYSQLTEST_VARDIR/my.cnf-orig +--exec echo '[mysqld.2]' >> $MYSQLTEST_VARDIR/my.cnf +--exec echo 'non_existing_variable=ON' >> $MYSQLTEST_VARDIR/my.cnf + +# Step 6: start the stopped node, this will fail +--connection node_2 +--exec cp -p $MYSQLD_DATADIR/grastate.dat $MYSQLD_DATADIR/grastate.dat_before +--echo Starting server ... +--error 1 +--exec $MYSQLD --defaults-group-suffix=.2 --defaults-file=$MYSQLTEST_VARDIR/my.cnf | grep 'non_existing_variable' +--exec cp -p $MYSQLD_DATADIR/grastate.dat $MYSQLD_DATADIR/grastate.dat_after + +# Step 7: remove the wrong variable in the config file +--exec cp $MYSQLTEST_VARDIR/my.cnf-orig $MYSQLTEST_VARDIR/my.cnf + +# Step 8: Start the node +--echo Starting server ... 
+let $restart_noprint=2; +--source include/start_mysqld.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'; +--source include/wait_condition.inc + +--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_ready'; +--source include/wait_condition.inc + +# cleanup +SET GLOBAL wsrep_mode = DEFAULT; + +--connection node_1 +DROP TABLE t; + +# Count the number of "SST completed" messages in the log file during +# test phase - to print the error message if the number of such +# messages in log file increased at the end of the test: +# +--perl + use strict; + my $test_log=$ENV{'TEST_LOG'} or die "TEST_LOG not set"; + my $test_log_copy=$test_log . '.copy'; + open(FILE, $test_log_copy) or die("Unable to open $test_log_copy: $!\n"); + my $initial=grep(/SST completed/gi,<FILE>); + close(FILE); + open(FILE, $test_log) or die("Unable to open $test_log: $!\n"); + my $final=grep(/SST completed/gi,<FILE>); + close(FILE); + if ($final != $initial) { + my $diff=$final-$initial; + print("Full WSREP SST performed $diff times.\n"); + } +EOF +--remove_file $TEST_LOG.copy + +--source include/galera_end.inc diff --git a/sql/mysqld.cc b/sql/mysqld.cc index f7e0100d6bf..0d8008d2df5 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -5361,8 +5361,14 @@ static int init_server_components() #endif if ((ho_error= handle_options(&remaining_argc, &remaining_argv, removed_opts, - mysqld_get_one_option))) + mysqld_get_one_option))) { +#ifdef WITH_WSREP + Wsrep_server_state::instance().disable_node_reset(); +#endif + unireg_abort(ho_error); + } + /* Add back the program name handle_options removes */ remaining_argc++; remaining_argv--; diff --git a/wsrep-lib b/wsrep-lib index 14ce8cab76e..7010f0ab584 160000 --- a/wsrep-lib +++ b/wsrep-lib @@ -1 +1 @@ -Subproject commit 14ce8cab76e9388f7266b91b163a62c654b1d329 +Subproject commit 7010f0ab584ab9cdebb285272a0fb0ff0a5a791d