MDEV-31517 Wrong variable name in the configuration leads Galera to

think SST/IST failed, at next restart will request a full SST

This patch fixes an unwanted behavior of a Galera cluster node when
Server startup fails because of an error in the configuration file: after
the failure, a full SST is requested at the next Server startup even
though a full SST is not needed (MDEV-31517).

If Server startup fails because of a configuration error, this patch
ensures that Galera state of the failing node remains unchanged. This
avoids full SST at the next Server restart.

This fix consists of three patches for the following components:

1) Server,
2) WSREP library,
3) Galera.
This commit is contained in:
Pekka Lampio 2025-11-27 16:38:11 +02:00 committed by Jan Lindström
commit 080d92a621
4 changed files with 164 additions and 2 deletions

View file

@ -0,0 +1,24 @@
connection node_2;
connection node_1;
connection node_2;
CALL mtr.add_suppression("unknown variable 'non_existing_variable=ON'");
CALL mtr.add_suppression("Aborting");
CALL mtr.add_suppression("sst_received failed: State wait was interrupted");
CALL mtr.add_suppression("State transfer interrupted, shutting down gracefully");
connection node_1;
CREATE TABLE t(i INT NOT NULL PRIMARY KEY) ENGINE INNODB;
INSERT INTO t VALUES(1);
connection node_2;
connection node_1;
connection node_2;
connection node_1;
connection node_2;
connection node_1;
connection node_2;
Starting server ...
Starting server ...
SET GLOBAL wsrep_mode = DEFAULT;
connection node_1;
DROP TABLE t;
disconnect node_2;
disconnect node_1;

View file

@ -0,0 +1,132 @@
#
# Test for MDEV-31517: Wrong variable name in the configuration leads
# Galera to think SST/IST failed, at next restart will request a full
# SST.
#
# To reproduce:
#
# 1. Start Galera cluster
# 2. Stop a Node
# 3. Start the node
# 4. Stop a Node
# 5. Add non_existing_variable=ON in the config
# 6. Start the node, this will fail
# 7. Remove non_existing_variable=ON from the config file
# 8. Restart the server
# 9. Observe a full SST happening
#
# This test checks that an IST takes place at Step 9 instead of a full SST.
# Step 1: Start Galera cluster
--source include/galera_cluster.inc
--source include/have_mariabackup.inc
# NOTE(review): the --echo below writes a line to the test output, so the
# recorded result file is expected to contain it - TODO confirm.
--echo # Make sure that the test is operating on the right version of galera library.
# $galera_version is set for check_galera_version.inc, sourced right after.
--let $galera_version=26.4.25
source ../wsrep/include/check_galera_version.inc;
# Suppress expected errors and warnings:
# (registered on node_2, the node whose startup is made to fail in Step 6)
--connection node_2
CALL mtr.add_suppression("unknown variable 'non_existing_variable=ON'");
CALL mtr.add_suppression("Aborting");
CALL mtr.add_suppression("sst_received failed: State wait was interrupted");
CALL mtr.add_suppression("State transfer interrupted, shutting down gracefully");
# Count the number of "SST completed" messages in the log file before
# and after testing. To do this we need to save original log file
# before testing:
#
# TEST_LOG is assigned without a leading '$' and is read back through %ENV
# by the perl blocks of this test.
--let TEST_LOG=$MYSQLTEST_VARDIR/log/mysqld.2.err
--perl
use strict;
my $test_log=$ENV{'TEST_LOG'} or die "TEST_LOG not set";
my $test_log_copy=$test_log . '.copy';
# Drop a stale copy if one exists, before --copy_file below creates a new one.
if (-e $test_log_copy) {
unlink $test_log_copy;
}
EOF
# Snapshot of node 2's error log as it is before the test steps run.
--copy_file $TEST_LOG $TEST_LOG.copy
# Create a table with one row on node 1 before node 2 is taken down.
--connection node_1
CREATE TABLE t(i INT NOT NULL PRIMARY KEY) ENGINE INNODB;
INSERT INTO t VALUES(1);
# Step 2: Stop node 2
--connection node_2
--source include/shutdown_mysqld.inc
# Wait until node 1 reports a cluster size of 1, i.e. node 2 has left.
--connection node_1
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc
# Step 3: Start node 2
--connection node_2
--source include/start_mysqld.inc
# Wait until node 1 reports a cluster size of 2 again.
--connection node_1
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc
# Step 4: Stop node 2
--connection node_2
# Remember node 2's data directory while the server can still be queried;
# Step 6 uses it to locate grastate.dat.
let $MYSQLD_DATADIR= `SELECT @@datadir`;
--source include/shutdown_mysqld.inc
# Wait until node 1 reports a cluster size of 1 again.
--connection node_1
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc
# Step 5: Add non_existing_variable=ON in the config
# Keep a pristine copy of my.cnf so that Step 7 can restore it.
--exec cp $MYSQLTEST_VARDIR/my.cnf $MYSQLTEST_VARDIR/my.cnf-orig
--exec echo '[mysqld.2]' >> $MYSQLTEST_VARDIR/my.cnf
--exec echo 'non_existing_variable=ON' >> $MYSQLTEST_VARDIR/my.cnf
# Step 6: start the stopped node, this will fail
--connection node_2
# Snapshot grastate.dat before and after the failed start. The copies are
# not compared by this test; they are only available for inspection.
--exec cp -p $MYSQLD_DATADIR/grastate.dat $MYSQLD_DATADIR/grastate.dat_before
--echo Starting server ...
# The exit status of the pipeline is the one of grep; 1 means grep found no
# matching line on mysqld's standard output (presumably the message goes to
# the error log instead - TODO confirm).
--error 1
--exec $MYSQLD --defaults-group-suffix=.2 --defaults-file=$MYSQLTEST_VARDIR/my.cnf | grep 'non_existing_variable'
--exec cp -p $MYSQLD_DATADIR/grastate.dat $MYSQLD_DATADIR/grastate.dat_after
# Step 7: remove the wrong variable in the config file
--exec cp $MYSQLTEST_VARDIR/my.cnf-orig $MYSQLTEST_VARDIR/my.cnf
# The backup copy has served its purpose; do not leave it behind in vardir.
--remove_file $MYSQLTEST_VARDIR/my.cnf-orig
# Step 8: Start the node
--echo Starting server ...
let $restart_noprint=2;
--source include/start_mysqld.inc
# Wait (on node 2) until the cluster has two members again and the node is
# ready. No trailing ';' here: with the '--let' form the rest of the line,
# including a ';', would become part of the query text.
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc
--let $wait_condition = SELECT VARIABLE_VALUE = 'ON' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_ready'
--source include/wait_condition.inc
# Remove the grastate.dat snapshots taken in Step 6. A missing file is
# tolerated (e.g. if the data directory was replaced while rejoining).
--error 0,1
--remove_file $MYSQLD_DATADIR/grastate.dat_before
--error 0,1
--remove_file $MYSQLD_DATADIR/grastate.dat_after
# cleanup
SET GLOBAL wsrep_mode = DEFAULT;
--connection node_1
DROP TABLE t;
# Count the number of "SST completed" messages in the log file during
# test phase - to print the error message if the number of such
# messages in log file increased at the end of the test:
#
# Nothing is printed when the counts match, so any output from this block
# shows up as a difference against the recorded result.
--perl
use strict;
my $test_log=$ENV{'TEST_LOG'} or die "TEST_LOG not set";
my $test_log_copy=$test_log . '.copy';

# Return the number of lines in file $path that contain "SST completed"
# (case-insensitive). Dies if the file cannot be opened.
sub count_sst_completed {
  my ($path) = @_;
  # Three-argument open with a lexical handle: the path is never interpreted
  # as an open mode, and no global bareword filehandle is involved.
  open(my $fh, '<', $path) or die("Unable to open $path: $!\n");
  my $count = grep(/SST completed/i, <$fh>);
  close($fh);
  return $count;
}

# Log snapshot taken before the test steps vs. the log as it is now.
my $initial = count_sst_completed($test_log_copy);
my $final = count_sst_completed($test_log);
if ($final != $initial) {
  my $diff = $final - $initial;
  print("Full WSREP SST performed $diff times.\n");
}
EOF
--remove_file $TEST_LOG.copy
--source include/galera_end.inc

View file

@ -5361,8 +5361,14 @@ static int init_server_components()
#endif
if ((ho_error= handle_options(&remaining_argc, &remaining_argv, removed_opts,
mysqld_get_one_option)))
mysqld_get_one_option))) {
#ifdef WITH_WSREP
Wsrep_server_state::instance().disable_node_reset();
#endif
unireg_abort(ho_error);
}
/* Add back the program name handle_options removes */
remaining_argc++;
remaining_argv--;

@ -1 +1 @@
Subproject commit 14ce8cab76e9388f7266b91b163a62c654b1d329
Subproject commit 7010f0ab584ab9cdebb285272a0fb0ff0a5a791d