mirror of
https://github.com/MariaDB/server.git
synced 2025-01-29 02:05:57 +01:00
MDEV-30260: Slave crashed:reload_acl_and_cache during shutdown
The signal handler thread can use various different runtime resources when processing a SIGHUP (e.g. master-info information) due to calling into reload_acl_and_cache(). Currently, the shutdown process waits for the termination of the signal thread after performing cleanup. However, this could cause resources actively used by the signal handler to be freed while reload_acl_and_cache() is processing. The specific resource that caused MDEV-30260 is a race condition for the hostname_cache, such that mysqld would delete it in clean_up()::hostname_cache_free(), before the signal handler would use it in reload_acl_and_cache()::hostname_cache_refresh(). Another similar resource is the active_mi/master_info_index. There was a race between its deletion by the main thread in end_slave(), and their usage by the Signal Handler as a part of Master_info_index::flush_all_relay_logs.read(active_mi) in reload_acl_and_cache(). This patch fixes these race conditions by relocating where server shutdown waits for the signal handler to die until after server-level threads have been killed (i.e., as a last step of close_connections()). With respect to the hostname_cache, active_mi and master_info_cache, this ensures that they cannot be destroyed while the signal handler is still active, and potentially using them. Additionally: 1) This requires that Events memory is still in place for SIGHUP handling's mysql_print_status(). So event deinitialization is moved into clean_up(), but the event scheduler still needs to be stopped in close_connections() at the same spot. 2) The function kill_server_thread is no longer used, so it is deleted 3) The timeout to wait for the death of the signal thread was not consistent with the comment. The comment mentioned up to 10 seconds, whereas it was actually 0.01s. The code has been fixed to wait up to 10 seconds. 4) A warning has been added if the signal handler thread fails to exit in time. 5) Added pthread_join() to end of wait_for_signal_thread_to_end() if it hadn't ended in 10s with a warning. Note this also removes the pthread_detached attribute from the signal_thread to allow for the pthread_join(). Reviewed By: =========== Vladislav Vaintroub <wlad@mariadb.com> Andrei Elkin <andrei.elkin@mariadb.com>
This commit is contained in:
parent
4980fcb990
commit
952ab9a596
4 changed files with 259 additions and 35 deletions
50
mysql-test/suite/rpl/r/rpl_shutdown_sighup.result
Normal file
50
mysql-test/suite/rpl/r/rpl_shutdown_sighup.result
Normal file
|
@ -0,0 +1,50 @@
|
|||
include/master-slave.inc
|
||||
[connection master]
|
||||
connection slave;
|
||||
set statement sql_log_bin=0 for call mtr.add_suppression("Signal handler thread did not exit in a timely manner");
|
||||
#
|
||||
# Main test
|
||||
connection master;
|
||||
create table t1 (a int);
|
||||
insert into t1 values (1);
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
include/sync_with_master_gtid.inc
|
||||
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
|
||||
# Waiting for sighup to reach reload_acl_and_cache..
|
||||
set debug_sync="now wait_for in_reload_acl_and_cache";
|
||||
# Signalling signal handler to proceed to sleep before REFRESH_HOSTS
|
||||
set debug_sync="now signal refresh_logs";
|
||||
# Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
|
||||
shutdown;
|
||||
connection server_2;
|
||||
connection slave;
|
||||
include/assert_grep.inc [Ensure Mariadbd did not segfault when shutting down]
|
||||
connection master;
|
||||
connection slave;
|
||||
#
|
||||
# Error testcase to ensure an error message is shown if the signal
|
||||
# takes longer than the timeout while processing the SIGHUP
|
||||
connection slave;
|
||||
set @@global.debug_dbug= "+d,force_sighup_processing_timeout";
|
||||
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
|
||||
connection master;
|
||||
insert into t1 values (1);
|
||||
include/save_master_gtid.inc
|
||||
connection slave;
|
||||
include/sync_with_master_gtid.inc
|
||||
# Waiting for sighup to reach reload_acl_and_cache..
|
||||
set debug_sync="now wait_for in_reload_acl_and_cache";
|
||||
# Signalling signal handler to proceed to sleep before REFRESH_HOSTS
|
||||
set debug_sync="now signal refresh_logs";
|
||||
# Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
|
||||
shutdown;
|
||||
connection server_2;
|
||||
connection slave;
|
||||
include/assert_grep.inc [Ensure warning is issued that signal handler thread is still processing]
|
||||
#
|
||||
# Cleanup
|
||||
connection master;
|
||||
drop table t1;
|
||||
include/rpl_end.inc
|
||||
# End of rpl_shutdown_sighup.test
|
154
mysql-test/suite/rpl/t/rpl_shutdown_sighup.test
Normal file
154
mysql-test/suite/rpl/t/rpl_shutdown_sighup.test
Normal file
|
@ -0,0 +1,154 @@
|
|||
#
|
||||
# The signal handler thread can use various different runtime resources when
|
||||
# processing a SIGHUP (e.g. master-info information), as the logic calls into
|
||||
# reload_acl_and_cache(). This test ensures that SIGHUP processing, when
|
||||
# concurrent with server shutdown, the shutdown logic must wait for the SIGHUP
|
||||
# processing to finish before cleaning up any resources.
|
||||
#
|
||||
# Additionally, the error case is tested such that the signal handler thread
|
||||
# takes too long processing a SIGHUP, and the main mysqld thread must skip its
|
||||
# wait and output a warning.
|
||||
#
|
||||
# Note the SIGHUP is sent via the command-line kill program via a perl script.
|
||||
#
|
||||
# References:
|
||||
# MDEV-30260: Slave crashed:reload_acl_and_cache during shutdown
|
||||
#
|
||||
|
||||
--source include/not_windows.inc
|
||||
--source include/not_embedded.inc
|
||||
--source include/have_debug.inc
|
||||
--source include/have_debug_sync.inc
|
||||
|
||||
# Binlog format doesn't matter
|
||||
--source include/have_binlog_format_statement.inc
|
||||
--source include/master-slave.inc
|
||||
|
||||
# For error test case which forces timeout
|
||||
--connection slave
|
||||
set statement sql_log_bin=0 for call mtr.add_suppression("Signal handler thread did not exit in a timely manner");
|
||||
|
||||
|
||||
--echo #
|
||||
--echo # Main test
|
||||
--connection master
|
||||
create table t1 (a int);
|
||||
insert into t1 values (1);
|
||||
--source include/save_master_gtid.inc
|
||||
|
||||
--connection slave
|
||||
--source include/sync_with_master_gtid.inc
|
||||
|
||||
# Make signal handler handle SIGHUP..
|
||||
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
|
||||
--let KILL_NODE_PIDFILE = `SELECT @@pid_file`
|
||||
--perl
|
||||
my $kill_sig = $ENV{'KILL_SIGNAL_VALUE'};
|
||||
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
|
||||
my $mysqld_pid = `cat $pid_filename`;
|
||||
chomp($mysqld_pid);
|
||||
system("kill -HUP $mysqld_pid");
|
||||
exit(0);
|
||||
EOF
|
||||
|
||||
--echo # Waiting for sighup to reach reload_acl_and_cache..
|
||||
set debug_sync="now wait_for in_reload_acl_and_cache";
|
||||
--echo # Signalling signal handler to proceed to sleep before REFRESH_HOSTS
|
||||
set debug_sync="now signal refresh_logs";
|
||||
|
||||
# ..while we are shutting down
|
||||
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
|
||||
wait
|
||||
EOF
|
||||
--echo # Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
|
||||
shutdown;
|
||||
|
||||
--source include/wait_until_disconnected.inc
|
||||
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
|
||||
restart: --skip-slave-start=0
|
||||
EOF
|
||||
|
||||
--connection server_2
|
||||
--enable_reconnect
|
||||
--source include/wait_until_connected_again.inc
|
||||
|
||||
--connection slave
|
||||
--enable_reconnect
|
||||
--source include/wait_until_connected_again.inc
|
||||
|
||||
--let $assert_text= Ensure Mariadbd did not segfault when shutting down
|
||||
--let $assert_select= got signal 11
|
||||
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.err
|
||||
--let $assert_count= 0
|
||||
--let $assert_only_after = CURRENT_TEST: rpl.rpl_shutdown_sighup
|
||||
--source include/assert_grep.inc
|
||||
|
||||
--connection master
|
||||
--sync_slave_with_master
|
||||
|
||||
|
||||
--echo #
|
||||
--echo # Error testcase to ensure an error message is shown if the signal
|
||||
--echo # takes longer than the timeout while processing the SIGHUP
|
||||
|
||||
--connection slave
|
||||
set @@global.debug_dbug= "+d,force_sighup_processing_timeout";
|
||||
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
|
||||
|
||||
--connection master
|
||||
insert into t1 values (1);
|
||||
--source include/save_master_gtid.inc
|
||||
|
||||
--connection slave
|
||||
--source include/sync_with_master_gtid.inc
|
||||
|
||||
# Make signal handler handle SIGHUP..
|
||||
--let KILL_NODE_PIDFILE = `SELECT @@pid_file`
|
||||
--perl
|
||||
my $kill_sig = $ENV{'KILL_SIGNAL_VALUE'};
|
||||
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
|
||||
my $mysqld_pid = `cat $pid_filename`;
|
||||
chomp($mysqld_pid);
|
||||
system("kill -HUP $mysqld_pid");
|
||||
exit(0);
|
||||
EOF
|
||||
--echo # Waiting for sighup to reach reload_acl_and_cache..
|
||||
set debug_sync="now wait_for in_reload_acl_and_cache";
|
||||
--echo # Signalling signal handler to proceed to sleep before REFRESH_HOSTS
|
||||
set debug_sync="now signal refresh_logs";
|
||||
|
||||
# ..while we are shutting down
|
||||
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
|
||||
wait
|
||||
EOF
|
||||
--echo # Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
|
||||
shutdown;
|
||||
|
||||
--source include/wait_until_disconnected.inc
|
||||
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
|
||||
restart: --skip-slave-start=0
|
||||
EOF
|
||||
|
||||
--connection server_2
|
||||
--enable_reconnect
|
||||
--source include/wait_until_connected_again.inc
|
||||
|
||||
--connection slave
|
||||
--enable_reconnect
|
||||
--source include/wait_until_connected_again.inc
|
||||
|
||||
--let $assert_text= Ensure warning is issued that signal handler thread is still processing
|
||||
--let $assert_select= Signal handler thread did not exit in a timely manner.
|
||||
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.err
|
||||
--let $assert_count= 1
|
||||
--let $assert_only_after = CURRENT_TEST: rpl.rpl_shutdown_sighup
|
||||
--source include/assert_grep.inc
|
||||
|
||||
|
||||
--echo #
|
||||
--echo # Cleanup
|
||||
--connection master
|
||||
drop table t1;
|
||||
|
||||
--source include/rpl_end.inc
|
||||
--echo # End of rpl_shutdown_sighup.test
|
|
@ -1766,7 +1766,8 @@ static void close_connections(void)
|
|||
DBUG_EXECUTE_IF("delay_shutdown_phase_2_after_semisync_wait",
|
||||
my_sleep(500000););
|
||||
|
||||
Events::deinit();
|
||||
if (Events::inited)
|
||||
Events::stop();
|
||||
slave_prepare_for_shutdown();
|
||||
ack_receiver.stop();
|
||||
|
||||
|
@ -1827,6 +1828,12 @@ static void close_connections(void)
|
|||
}
|
||||
/* End of kill phase 2 */
|
||||
|
||||
/*
|
||||
The signal thread can use server resources, e.g. when processing SIGHUP,
|
||||
and it must end gracefully before clean_up()
|
||||
*/
|
||||
wait_for_signal_thread_to_end();
|
||||
|
||||
DBUG_PRINT("quit",("close_connections thread"));
|
||||
DBUG_VOID_RETURN;
|
||||
}
|
||||
|
@ -1930,14 +1937,8 @@ extern "C" void unireg_abort(int exit_code)
|
|||
static void mysqld_exit(int exit_code)
|
||||
{
|
||||
DBUG_ENTER("mysqld_exit");
|
||||
/*
|
||||
Important note: we wait for the signal thread to end,
|
||||
but if a kill -15 signal was sent, the signal thread did
|
||||
spawn the kill_server_thread thread, which is running concurrently.
|
||||
*/
|
||||
rpl_deinit_gtid_waiting();
|
||||
rpl_deinit_gtid_slave_state();
|
||||
wait_for_signal_thread_to_end();
|
||||
#ifdef WITH_WSREP
|
||||
wsrep_deinit_server();
|
||||
wsrep_sst_auth_free();
|
||||
|
@ -2016,6 +2017,9 @@ static void clean_up(bool print_message)
|
|||
free_status_vars();
|
||||
end_thr_alarm(1); /* Free allocated memory */
|
||||
end_thr_timer();
|
||||
#ifndef EMBEDDED_LIBRARY
|
||||
Events::deinit();
|
||||
#endif
|
||||
my_free_open_file_info();
|
||||
if (defaults_argv)
|
||||
free_defaults(defaults_argv);
|
||||
|
@ -2083,16 +2087,32 @@ static void clean_up(bool print_message)
|
|||
*/
|
||||
static void wait_for_signal_thread_to_end()
|
||||
{
|
||||
uint i;
|
||||
uint i, n_waits= DBUG_EVALUATE("force_sighup_processing_timeout", 5, 100);
|
||||
int err= 0;
|
||||
/*
|
||||
Wait up to 10 seconds for signal thread to die. We use this mainly to
|
||||
avoid getting warnings that my_thread_end has not been called
|
||||
*/
|
||||
for (i= 0 ; i < 100 && signal_thread_in_use; i++)
|
||||
for (i= 0 ; i < n_waits && signal_thread_in_use; i++)
|
||||
{
|
||||
if (pthread_kill(signal_thread, MYSQL_KILL_SIGNAL) == ESRCH)
|
||||
err= pthread_kill(signal_thread, MYSQL_KILL_SIGNAL);
|
||||
if (err)
|
||||
break;
|
||||
my_sleep(100); // Give it time to die
|
||||
my_sleep(100000); // Give it time to die, .1s per iteration
|
||||
}
|
||||
|
||||
if (err && err != ESRCH)
|
||||
{
|
||||
sql_print_error("Failed to send kill signal to signal handler thread, "
|
||||
"pthread_kill() errno: %d",
|
||||
err);
|
||||
}
|
||||
|
||||
if (i == n_waits && signal_thread_in_use)
|
||||
{
|
||||
sql_print_warning("Signal handler thread did not exit in a timely manner. "
|
||||
"Continuing to wait for it to stop..");
|
||||
pthread_join(signal_thread, NULL);
|
||||
}
|
||||
}
|
||||
#endif /*EMBEDDED_LIBRARY*/
|
||||
|
@ -2916,7 +2936,6 @@ static void start_signal_handler(void)
|
|||
|
||||
(void) pthread_attr_init(&thr_attr);
|
||||
pthread_attr_setscope(&thr_attr,PTHREAD_SCOPE_SYSTEM);
|
||||
(void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
|
||||
(void) my_setstacksize(&thr_attr,my_thread_stack_size);
|
||||
|
||||
mysql_mutex_lock(&LOCK_start_thread);
|
||||
|
@ -2936,18 +2955,6 @@ static void start_signal_handler(void)
|
|||
}
|
||||
|
||||
|
||||
#if defined(USE_ONE_SIGNAL_HAND)
|
||||
pthread_handler_t kill_server_thread(void *arg __attribute__((unused)))
|
||||
{
|
||||
my_thread_init(); // Initialize new thread
|
||||
break_connect_loop();
|
||||
my_thread_end();
|
||||
pthread_exit(0);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/** This threads handles all signals and alarms. */
|
||||
/* ARGSUSED */
|
||||
pthread_handler_t signal_hand(void *arg __attribute__((unused)))
|
||||
|
@ -3004,7 +3011,7 @@ pthread_handler_t signal_hand(void *arg __attribute__((unused)))
|
|||
int origin;
|
||||
|
||||
while ((error= my_sigwait(&set, &sig, &origin)) == EINTR) /* no-op */;
|
||||
if (cleanup_done)
|
||||
if (abort_loop)
|
||||
{
|
||||
DBUG_PRINT("quit",("signal_handler: calling my_thread_end()"));
|
||||
my_thread_end();
|
||||
|
@ -3027,18 +3034,8 @@ pthread_handler_t signal_hand(void *arg __attribute__((unused)))
|
|||
{
|
||||
/* Delete the instrumentation for the signal thread */
|
||||
PSI_CALL_delete_current_thread();
|
||||
#ifdef USE_ONE_SIGNAL_HAND
|
||||
pthread_t tmp;
|
||||
if (unlikely((error= mysql_thread_create(0, /* Not instrumented */
|
||||
&tmp, &connection_attrib,
|
||||
kill_server_thread,
|
||||
(void*) &sig))))
|
||||
sql_print_error("Can't create thread to kill server (errno= %d)",
|
||||
error);
|
||||
#else
|
||||
my_sigset(sig, SIG_IGN);
|
||||
break_connect_loop(); // MIT THREAD has a alarm thread
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case SIGHUP:
|
||||
|
|
|
@ -67,6 +67,15 @@ bool reload_acl_and_cache(THD *thd, unsigned long long options,
|
|||
bool result=0;
|
||||
select_errors=0; /* Write if more errors */
|
||||
int tmp_write_to_binlog= *write_to_binlog= 1;
|
||||
#ifndef DBUG_OFF
|
||||
/*
|
||||
When invoked for handling a SIGHUP by rpl_shutdown_sighup.test, we need to
|
||||
force the signal handler to wait after REFRESH_TABLES, as that will check
|
||||
for a killed server, and we need to call hostname_cache_refresh after
|
||||
server cleanup has happened to trigger MDEV-30260.
|
||||
*/
|
||||
int do_dbug_sleep= 0;
|
||||
#endif
|
||||
|
||||
DBUG_ASSERT(!thd || !thd->in_sub_stmt);
|
||||
|
||||
|
@ -99,6 +108,15 @@ bool reload_acl_and_cache(THD *thd, unsigned long long options,
|
|||
*/
|
||||
my_error(ER_UNKNOWN_ERROR, MYF(0));
|
||||
}
|
||||
|
||||
#ifndef DBUG_OFF
|
||||
DBUG_EXECUTE_IF("hold_sighup_log_refresh", {
|
||||
DBUG_ASSERT(!debug_sync_set_action(
|
||||
thd, STRING_WITH_LEN("now SIGNAL in_reload_acl_and_cache "
|
||||
"WAIT_FOR refresh_logs")));
|
||||
do_dbug_sleep= 1;
|
||||
});
|
||||
#endif
|
||||
}
|
||||
opt_noacl= 0;
|
||||
|
||||
|
@ -351,6 +369,11 @@ bool reload_acl_and_cache(THD *thd, unsigned long long options,
|
|||
}
|
||||
my_dbopt_cleanup();
|
||||
}
|
||||
|
||||
#ifndef DBUG_OFF
|
||||
if (do_dbug_sleep)
|
||||
my_sleep(3000000); // 3s
|
||||
#endif
|
||||
if (options & REFRESH_HOSTS)
|
||||
hostname_cache_refresh();
|
||||
if (thd && (options & REFRESH_STATUS))
|
||||
|
|
Loading…
Add table
Reference in a new issue