From c71dc395292f41685530c0fa92c5ce4d78eabfa0 Mon Sep 17 00:00:00 2001 From: Daniele Sciascia Date: Thu, 24 Nov 2022 14:47:18 +0100 Subject: [PATCH] MDEV-26499 Fix error "mysql_shutdown failed" during MTR tests - Fix to avoid mysqltest client getting killed abruptly during mysql_shutdown(). When Galera replication is shutdown, wait for THDs with `thd->stmt_da()->is_eof()` to disconnect (these are about to disconnect anyway). - Extract duplicate code from `wsrep_stop_replication()` and `wsrep_shutdown_replication()` in a new function. - No need to use a custom `shutdown_mysqld.inc` in galera suite. Delete it, so that the one in `mysql-test/include/` is used. Signed-off-by: Julius Goryavsky --- .../suite/galera/include/shutdown_mysqld.inc | 18 ----- mysql-test/suite/galera/r/MDEV-26499.result | 6 ++ mysql-test/suite/galera/t/MDEV-26499.test | 20 ++++++ sql/sql_parse.cc | 1 + sql/wsrep_mysqld.cc | 70 ++++++++++--------- 5 files changed, 64 insertions(+), 51 deletions(-) delete mode 100644 mysql-test/suite/galera/include/shutdown_mysqld.inc create mode 100644 mysql-test/suite/galera/r/MDEV-26499.result create mode 100644 mysql-test/suite/galera/t/MDEV-26499.test diff --git a/mysql-test/suite/galera/include/shutdown_mysqld.inc b/mysql-test/suite/galera/include/shutdown_mysqld.inc deleted file mode 100644 index 793be8d76ac..00000000000 --- a/mysql-test/suite/galera/include/shutdown_mysqld.inc +++ /dev/null @@ -1,18 +0,0 @@ -# This is the first half of include/restart_mysqld.inc. -if ($rpl_inited) -{ - if (!$allow_rpl_inited) - { - --die ERROR IN TEST: When using the replication test framework (master-slave.inc, rpl_init.inc etc), use rpl_restart_server.inc instead of restart_mysqld.inc. If you know what you are doing and you really have to use restart_mysqld.inc, set allow_rpl_inited=1 before you source restart_mysqld.inc - } -} - -# Write file to make mysql-test-run.pl expect the "crash", but don't start it ---let $_expect_file_name= `select regexp_replace(@@tmpdir, '^.*/','')` ---let $_expect_file_name= $MYSQLTEST_VARDIR/tmp/$_expect_file_name.expect ---exec echo "wait" > $_expect_file_name - -# Send shutdown to the connected server ---shutdown_server ---source include/wait_until_disconnected.inc - diff --git a/mysql-test/suite/galera/r/MDEV-26499.result b/mysql-test/suite/galera/r/MDEV-26499.result new file mode 100644 index 00000000000..15d372470a4 --- /dev/null +++ b/mysql-test/suite/galera/r/MDEV-26499.result @@ -0,0 +1,6 @@ +connection node_2; +connection node_1; +connection node_1; +connection node_2; +connection node_2; +SET GLOBAL debug_dbug="+d,simulate_slow_client_at_shutdown"; diff --git a/mysql-test/suite/galera/t/MDEV-26499.test b/mysql-test/suite/galera/t/MDEV-26499.test new file mode 100644 index 00000000000..824b28c14f3 --- /dev/null +++ b/mysql-test/suite/galera/t/MDEV-26499.test @@ -0,0 +1,20 @@ +# +# MDEV-26499 +# +# This test reproduces some failure on mysql_shutdown() call +# which manifests sporadically in some galera MTR tests during +# restart of a node. +# + +--source include/galera_cluster.inc +--source include/have_debug_sync.inc + +--let $node_1=node_1 +--let $node_2=node_2 +--source include/auto_increment_offset_save.inc + +--connection node_2 +SET GLOBAL debug_dbug="+d,simulate_slow_client_at_shutdown"; +--source include/restart_mysqld.inc + +--source include/auto_increment_offset_restore.inc diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 4d52e515710..0fa793cea1a 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -2208,6 +2208,7 @@ bool dispatch_command(enum enum_server_command command, THD *thd, my_eof(thd); kill_mysql(thd); error=TRUE; + DBUG_EXECUTE_IF("simulate_slow_client_at_shutdown", my_sleep(2000000);); break; } #endif diff --git a/sql/wsrep_mysqld.cc b/sql/wsrep_mysqld.cc index 5ec96478e4a..2e5bf3f426e 100644 --- a/sql/wsrep_mysqld.cc +++ b/sql/wsrep_mysqld.cc @@ -1014,10 +1014,8 @@ void wsrep_recover() WSREP_INFO("Recovered position: %s", oss.str().c_str()); } - -void wsrep_stop_replication(THD *thd) +static void wsrep_stop_replication_common(THD *thd) { - WSREP_INFO("Stop replication by %llu", (thd) ? thd->thread_id : 0); if (Wsrep_server_state::instance().state() != Wsrep_server_state::s_disconnected) { @@ -1030,40 +1028,30 @@ void wsrep_stop_replication(THD *thd) } } - /* my connection, should not terminate with wsrep_close_client_connection(), - make transaction to rollback - */ - if (thd && !thd->wsrep_applier) trans_rollback(thd); + /* my connection, should not terminate with + wsrep_close_client_connections(), make transaction to rollback */ + if (thd && !thd->wsrep_applier) + trans_rollback(thd); wsrep_close_client_connections(TRUE, thd); - + /* wait until appliers have stopped */ wsrep_wait_appliers_close(thd); node_uuid= WSREP_UUID_UNDEFINED; } +void wsrep_stop_replication(THD *thd) +{ + WSREP_INFO("Stop replication by %llu", (thd) ? thd->thread_id : 0); + wsrep_stop_replication_common(thd); +} + void wsrep_shutdown_replication() { WSREP_INFO("Shutdown replication"); - if (Wsrep_server_state::instance().state() != wsrep::server_state::s_disconnected) - { - WSREP_DEBUG("Disconnect provider"); - Wsrep_server_state::instance().disconnect(); - if (Wsrep_server_state::instance().wait_until_state( - Wsrep_server_state::s_disconnected)) - { - WSREP_WARN("Wsrep interrupted while waiting for disconnected state"); - } - } - - wsrep_close_client_connections(TRUE); - - /* wait until appliers have stopped */ - wsrep_wait_appliers_close(NULL); - node_uuid= WSREP_UUID_UNDEFINED; - + wsrep_stop_replication_common(nullptr); /* Undocking the thread specific data. */ - my_pthread_setspecific_ptr(THR_THD, NULL); + my_pthread_setspecific_ptr(THR_THD, nullptr); } bool wsrep_start_replication(const char *wsrep_cluster_address) @@ -2644,14 +2632,19 @@ static my_bool have_client_connections(THD *thd, void*) { DBUG_PRINT("quit",("Informing thread %lld that it's time to die", (longlong) thd->thread_id)); - if (is_client_connection(thd) && thd->killed == KILL_CONNECTION) + if (is_client_connection(thd)) { - WSREP_DEBUG("Informing thread %lld that it's time to die", - thd->thread_id); - (void)abort_replicated(thd); - return true; + if (thd->killed == KILL_CONNECTION) + { + (void)abort_replicated(thd); + return true; + } + if (thd->get_stmt_da()->is_eof()) + { + return true; + } } - return 0; + return false; } static void wsrep_close_thread(THD *thd) @@ -2691,14 +2684,24 @@ static my_bool kill_all_threads(THD *thd, THD *caller_thd) /* We skip slave threads & scheduler on this first loop through. */ if (is_client_connection(thd) && thd != caller_thd) { + if (thd->get_stmt_da()->is_eof()) + { + return 0; + } + if (is_replaying_connection(thd)) + { thd->set_killed(KILL_CONNECTION); - else if (!abort_replicated(thd)) + return 0; + } + + if (!abort_replicated(thd)) { /* replicated transactions must be skipped */ WSREP_DEBUG("closing connection %lld", (longlong) thd->thread_id); /* instead of wsrep_close_thread() we do now soft kill by THD::awake */ thd->awake(KILL_CONNECTION); + return 0; } } return 0; @@ -2710,6 +2713,7 @@ static my_bool kill_remaining_threads(THD *thd, THD *caller_thd) if (is_client_connection(thd) && !abort_replicated(thd) && !is_replaying_connection(thd) && + !thd->get_stmt_da()->is_eof() && thd_is_connection_alive(thd) && thd != caller_thd) {