MDEV-26499 Fix error "mysql_shutdown failed" during MTR tests

- Fix to avoid the mysqltest client getting killed abruptly during
  mysql_shutdown(). When Galera replication is shut down, wait for
  THDs with `thd->get_stmt_da()->is_eof()` to disconnect (these are
  about to disconnect anyway).
- Extract duplicate code from `wsrep_stop_replication()` and
  `wsrep_shutdown_replication()` into a new function.
- There is no need for a custom `shutdown_mysqld.inc` in the galera
  suite. Delete it, so that the one in `mysql-test/include/` is used.

Signed-off-by: Julius Goryavsky <julius.goryavsky@mariadb.com>
This commit is contained in:
Daniele Sciascia 2022-11-24 14:47:18 +01:00 committed by Julius Goryavsky
parent db0b9ec37b
commit c71dc39529
5 changed files with 64 additions and 51 deletions

View file

@ -1,18 +0,0 @@
# This is the first half of include/restart_mysqld.inc.
if ($rpl_inited)
{
if (!$allow_rpl_inited)
{
--die ERROR IN TEST: When using the replication test framework (master-slave.inc, rpl_init.inc etc), use rpl_restart_server.inc instead of restart_mysqld.inc. If you know what you are doing and you really have to use restart_mysqld.inc, set allow_rpl_inited=1 before you source restart_mysqld.inc
}
}
# Write file to make mysql-test-run.pl expect the "crash", but don't start it
--let $_expect_file_name= `select regexp_replace(@@tmpdir, '^.*/','')`
--let $_expect_file_name= $MYSQLTEST_VARDIR/tmp/$_expect_file_name.expect
--exec echo "wait" > $_expect_file_name
# Send shutdown to the connected server
--shutdown_server
--source include/wait_until_disconnected.inc

View file

@ -0,0 +1,6 @@
connection node_2;
connection node_1;
connection node_1;
connection node_2;
connection node_2;
SET GLOBAL debug_dbug="+d,simulate_slow_client_at_shutdown";

View file

@ -0,0 +1,20 @@
#
# MDEV-26499
#
# This test reproduces a failure of the mysql_shutdown() call
# that manifests sporadically in some Galera MTR tests during
# the restart of a node.
#
--source include/galera_cluster.inc
--source include/have_debug_sync.inc
--let $node_1=node_1
--let $node_2=node_2
--source include/auto_increment_offset_save.inc
--connection node_2
SET GLOBAL debug_dbug="+d,simulate_slow_client_at_shutdown";
--source include/restart_mysqld.inc
--source include/auto_increment_offset_restore.inc

View file

@ -2208,6 +2208,7 @@ bool dispatch_command(enum enum_server_command command, THD *thd,
my_eof(thd); my_eof(thd);
kill_mysql(thd); kill_mysql(thd);
error=TRUE; error=TRUE;
DBUG_EXECUTE_IF("simulate_slow_client_at_shutdown", my_sleep(2000000););
break; break;
} }
#endif #endif

View file

@ -1014,10 +1014,8 @@ void wsrep_recover()
WSREP_INFO("Recovered position: %s", oss.str().c_str()); WSREP_INFO("Recovered position: %s", oss.str().c_str());
} }
static void wsrep_stop_replication_common(THD *thd)
void wsrep_stop_replication(THD *thd)
{ {
WSREP_INFO("Stop replication by %llu", (thd) ? thd->thread_id : 0);
if (Wsrep_server_state::instance().state() != if (Wsrep_server_state::instance().state() !=
Wsrep_server_state::s_disconnected) Wsrep_server_state::s_disconnected)
{ {
@ -1030,40 +1028,30 @@ void wsrep_stop_replication(THD *thd)
} }
} }
/* my connection, should not terminate with wsrep_close_client_connection(), /* my connection, should not terminate with
make transaction to rollback wsrep_close_client_connections(), make transaction to rollback */
*/ if (thd && !thd->wsrep_applier)
if (thd && !thd->wsrep_applier) trans_rollback(thd); trans_rollback(thd);
wsrep_close_client_connections(TRUE, thd); wsrep_close_client_connections(TRUE, thd);
/* wait until appliers have stopped */ /* wait until appliers have stopped */
wsrep_wait_appliers_close(thd); wsrep_wait_appliers_close(thd);
node_uuid= WSREP_UUID_UNDEFINED; node_uuid= WSREP_UUID_UNDEFINED;
} }
/*
  Stop wsrep replication on behalf of a session.

  Logs the id of the requesting thread (0 when thd is NULL) and then
  delegates the actual work to wsrep_stop_replication_common().

  @param thd  requesting session, may be NULL
*/
void wsrep_stop_replication(THD *thd)
{
WSREP_INFO("Stop replication by %llu", (thd) ? thd->thread_id : 0);
wsrep_stop_replication_common(thd);
}
void wsrep_shutdown_replication() void wsrep_shutdown_replication()
{ {
WSREP_INFO("Shutdown replication"); WSREP_INFO("Shutdown replication");
if (Wsrep_server_state::instance().state() != wsrep::server_state::s_disconnected) wsrep_stop_replication_common(nullptr);
{
WSREP_DEBUG("Disconnect provider");
Wsrep_server_state::instance().disconnect();
if (Wsrep_server_state::instance().wait_until_state(
Wsrep_server_state::s_disconnected))
{
WSREP_WARN("Wsrep interrupted while waiting for disconnected state");
}
}
wsrep_close_client_connections(TRUE);
/* wait until appliers have stopped */
wsrep_wait_appliers_close(NULL);
node_uuid= WSREP_UUID_UNDEFINED;
/* Undocking the thread specific data. */ /* Undocking the thread specific data. */
my_pthread_setspecific_ptr(THR_THD, NULL); my_pthread_setspecific_ptr(THR_THD, nullptr);
} }
bool wsrep_start_replication(const char *wsrep_cluster_address) bool wsrep_start_replication(const char *wsrep_cluster_address)
@ -2644,14 +2632,19 @@ static my_bool have_client_connections(THD *thd, void*)
{ {
DBUG_PRINT("quit",("Informing thread %lld that it's time to die", DBUG_PRINT("quit",("Informing thread %lld that it's time to die",
(longlong) thd->thread_id)); (longlong) thd->thread_id));
if (is_client_connection(thd) && thd->killed == KILL_CONNECTION) if (is_client_connection(thd))
{ {
WSREP_DEBUG("Informing thread %lld that it's time to die", if (thd->killed == KILL_CONNECTION)
thd->thread_id); {
(void)abort_replicated(thd); (void)abort_replicated(thd);
return true; return true;
}
if (thd->get_stmt_da()->is_eof())
{
return true;
}
} }
return 0; return false;
} }
static void wsrep_close_thread(THD *thd) static void wsrep_close_thread(THD *thd)
@ -2691,14 +2684,24 @@ static my_bool kill_all_threads(THD *thd, THD *caller_thd)
/* We skip slave threads & scheduler on this first loop through. */ /* We skip slave threads & scheduler on this first loop through. */
if (is_client_connection(thd) && thd != caller_thd) if (is_client_connection(thd) && thd != caller_thd)
{ {
if (thd->get_stmt_da()->is_eof())
{
return 0;
}
if (is_replaying_connection(thd)) if (is_replaying_connection(thd))
{
thd->set_killed(KILL_CONNECTION); thd->set_killed(KILL_CONNECTION);
else if (!abort_replicated(thd)) return 0;
}
if (!abort_replicated(thd))
{ {
/* replicated transactions must be skipped */ /* replicated transactions must be skipped */
WSREP_DEBUG("closing connection %lld", (longlong) thd->thread_id); WSREP_DEBUG("closing connection %lld", (longlong) thd->thread_id);
/* instead of wsrep_close_thread() we do now soft kill by THD::awake */ /* instead of wsrep_close_thread() we do now soft kill by THD::awake */
thd->awake(KILL_CONNECTION); thd->awake(KILL_CONNECTION);
return 0;
} }
} }
return 0; return 0;
@ -2710,6 +2713,7 @@ static my_bool kill_remaining_threads(THD *thd, THD *caller_thd)
if (is_client_connection(thd) && if (is_client_connection(thd) &&
!abort_replicated(thd) && !abort_replicated(thd) &&
!is_replaying_connection(thd) && !is_replaying_connection(thd) &&
!thd->get_stmt_da()->is_eof() &&
thd_is_connection_alive(thd) && thd_is_connection_alive(thd) &&
thd != caller_thd) thd != caller_thd)
{ {