Start servers in parallel

Add list of clusters and nodes
msvensson@shellback.(none) 2006-05-18 23:35:17 +02:00
parent 0613889746
commit 3a3690a934
4 changed files with 1013 additions and 744 deletions


@@ -14,12 +14,17 @@ use POSIX 'WNOHANG';
sub mtr_run ($$$$$$;$);
sub mtr_spawn ($$$$$$;$);
sub mtr_stop_mysqld_servers ($);
sub mtr_check_stop_servers ($);
sub mtr_kill_leftovers ();
sub mtr_wait_blocking ($);
sub mtr_record_dead_children ();
sub mtr_ndbmgm_start($$);
sub mtr_mysqladmin_start($$$);
sub mtr_exit ($);
sub sleep_until_file_created ($$$);
sub mtr_kill_processes ($);
sub mtr_ping_with_timeout($);
sub mtr_ping_port ($);
# static in C
sub spawn_impl ($$$$$$$$);
@@ -31,7 +36,6 @@ sub spawn_impl ($$$$$$$$);
##############################################################################
# This function tries to mimic the C version used in "netware/mysql_test_run.c"
# FIXME learn it to handle append mode as well, a "new" flag or a "append"
sub mtr_run ($$$$$$;$) {
my $path= shift;
@@ -346,49 +350,71 @@ sub mtr_process_exit_status {
#
##############################################################################
# We just "ping" on the ports, and if we can't do a socket connect
# we assume the server is dead. So we don't *really* know a server
# is dead, we just hope that it after letting the listen port go,
# it is dead enough for us to start a new server.
# Kill all processes (mysqld, ndbd, ndb_mgmd and im) that would conflict with
# this run
# Make sure to remove the PID file, if any.
# kill IM manager first, else it will restart the servers
sub mtr_kill_leftovers () {
# First, kill all masters and slaves that would conflict with
# this run. Make sure to remove the PID file, if any.
# FIXME kill IM manager first, else it will restart the servers, how?!
my @args;
my @kill_pids;
my %admin_pids;
my $pid;
for ( my $idx; $idx < 2; $idx++ )
# Start shutdown of instance_managers, masters and slaves
foreach my $srv (@{$::instance_manager->{'instances'}},@{$::master},@{$::slave})
{
push(@args,{
pid => 0, # We don't know the PID
pidfile => $::instance_manager->{'instances'}->[$idx]->{'path_pid'},
sockfile => $::instance_manager->{'instances'}->[$idx]->{'path_sock'},
port => $::instance_manager->{'instances'}->[$idx]->{'port'},
});
$pid= mtr_mysqladmin_start($srv, "shutdown", 70);
# Save the pid of the mysqladmin process
$admin_pids{$pid}= 1;
push(@kill_pids,{
pid => $srv->{'pid'},
pidfile => $srv->{'path_pid'},
sockfile => $srv->{'path_sock'},
port => $srv->{'port'},
});
$srv->{'pid'}= 0; # Assume we are done with it
}
for ( my $idx; $idx < 2; $idx++ )
# Start shutdown of clusters
foreach my $cluster (@{$::clusters})
{
push(@args,{
pid => 0, # We don't know the PID
pidfile => $::master->[$idx]->{'path_mypid'},
sockfile => $::master->[$idx]->{'path_mysock'},
port => $::master->[$idx]->{'path_myport'},
});
$pid= mtr_ndbmgm_start($cluster, "shutdown");
# Save the pid of the ndb_mgm process
$admin_pids{$pid}= 1;
push(@kill_pids,{
pid => $cluster->{'pid'},
pidfile => $cluster->{'path_pid'},
port => $cluster->{'port'},
});
$cluster->{'pid'}= 0; # Assume we are done with it
foreach my $ndbd (@{$cluster->{'ndbds'}})
{
push(@kill_pids,{
pid => $ndbd->{'pid'},
pidfile => $ndbd->{'path_pid'},
});
$ndbd->{'pid'}= 0; # Assume we are done with it
}
}
for ( my $idx; $idx < 3; $idx++ )
{
push(@args,{
pid => 0, # We don't know the PID
pidfile => $::slave->[$idx]->{'path_mypid'},
sockfile => $::slave->[$idx]->{'path_mysock'},
port => $::slave->[$idx]->{'path_myport'},
});
}
# Wait for all the admin processes to complete
mtr_wait_blocking(\%admin_pids);
mtr_mysqladmin_shutdown(\@args, 20);
# If we trusted "mysqladmin --shutdown_timeout= ..." we could just
# terminate now, but we don't (FIXME should be debugged).
# So we try again to ping and at least wait the same amount of time
# mysqladmin would for all to die.
mtr_ping_with_timeout(\@kill_pids);
# We now have tried to terminate nice. We have waited for the listen
# port to be free, but can't really tell if the mysqld process died
@@ -453,7 +479,7 @@ sub mtr_kill_leftovers () {
do
{
kill(9, @pids);
mtr_debug("Sleep 1 second waiting for processes to die");
mtr_report("Sleep 1 second waiting for processes to die");
sleep(1) # Wait one second
} while ( $retries-- and kill(0, @pids) );
@@ -465,53 +491,60 @@ sub mtr_kill_leftovers () {
}
}
# We may have failed everything, bug we now check again if we have
# We may have failed everything, but we now check again if we have
# the listen ports free to use, and if they are free, just go for it.
foreach my $srv ( @args )
foreach my $srv ( @kill_pids )
{
if ( mtr_ping_mysqld_server($srv->{'port'}, $srv->{'sockfile'}) )
if ( mtr_ping_port($srv->{'port'}) )
{
mtr_warning("can't kill old mysqld holding port $srv->{'port'}");
mtr_warning("can't kill old process holding port $srv->{'port'}");
}
}
}
##############################################################################
#
# Shut down mysqld servers we have started from this run of this script
#
##############################################################################
# To speed things we kill servers in parallel. The argument is a list
# of 'ports', 'pids', 'pidfiles' and 'socketfiles'.
# Check that all processes in the list are killed
# The argument is a list of 'ports', 'pids', 'pidfiles' and 'socketfiles'
# for which shutdown has been started. Make sure they all get killed
# in one way or the other.
#
# FIXME On Cygwin, and maybe some other platforms, $srv->{'pid'} and
# $srv->{'pidfile'} will not be the same PID. We need to try to kill
# the pid in $srv->{'pidfile'} will not be the same PID. We need to try to kill
# both I think.
sub mtr_stop_mysqld_servers ($) {
sub mtr_check_stop_servers ($) {
my $spec= shift;
# ----------------------------------------------------------------------
# First try nice normal shutdown using 'mysqladmin'
# ----------------------------------------------------------------------
# Return if no processes are defined
return if ! @$spec;
# Shutdown time must be high as slave may be in reconnect
mtr_mysqladmin_shutdown($spec, 70);
#mtr_report("mtr_check_stop_servers");
mtr_ping_with_timeout(\@$spec);
# ----------------------------------------------------------------------
# We loop with waitpid() nonblocking to see how many of the ones we
# are to kill, actually got killed by mtr_mysqladmin_shutdown().
# Note that we don't rely on this, the mysqld server might have stop
# are to kill, actually got killed by mysqladmin or ndb_mgm
#
# Note that we don't rely on this, the mysqld server might have stopped
# listening to the port, but still be alive. But it is a start.
# ----------------------------------------------------------------------
foreach my $srv ( @$spec )
{
if ( $srv->{'pid'} and (waitpid($srv->{'pid'},&WNOHANG) == $srv->{'pid'}) )
my $ret_pid;
if ( $srv->{'pid'} )
{
$srv->{'pid'}= 0;
$ret_pid= waitpid($srv->{'pid'},&WNOHANG);
if ($ret_pid == $srv->{'pid'})
{
$srv->{'pid'}= 0;
}
else
{
# mtr_warning("catched exit of unknown child $ret_pid");
}
}
}
@@ -545,13 +578,12 @@ sub mtr_stop_mysqld_servers ($) {
}
# ----------------------------------------------------------------------
# If the processes where started from this script, and we had no PIDS
# If all the processes in list already have been killed,
# then we don't have to do anything.
# ----------------------------------------------------------------------
if ( ! keys %mysqld_pids )
{
# cluck "This is how we got here!";
return;
}
@@ -618,89 +650,100 @@ sub mtr_stop_mysqld_servers ($) {
# FIXME We just assume they are all dead, for Cygwin we are not
# really sure
}
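For orientation, the list handed to mtr_check_stop_servers() is built from hashes like the ones pushed onto @kill_pids earlier in this diff. A hypothetical call might look like the following sketch; the key names match the diff, the pids and paths are invented for illustration:

# Hypothetical spec list for mtr_check_stop_servers(); key names as in
# the hashes pushed onto @kill_pids above, values made up.
my $spec= [
  { pid      => 12345,
    pidfile  => "var/run/master.pid",
    sockfile => "var/tmp/master.sock",
    port     => 9306 },
  { pid      => 12346,                 # an ndbd entry: no sockfile/port
    pidfile  => "var/run/ndbd_1.pid" },
];
mtr_check_stop_servers($spec);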
# Wait for all the processes in the list to terminate
sub mtr_wait_blocking($) {
my $admin_pids= shift;
# Return if no processes defined
return if ! %$admin_pids;
# mtr_report("mtr_wait_blocking");
# Wait for all the started processes to exit
# As mysqladmin is such a simple program, we trust it to terminate itself.
# I.e. we wait blocking, and wait for them all before we go on.
foreach my $pid (keys %{$admin_pids})
{
# mtr_report("pid: $pid");
my $ret_pid= waitpid($pid,0);
}
}
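The pattern mtr_wait_blocking() relies on (start children in parallel, remember their pids in a hash, then reap each one with a blocking waitpid) can be shown in isolation with core Perl only; this sketch is not the mtr code itself:

use strict;
use warnings;

# Start a few children in parallel and remember their pids ...
my %admin_pids;
for my $cmd ('echo one', 'echo two') {
  my $pid= fork();
  die "fork failed: $!" unless defined $pid;
  if ($pid == 0) {
    exec $cmd or die "exec failed: $!";   # child
  }
  $admin_pids{$pid}= 1;                   # parent keeps the pid
}

# ... then wait blocking for every one of them, as mtr_wait_blocking() does.
waitpid($_, 0) for keys %admin_pids;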
# Start "mysqladmin shutdown" for a specific mysqld
sub mtr_mysqladmin_start($$$) {
my $srv= shift;
my $command= shift;
my $adm_shutdown_tmo= shift;
my $args;
mtr_init_args(\$args);
mtr_add_arg($args, "--no-defaults");
mtr_add_arg($args, "--user=%s", $::opt_user);
mtr_add_arg($args, "--password=");
mtr_add_arg($args, "--silent");
if ( -e $srv->{'path_sock'} )
{
mtr_add_arg($args, "--socket=%s", $srv->{'path_sock'});
}
if ( $srv->{'port'} )
{
mtr_add_arg($args, "--port=%s", $srv->{'port'});
}
if ( $srv->{'port'} and ! -e $srv->{'path_sock'} )
{
mtr_add_arg($args, "--protocol=tcp"); # Needed if no --socket
}
mtr_add_arg($args, "--connect_timeout=5");
# Shutdown time must be high as slave may be in reconnect
mtr_add_arg($args, "--shutdown_timeout=$adm_shutdown_tmo");
mtr_add_arg($args, "$command");
my $path_mysqladmin_log= "$::opt_vardir/log/mysqladmin.log";
my $pid= mtr_spawn($::exe_mysqladmin, $args,
"", $path_mysqladmin_log, $path_mysqladmin_log, "",
{ append_log_file => 1 });
# mtr_report("mtr_mysqladmin_start, pid: $pid");
return $pid;
}
# Start "ndb_mgm shutdown" for a specific cluster, it will
# shutdown all data nodes and leave the ndb_mgmd running
sub mtr_ndbmgm_start($$) {
my $cluster= shift;
my $command= shift;
my $args;
mtr_init_args(\$args);
mtr_add_arg($args, "--no-defaults");
mtr_add_arg($args, "--core");
mtr_add_arg($args, "--try-reconnect=1");
mtr_add_arg($args, "--ndb_connectstring=%s", $cluster->{'connect_string'});
mtr_add_arg($args, "-e");
mtr_add_arg($args, "$command");
my $pid= mtr_spawn($::exe_ndb_mgm, $args,
"", "/dev/null", "/dev/null", "",
{});
# mtr_report("mtr_ndbmgm_start, pid: $pid");
return $pid;
}
##############################################################################
#
# Shut down mysqld servers using "mysqladmin ... shutdown".
# To speed this up, we start them in parallel and use waitpid() to
# catch their termination. Note that this doesn't say the servers
# are terminated, just that 'mysqladmin' is terminated.
#
# Note that mysqladmin will ask the server about what PID file it uses,
# and mysqladmin will wait for it to be removed before it terminates
# (unless passes timeout).
#
# This function will take at most about 20 seconds, and we still are not
# sure we killed them all. If none is responding to ping, we return 1,
# else we return 0.
#
##############################################################################
sub mtr_mysqladmin_shutdown {
# Ping all servers in list, exit when none of them answers
# or when timeout has passed
sub mtr_ping_with_timeout($) {
my $spec= shift;
my $adm_shutdown_tmo= shift;
my %mysql_admin_pids;
# Start one "mysqladmin shutdown" for each server
foreach my $srv ( @$spec )
{
my $args;
mtr_init_args(\$args);
mtr_add_arg($args, "--no-defaults");
mtr_add_arg($args, "--user=%s", $::opt_user);
mtr_add_arg($args, "--password=");
mtr_add_arg($args, "--silent");
if ( -e $srv->{'sockfile'} )
{
mtr_add_arg($args, "--socket=%s", $srv->{'sockfile'});
}
if ( $srv->{'port'} )
{
mtr_add_arg($args, "--port=%s", $srv->{'port'});
}
if ( $srv->{'port'} and ! -e $srv->{'sockfile'} )
{
mtr_add_arg($args, "--protocol=tcp"); # Needed if no --socket
}
mtr_add_arg($args, "--connect_timeout=5");
# Shutdown time must be high as slave may be in reconnect
mtr_add_arg($args, "--shutdown_timeout=$adm_shutdown_tmo");
mtr_add_arg($args, "shutdown");
my $path_mysqladmin_log= "$::opt_vardir/log/mysqladmin.log";
# Start mysqladmin in paralell and wait for termination later
my $pid= mtr_spawn($::exe_mysqladmin, $args,
"", $path_mysqladmin_log, $path_mysqladmin_log, "",
{ append_log_file => 1 });
# Save the pid of the mysqladmin process
$mysql_admin_pids{$pid}= 1;
# We don't wait for termination of mysqladmin
}
# Wait for all the started mysqladmin to exit
# As mysqladmin is such a simple program, we trust it to terminate.
# I.e. we wait blocking, and wait wait for them all before we go on.
foreach my $pid (keys %mysql_admin_pids)
{
my $ret_pid= waitpid($pid,0);
# If this was any of the mysqladmin's we waited for, delete its
# pid from list
delete $mysql_admin_pids{$ret_pid} if exists $mysql_admin_pids{$ret_pid};
}
# If we trusted "mysqladmin --shutdown_timeout= ..." we could just
# terminate now, but we don't (FIXME should be debugged).
# So we try again to ping and at least wait the same amount of time
# mysqladmin would for all to die.
my $timeout= 20; # 20 seconds max
my $timeout= 200; # 20 seconds max
my $res= 1; # If we just fall through, we are done
# in the sense that the servers don't
# listen to their ports any longer
@@ -710,10 +753,13 @@ sub mtr_mysqladmin_shutdown {
foreach my $srv ( @$spec )
{
$res= 1; # We are optimistic
if ( mtr_ping_mysqld_server($srv->{'port'}, $srv->{'sockfile'}) )
if ( $srv->{'pid'} and mtr_ping_port($srv->{'port'}) )
{
mtr_debug("Sleep 1 second waiting for processes to stop using port");
sleep(1); # One second
mtr_report("waiting for process $srv->{'pid'} to stop ".
"using port $srv->{'port'}");
# Millisecond sleep emulated with select
select(undef, undef, undef, (0.1));
$res= 0;
next TIME;
}
@@ -721,7 +767,7 @@ sub mtr_mysqladmin_shutdown {
last; # If we got here, we are done
}
$timeout or mtr_debug("At least one server is still listening to its port");
$timeout or mtr_report("At least one server is still listening to its port");
return $res;
}
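The 0.1 second pause above uses the classic four-argument select() idiom for sub-second sleeps in Perl (Time::HiRes::sleep would be the modern alternative). A standalone illustration, not taken from mtr:

# select() with three undef filehandle sets and a fractional timeout
# simply blocks for that long; here: 100 milliseconds.
select(undef, undef, undef, 0.1);

With $timeout set to 200 iterations, the loop in mtr_ping_with_timeout() waits at most roughly 200 * 0.1 = 20 seconds, which matches the "20 seconds max" comment.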
@@ -742,12 +788,12 @@ sub mtr_record_dead_children () {
# -1 or 0 means there are no more processes to wait for
while ( ($ret_pid= waitpid(-1,&WNOHANG)) != 0 and $ret_pid != -1)
{
mtr_debug("waitpid() catched exit of child $ret_pid");
mtr_warning("waitpid() catched exit of child $ret_pid");
foreach my $idx (0..1)
{
if ( $::master->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was master[$idx]");
mtr_warning("child $ret_pid was master[$idx]");
$::master->[$idx]->{'pid'}= 0;
}
}
@@ -756,11 +802,31 @@ sub mtr_record_dead_children () {
{
if ( $::slave->[$idx]->{'pid'} eq $ret_pid )
{
mtr_debug("child $ret_pid was slave[$idx]");
mtr_warning("child $ret_pid was slave[$idx]");
$::slave->[$idx]->{'pid'}= 0;
last;
}
}
foreach my $cluster (@{$::clusters})
{
if ( $cluster->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndb_mgmd");
$cluster->{'pid'}= 0;
last;
}
foreach my $ndbd (@{$cluster->{'ndbds'}})
{
if ( $ndbd->{'pid'} eq $ret_pid )
{
mtr_warning("child $ret_pid was $cluster->{'name'} cluster ndbd");
$ndbd->{'pid'}= 0;
last;
}
}
}
}
}
@@ -784,7 +850,8 @@ sub stop_reap_all {
$SIG{CHLD}= 'DEFAULT';
}
sub mtr_ping_mysqld_server () {
sub mtr_ping_port ($) {
my $port= shift;
my $remote= "localhost";
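The rest of mtr_ping_port() falls outside this hunk; conceptually the "ping" is just a TCP connect attempt against the port. A minimal self-contained version, assuming IO::Socket::INET and not necessarily matching the real body, could be:

use IO::Socket::INET;

# Return 1 if something accepts connections on localhost:$port, else 0.
sub ping_port_sketch {
  my ($port)= @_;
  my $sock= IO::Socket::INET->new(PeerAddr => 'localhost',
                                  PeerPort => $port,
                                  Proto    => 'tcp',
                                  Timeout  => 2);
  return 0 unless $sock;
  close($sock);
  return 1;
}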
@@ -832,18 +899,17 @@ sub sleep_until_file_created ($$$) {
return $pid;
}
# Check if it died after the fork() was successful
# Check if it died after the fork() was successful
if ( $pid != 0 && waitpid($pid,&WNOHANG) == $pid )
{
return 0;
}
mtr_debug("Sleep $sleeptime milliseconds waiting for ".
"creation of $pidfile");
mtr_debug("Sleep $sleeptime milliseconds waiting for $pidfile");
# Print extra message every 60 seconds
my $seconds= ($loop * $sleeptime) / 1000;
if ( $seconds > 1 and $seconds % 60 == 0 )
if ( $seconds > 1 and int($seconds) % 60 == 0 )
{
my $left= $timeout - $seconds;
mtr_warning("Waited $seconds seconds for $pidfile to be created, " .
@@ -860,7 +926,7 @@ sub sleep_until_file_created ($$$) {
sub mtr_kill_processes ($) {
my $pids = shift;
# mtr_report("mtr_kill_processes " . join(" ", @$pids));
foreach my $sig (15, 9)
{
my $retries= 10;
@@ -868,9 +934,6 @@ sub mtr_kill_processes ($) {
{
kill($sig, @{$pids});
last unless kill (0, @{$pids}) and $retries--;
mtr_debug("Sleep 2 second waiting for processes to die");
sleep(2);
}
}
}
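The "kill (0, @{$pids})" test above works because kill() with signal 0 sends nothing; it only returns how many of the given pids could be signalled, i.e. how many are still around. A tiny illustration with made-up pids:

my @pids= (12345, 12346);        # hypothetical pids
my $alive= kill(0, @pids);       # no signal sent, just counts reachable processes
print "still alive: $alive\n";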
@@ -884,7 +947,7 @@ sub mtr_kill_processes ($) {
# FIXME something is wrong, we sometimes terminate with "Hangup" written
# to tty, and no STDERR output telling us why.
# FIXME for some readon, setting HUP to 'IGNORE' will cause exit() to
# FIXME for some reason, setting HUP to 'IGNORE' will cause exit() to
# write out "Hangup", and maybe loose some output. We insert a sleep...
sub mtr_exit ($) {


@@ -122,7 +122,7 @@ sub mtr_report_test_failed ($) {
{
print "[ fail ] timeout\n";
}
elsif ( $tinfo->{'ndb_test'} and !$::flag_ndb_status_ok)
elsif ( $tinfo->{'ndb_test'} and $::cluster->[0]->{'installed_ok'} eq "NO")
{
print "[ fail ] ndbcluster start failure\n";
return;
@@ -157,6 +157,7 @@ sub mtr_report_stats ($) {
my $tot_passed= 0;
my $tot_failed= 0;
my $tot_tests= 0;
my $tot_restarts= 0;
my $found_problems= 0; # Some warnings are errors...
foreach my $tinfo (@$tests)
@@ -175,6 +176,10 @@ sub mtr_report_stats ($) {
$tot_tests++;
$tot_failed++;
}
if ( $tinfo->{'restarted'} )
{
$tot_restarts++;
}
}
# ----------------------------------------------------------------------
@@ -197,6 +202,8 @@ sub mtr_report_stats ($) {
"the documentation at\n",
"http://www.mysql.com/doc/en/MySQL_test_suite.html\n";
}
print
"The servers was restarted $tot_restarts times\n";
# ----------------------------------------------------------------------
# If a debug run, there might be interesting information inside


@@ -27,8 +27,7 @@ sub run_stress_test ()
if ( ! $::glob_use_embedded_server and ! $::opt_local_master )
{
$::master->[0]->{'pid'}= mysqld_start('master',0,[],[],0);
if ( ! $::master->[0]->{'pid'} )
if ( ! mysqld_start($::master->[0],[],[]) )
{
mtr_error("Can't start the mysqld server");
}

File diff suppressed because it is too large