mariadb/server-tools/instance-manager/guardian.cc
unknown dce2554f91 Post-review fixes + some bugs fixed + several minor features
BitKeeper/deleted/.del-client_func.c~3476a8a85cbd3c29:
  Delete: server-tools/instance-manager/client_func.c
server-tools/instance-manager/Makefile.am:
  client_func removed
server-tools/instance-manager/buffer.cc:
  several methods added
server-tools/instance-manager/buffer.h:
  Some error-handling fixes.
server-tools/instance-manager/commands.cc:
  check for Buffer errors
server-tools/instance-manager/guardian.cc:
  Guardian rewritten. Now it works in a finite state machine-way.
server-tools/instance-manager/guardian.h:
  Appropriate (to .cc) changes in the header + some comment added
server-tools/instance-manager/instance.cc:
  added proxy thread to monitor instance. Two kinds of stop() now -- stop() and kill_instance which
  only sends a signal
server-tools/instance-manager/instance.h:
  appropriate changes
server-tools/instance-manager/instance_map.cc:
  cleanup
server-tools/instance-manager/instance_map.h:
  cleanup
server-tools/instance-manager/instance_options.cc:
  Caching of the pid-file-name is added. some comments added
server-tools/instance-manager/instance_options.h:
  cleanup
server-tools/instance-manager/listener.cc:
  listener my_thread_init added (though it doesn't use any mysys functions). Just in case
server-tools/instance-manager/manager.cc:
  SIGCHLD handler removed. now instance monitoring is implemented through proxy threads. This is to work nicely
  with LinuxThreads
server-tools/instance-manager/options.cc:
  added option to create a password file entry (this was implemented by Sergei Vojtovich)
server-tools/instance-manager/parse.cc:
  inline function get_word moved to the header
server-tools/instance-manager/parse.h:
  get_word moved here to use from parse_output
server-tools/instance-manager/parse_output.cc:
  get_word() clone removed. now looking through the output linewise
server-tools/instance-manager/protocol.cc:
  Buffer error check added
server-tools/instance-manager/user_map.cc:
  typo fixed
2005-02-11 14:21:59 +03:00

421 lines
11 KiB
C++

/* Copyright (C) 2004 MySQL AB
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
#ifdef __GNUC__
#pragma implementation
#endif
#include "guardian.h"
#include "instance_map.h"
#include "instance.h"
#include "mysql_manager_error.h"
#include "log.h"
#include <string.h>
#include <sys/types.h>
#include <signal.h>
/*
The Guardian list node structure. Guardian utilizes it to store
guarded instances plus some additional info. One node is allocated per
guarded instance and hung off the 'data' pointer of a LIST element.
*/
struct GUARD_NODE
{
/* the guarded instance itself (the pointer is stored, not a copy) */
Instance *instance;
/*
state of an instance as tracked by the Guardian state machine (the code
in this file uses NOT_STARTED, STARTING, STARTED, JUST_CRASHED, CRASHED,
CRASHED_AND_ABANDONED and STOPPING)
*/
int state;
/*
the number of attempts to restart the instance (reset to zero as soon as
the instance is seen running)
*/
int restart_counter;
/* set when a crash is detected; reset once the instance is seen running */
time_t crash_moment;
/* General time field. Used to provide timeouts (at shutdown and restart) */
time_t last_checked;
};
C_MODE_START

/*
  Thread entry point for the Guardian. 'arg' is the Guardian_thread object
  whose run() method is executed; the thread always returns 0.
*/
pthread_handler_decl(guardian, arg)
{
  ((Guardian_thread *) arg)->run();
  return 0;
}

C_MODE_END
/*
  Construct the Guardian: remember the arguments, start with an empty list
  of guarded instances and set up the memory root, the state flags and the
  synchronization primitives.
*/
Guardian_thread::Guardian_thread(Thread_registry &thread_registry_arg,
                                 Instance_map *instance_map_arg,
                                 uint monitoring_interval_arg)
  :Guardian_thread_args(thread_registry_arg, instance_map_arg,
                        monitoring_interval_arg),
   thread_info(pthread_self()),
   guarded_instances(0)
{
  /* list nodes and their payload are carved out of this memory root */
  init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0);

  /* neither stopped nor asked to shut down yet */
  stopped= FALSE;
  shutdown_requested= FALSE;

  pthread_cond_init(&COND_guardian, 0);
  pthread_mutex_init(&LOCK_guardian, 0);
}
/*
  Release the Guardian's memory root and synchronization primitives.
*/
Guardian_thread::~Guardian_thread()
{
  /*
    Taking the lock first postpones the cleanup until nobody else is
    holding the Guardian.
  */
  pthread_mutex_lock(&LOCK_guardian);
  free_root(&alloc, MYF(0));
  pthread_mutex_unlock(&LOCK_guardian);

  pthread_cond_destroy(&COND_guardian);
  pthread_mutex_destroy(&LOCK_guardian);
}
/*
  Ask the Guardian to shut down.

  SYNOPSYS
    request_shutdown()
    stop_instances_arg    passed through to stop_instances(): whether the
                          guarded instances should be stopped as well

  DESCRIPTION
    Under LOCK_guardian, raises the shutdown flag and lets stop_instances()
    either stop the instances or merely empty the Guardian list.
*/
void Guardian_thread::request_shutdown(bool stop_instances_arg)
{
  pthread_mutex_lock(&LOCK_guardian);
  shutdown_requested= TRUE;
  /* both steps happen under the same lock, so their order is not visible */
  stop_instances(stop_instances_arg);
  pthread_mutex_unlock(&LOCK_guardian);
}
/*
  Advance the Guardian state machine for one guarded instance.

  SYNOPSYS
    process_instance()
    instance            the instance to look after
    current_node        Guardian bookkeeping for that instance
    guarded_instances   head of the guarded list (updated on removal)
    elem                the list element holding current_node

  DESCRIPTION
    A node in the STOPPING state is removed from the list once the instance
    is reported as crashed/stopped, or is sent SIGKILL and removed when its
    shutdown delay has expired. For any other state a running instance is
    marked STARTED, and a non-running one is started or restarted according
    to the current state.
*/
void Guardian_thread::process_instance(Instance *instance,
                                       GUARD_NODE *current_node,
                                       LIST **guarded_instances,
                                       LIST *elem)
{
  /* The amount of times, Guardian attempts to restart an instance */
  const int max_restarts= 100;
  const time_t now= time(NULL);

  if (current_node->state == STOPPING)
  {
    /* this branch is executed during shutdown */
    int shutdown_wait= Instance::DEFAULT_SHUTDOWN_DELAY;
    bool drop_node;

    if (instance->options.shutdown_delay != NULL)
      shutdown_wait= atoi(instance->options.shutdown_delay);

    /* is_crashed() is true only when the instance was stopped for sure */
    drop_node= instance->is_crashed();
    if (!drop_node && (now - current_node->last_checked > shutdown_wait))
    {
      /* the instance did not go away in time: kill it the hard way */
      instance->kill_instance(SIGKILL);
      drop_node= true;
    }

    /*
      The caller moves on via elem->next afterwards. That is fine: the
      element is only unlinked, its own next pointer stays usable.
    */
    if (drop_node)
      *guarded_instances= list_delete(*guarded_instances, elem);
    return;
  }

  if (instance->is_running())
  {
    /* everything is fine: reset the crash bookkeeping */
    current_node->state= STARTED;
    current_node->crash_moment= 0;
    current_node->restart_counter= 0;
    return;
  }

  /* the instance is not running: act according to the recorded state */
  switch (current_node->state)
  {
  case NOT_STARTED:
    instance->start();
    log_info("guardian: starting instance %s",
             instance->options.instance_name);
    current_node->state= STARTING;
    current_node->last_checked= now;
    break;

  case STARTED:
  case STARTING:
    /* give the instance a chance to come up unless it has crashed */
    if (!instance->is_crashed())
      break;
    current_node->state= JUST_CRASHED;
    current_node->last_checked= now;
    current_node->crash_moment= now;
    /* fallthrough -- restart an instance immediately */

  case JUST_CRASHED:
    if (now - current_node->crash_moment > 2)
      current_node->state= CRASHED;
    else
    {
      instance->start();
      log_info("guardian: starting instance %s",
               instance->options.instance_name);
    }
    break;

  case CRASHED:
    /* regular restarts, at most one per monitoring interval */
    if (now - current_node->last_checked <= monitoring_interval)
      break;
    if (current_node->restart_counter >= max_restarts)
    {
      current_node->state= CRASHED_AND_ABANDONED;
      break;
    }
    instance->start();
    current_node->last_checked= now;
    /* the counter is bumped through the list element, as before */
    ((GUARD_NODE *) elem->data)->restart_counter++;
    log_info("guardian: starting instance %s",
             instance->options.instance_name);
    break;

  case CRASHED_AND_ABANDONED:
    /* we gave up on this one */
    break;

  default:
    DBUG_ASSERT(0);
  }
}
/*
  Run guardian thread

  SYNOPSYS
    run()

  DESCRIPTION
    Registers the thread and then loops while holding LOCK_guardian: every
    guarded instance is handed to process_instance(), after which the
    thread waits on COND_guardian for at most monitoring_interval seconds.
    The loop ends once shutdown has been requested and the list of guarded
    instances is empty. Then the 'stopped' flag is raised, the thread is
    unregistered and the thread registry is asked to shut down.
*/
void Guardian_thread::run()
{
  LIST *elem;
  struct timespec timeout;

  thread_registry.register_thread(&thread_info);
  my_thread_init();
  pthread_mutex_lock(&LOCK_guardian);

  /* loop, until all instances were shut down at the end */
  while (!(shutdown_requested && (guarded_instances == NULL)))
  {
    elem= guarded_instances;

    while (elem != NULL)
    {
      GUARD_NODE *current_node= (GUARD_NODE *) elem->data;

      process_instance(current_node->instance, current_node,
                       &guarded_instances, elem);
      /*
        process_instance() may have unlinked elem from the list; its next
        pointer is still used to continue the traversal.
      */
      elem= elem->next;
    }

    /* absolute deadline for the timed wait below */
    timeout.tv_sec= time(NULL) + monitoring_interval;
    timeout.tv_nsec= 0;

    /* check the loop predicate before sleeping */
    if (!(shutdown_requested && (guarded_instances == NULL)))
      pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout);
  }

  stopped= TRUE;
  pthread_mutex_unlock(&LOCK_guardian);
  /* now, when the Guardian is stopped we can stop the IM */
  thread_registry.unregister_thread(&thread_info);
  thread_registry.request_shutdown();
  my_thread_end();
}
/*
  Tell whether the Guardian thread has finished its main loop.

  RETURN
    the value of the 'stopped' flag, read under LOCK_guardian
*/
int Guardian_thread::is_stopped()
{
  pthread_mutex_lock(&LOCK_guardian);
  const int snapshot= stopped;
  pthread_mutex_unlock(&LOCK_guardian);
  return snapshot;
}
/*
  Initialize the list of guarded instances: loop through the Instance_map and
  add all of the instances, which don't have 'nonguarded' option specified.

  SYNOPSYS
    Guardian_thread::init()

  DESCRIPTION
    The instance map stays locked for the duration of the scan and is
    unlocked again on every exit path, including the failure one.

  RETURN
    0 - ok
    1 - error occurred (guard() failed for one of the instances)
*/
int Guardian_thread::init()
{
  Instance *instance;
  Instance_map::Iterator iterator(instance_map);
  int rc= 0;

  instance_map->lock();
  while ((instance= iterator.next()))
  {
    if ((instance->options.nonguarded == NULL) && guard(instance))
    {
      /* stop scanning, but fall through to the unlock below */
      rc= 1;
      break;
    }
  }
  instance_map->unlock();

  return rc;
}
/*
  Add instance to the Guardian list

  SYNOPSYS
    guard()
    instance           the instance to be guarded

  DESCRIPTION
    The instance is added to the guarded instances list. Usually guard() is
    called after we start an instance. The list element and its GUARD_NODE
    are allocated from the Guardian's memory root; the node starts in the
    NOT_STARTED state with all counters and timestamps zeroed.

  RETURN
    0 - ok
    1 - error occurred (allocation failed)
*/
int Guardian_thread::guard(Instance *instance)
{
  LIST *node;
  GUARD_NODE *content;

  node= (LIST *) alloc_root(&alloc, sizeof(LIST));
  content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE));

  if ((node == NULL) || (content == NULL))
    return 1;

  /* we store the pointers to instances from the instance_map's MEM_ROOT */
  content->instance= instance;
  content->restart_counter= 0;
  content->crash_moment= 0;
  /*
    Initialize every field of the node: the timeout checks read
    last_checked, so it should never hold indeterminate data.
  */
  content->last_checked= 0;
  content->state= NOT_STARTED;
  node->data= (void *) content;

  pthread_mutex_lock(&LOCK_guardian);
  guarded_instances= list_add(guarded_instances, node);
  pthread_mutex_unlock(&LOCK_guardian);

  return 0;
}
/*
  Remove an instance from the Guardian list.

  TODO: perhaps it would make sense to create a pool of the LIST elements
  and give them upon request. Now we are losing a bit of memory when
  guarded instance was stopped and then restarted (since we cannot free just
  a piece of the MEM_ROOT).

  RETURN
    0 - always; an instance that is not in the list is not an error
*/
int Guardian_thread::stop_guard(Instance *instance)
{
  pthread_mutex_lock(&LOCK_guardian);

  for (LIST *cursor= guarded_instances; cursor != NULL; cursor= cursor->next)
  {
    GUARD_NODE *entry= (GUARD_NODE *) cursor->data;

    /*
      Comparing pointers is enough, since the pointers always come from
      the instance_map's MEM_ROOT.
    */
    if (entry->instance == instance)
    {
      /* unlink the first match only and stop searching */
      guarded_instances= list_delete(guarded_instances, cursor);
      break;
    }
  }

  pthread_mutex_unlock(&LOCK_guardian);
  return 0;
}
/*
Start Guardian shutdown. Attempt to start instances if requested.
SYNOPSYS
stop_instances()
stop_instances_arg whether we should stop instances at shutdown
DESCRIPTION
Loops through the guarded_instances list and prepares them for shutdown.
If stop_instances was requested, we need to issue a stop command and change
the state accordingly. Otherwise we could simply delete an entry.
NOTE: Guardian should be locked by the calling function
RETURN
0 - ok
1 - error occured
*/
int Guardian_thread::stop_instances(bool stop_instances_arg)
{
LIST *node;
node= guarded_instances;
while (node != NULL)
{
if (!stop_instances_arg)
{
/* just forget about an instance */
guarded_instances= list_delete(guarded_instances, node);
/*
This should still work fine, as we have only removed the
node from the list. The pointer to the next one is still valid
*/
node= node->next;
}
else
{
GUARD_NODE *current_node= (GUARD_NODE *) node->data;
/*
If instance is running or was running (and now probably hanging),
request stop.
*/
if (current_node->instance->is_running() ||
(current_node->state == STARTED))
{
current_node->state= STOPPING;
current_node->last_checked= time(NULL);
}
else
/* otherwise remove it from the list */
guarded_instances= list_delete(guarded_instances, node);
/* But try to kill it anyway. Just in case */
current_node->instance->kill_instance(SIGTERM);
node= node->next;
}
}
return 0;
}