Bug #28899 not possible to set separate watchdog timeout at startup

storage/ndb/include/mgmapi/mgmapi_config_parameters.h:
  add new configuration parameter TimeBetweenWatchDogCheckInitial
storage/ndb/include/portlib/NdbTick.h:
  enable timing code
storage/ndb/src/common/portlib/NdbTick.c:
  enable timing code
storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
  read watchdog timeout to set it after malloc
storage/ndb/src/kernel/vm/Configuration.cpp:
  read initial watchdog timeout and set it in the beginning
storage/ndb/src/kernel/vm/Configuration.hpp:
  read initial watchdog timeout and set it in the beginning
storage/ndb/src/kernel/vm/SimulatedBlock.cpp:
  introduce new state for "action" malloc of memory
storage/ndb/src/kernel/vm/SimulatedBlock.hpp:
  introduce new state for "action" malloc of memory
storage/ndb/src/kernel/vm/WatchDog.cpp:
  rewrite watchdog to check every 100ms for being stuck, but keep shutdown after 3 * interval
  for "action" == 9 (malloc)  keep old behavior and only output every interval
storage/ndb/src/mgmsrv/ConfigInfo.cpp:
  add new configuration parameter TimeBetweenWatchDogCheckInitial
This commit is contained in:
unknown 2007-06-05 17:06:33 +02:00
parent 1a166bc4c9
commit 1182b801d4
10 changed files with 149 additions and 71 deletions

View file

@ -81,6 +81,8 @@
#define CFG_DB_BACKUP_WRITE_SIZE 136
#define CFG_DB_BACKUP_MAX_WRITE_SIZE 139
#define CFG_DB_WATCHDOG_INTERVAL_INITIAL 141
#define CFG_LOG_DESTINATION 147
#define CFG_DB_DISCLESS 148

View file

@ -37,9 +37,6 @@ NDB_TICKS NdbTick_CurrentMillisecond(void);
*/
int NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros);
/*#define TIME_MEASUREMENT*/
#ifdef TIME_MEASUREMENT
struct MicroSecondTimer {
NDB_TICKS seconds;
NDB_TICKS micro_seconds;
@ -54,7 +51,6 @@ struct MicroSecondTimer {
NDB_TICKS NdbTick_getMicrosPassed(struct MicroSecondTimer start,
struct MicroSecondTimer stop);
int NdbTick_getMicroTimer(struct MicroSecondTimer* time_now);
#endif
#ifdef __cplusplus
}

View file

@ -15,7 +15,7 @@
#include <ndb_global.h>
#include "NdbTick.h"
#include <NdbTick.h>
#define NANOSEC_PER_SEC 1000000000
#define MICROSEC_PER_SEC 1000000
@ -71,7 +71,6 @@ NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros){
}
#endif
#ifdef TIME_MEASUREMENT
int
NdbTick_getMicroTimer(struct MicroSecondTimer* input_timer)
{
@ -102,4 +101,3 @@ NdbTick_getMicrosPassed(struct MicroSecondTimer start,
}
return ret_value;
}
#endif

View file

@ -277,6 +277,14 @@ void Ndbcntr::execSTTOR(Signal* signal)
break;
case ZSTART_PHASE_1:
jam();
{
Uint32 db_watchdog_interval = 0;
const ndb_mgm_configuration_iterator * p =
m_ctx.m_config.getOwnConfigIterator();
ndb_mgm_get_int_parameter(p, CFG_DB_WATCHDOG_INTERVAL, &db_watchdog_interval);
ndbrequire(db_watchdog_interval);
update_watch_dog_timer(db_watchdog_interval);
}
startPhase1Lab(signal);
break;
case ZSTART_PHASE_2:

View file

@ -443,6 +443,11 @@ Configuration::setupConfiguration(){
"TimeBetweenWatchDogCheck missing");
}
if(iter.get(CFG_DB_WATCHDOG_INTERVAL_INITIAL, &_timeBetweenWatchDogCheckInitial)){
ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, "Invalid configuration fetched",
"TimeBetweenWatchDogCheckInitial missing");
}
/**
* Get paths
*/
@ -462,9 +467,12 @@ Configuration::setupConfiguration(){
* Create the watch dog thread
*/
{
Uint32 t = _timeBetweenWatchDogCheck;
if (_timeBetweenWatchDogCheckInitial < _timeBetweenWatchDogCheck)
_timeBetweenWatchDogCheckInitial = _timeBetweenWatchDogCheck;
Uint32 t = _timeBetweenWatchDogCheckInitial;
t = globalEmulatorData.theWatchDog ->setCheckInterval(t);
_timeBetweenWatchDogCheck = t;
_timeBetweenWatchDogCheckInitial = t;
}
ConfigValues* cf = ConfigValuesFactory::extractCurrentSection(iter.m_config);

View file

@ -84,6 +84,7 @@ private:
Uint32 _maxErrorLogs;
Uint32 _lockPagesInMainMemory;
Uint32 _timeBetweenWatchDogCheck;
Uint32 _timeBetweenWatchDogCheckInitial;
ndb_mgm_configuration * m_ownConfig;
ndb_mgm_configuration * m_clusterConfig;

View file

@ -19,6 +19,7 @@
#include <NdbOut.hpp>
#include <GlobalData.hpp>
#include <Emulator.hpp>
#include <WatchDog.hpp>
#include <ErrorHandlingMacros.hpp>
#include <TimeQueue.hpp>
#include <TransporterRegistry.hpp>
@ -662,7 +663,7 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U
void * p = NULL;
size_t size = n*s;
Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s);
refresh_watch_dog();
refresh_watch_dog(9);
if (real_size > 0){
#ifdef VM_TRACE_MEM
ndbout_c("%s::allocRecord(%s, %u, %u) = %llu bytes",
@ -696,12 +697,12 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U
char * ptr = (char*)p;
const Uint32 chunk = 128 * 1024;
while(size > chunk){
refresh_watch_dog();
refresh_watch_dog(9);
memset(ptr, 0, chunk);
ptr += chunk;
size -= chunk;
}
refresh_watch_dog();
refresh_watch_dog(9);
memset(ptr, 0, size);
}
}
@ -720,9 +721,16 @@ SimulatedBlock::deallocRecord(void ** ptr,
}
void
SimulatedBlock::refresh_watch_dog()
SimulatedBlock::refresh_watch_dog(Uint32 place)
{
globalData.incrementWatchDogCounter(1);
globalData.incrementWatchDogCounter(place);
}
/**
 * Hand a new check interval to the global watchdog object.
 *
 * @param interval  passed unchanged to theWatchDog->setCheckInterval();
 *                  NOTE(review): presumably milliseconds, like the
 *                  TimeBetweenWatchDogCheck setting - confirm.
 *
 * Any value returned by setCheckInterval() is ignored here.
 */
void
SimulatedBlock::update_watch_dog_timer(Uint32 interval)
{
/* Declared locally; the object itself is defined outside this function. */
extern EmulatorData globalEmulatorData;
globalEmulatorData.theWatchDog->setCheckInterval(interval);
}
void

View file

@ -334,7 +334,8 @@ protected:
* Refresh Watch Dog in initialising code
*
*/
void refresh_watch_dog();
void refresh_watch_dog(Uint32 place = 1);
void update_watch_dog_timer(Uint32 interval);
/**
* Prog error

View file

@ -25,6 +25,8 @@
#include <ErrorHandlingMacros.hpp>
#include <EventLogger.hpp>
#include <NdbTick.h>
extern EventLogger g_eventLogger;
extern "C"
@ -72,73 +74,115 @@ WatchDog::doStop(){
}
}
/**
 * Map an activity code to a human-readable description.
 *
 * @param IPValue  activity code; values 1 through 9 have a dedicated name
 * @return pointer to a constant string naming the activity, or
 *         "Unknown place" for every code outside 1..9 (including 0)
 */
const char *get_action(Uint32 IPValue)
{
  /* Entry i describes activity code i + 1. */
  static const char * const known_actions[] = {
    "Job Handling",                /* 1 */
    "Scanning Timers",             /* 2 */
    "External I/O",                /* 3 */
    "Print Job Buffers at crash",  /* 4 */
    "Checking connections",        /* 5 */
    "Performing Send",             /* 6 */
    "Polling for Receive",         /* 7 */
    "Performing Receive",          /* 8 */
    "Allocating memory"            /* 9 */
  };
  const Uint32 known_count = sizeof(known_actions) / sizeof(known_actions[0]);

  if (IPValue >= 1 && IPValue <= known_count)
    return known_actions[IPValue - 1];

  return "Unknown place";
}
void
WatchDog::run(){
unsigned int anIPValue;
unsigned int alerts = 0;
WatchDog::run()
{
unsigned int anIPValue, sleep_time;
unsigned int oldIPValue = 0;
unsigned int theIntervalCheck = theInterval;
struct MicroSecondTimer start_time, last_time, now;
NdbTick_getMicroTimer(&start_time);
last_time = start_time;
// WatchDog for the single threaded NDB
while(!theStop){
Uint32 tmp = theInterval / 500;
tmp= (tmp ? tmp : 1);
while(!theStop && tmp > 0){
NdbSleep_MilliSleep(500);
tmp--;
}
while (!theStop)
{
sleep_time= 100;
NdbSleep_MilliSleep(sleep_time);
if(theStop)
break;
NdbTick_getMicroTimer(&now);
if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
{
struct tms my_tms;
times(&my_tms);
g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
(Uint64)my_tms.tms_utime,
(Uint64)my_tms.tms_stime);
g_eventLogger.warning("Watchdog: Warning overslept %u ms, expected %u ms.",
NdbTick_getMicrosPassed(last_time, now)/1000,
sleep_time);
}
last_time = now;
// Verify that the IP thread is not stuck in a loop
anIPValue = *theIPValue;
if(anIPValue != 0) {
if (anIPValue != 0)
{
oldIPValue = anIPValue;
globalData.incrementWatchDogCounter(0);
alerts = 0;
} else {
const char *last_stuck_action;
alerts++;
switch (oldIPValue) {
case 1:
last_stuck_action = "Job Handling";
break;
case 2:
last_stuck_action = "Scanning Timers";
break;
case 3:
last_stuck_action = "External I/O";
break;
case 4:
last_stuck_action = "Print Job Buffers at crash";
break;
case 5:
last_stuck_action = "Checking connections";
break;
case 6:
last_stuck_action = "Performing Send";
break;
case 7:
last_stuck_action = "Polling for Receive";
break;
case 8:
last_stuck_action = "Performing Receive";
break;
default:
last_stuck_action = "Unknown place";
break;
}//switch
g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
NdbTick_getMicroTimer(&start_time);
theIntervalCheck = theInterval;
}
else
{
int warn = 1;
Uint32 elapsed = NdbTick_getMicrosPassed(start_time, now)/1000;
/*
oldIPValue == 9 indicates malloc going on, this can take some time
so only warn if we pass the watchdog interval
*/
if (oldIPValue == 9)
if (elapsed < theIntervalCheck)
warn = 0;
else
theIntervalCheck += theInterval;
if (warn)
{
struct tms my_tms;
times(&my_tms);
g_eventLogger.info("User time: %llu System time: %llu",
(Uint64)my_tms.tms_utime,
(Uint64)my_tms.tms_stime);
}
if(alerts == 3){
shutdownSystem(last_stuck_action);
const char *last_stuck_action = get_action(oldIPValue);
g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
{
struct tms my_tms;
times(&my_tms);
g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
(Uint64)my_tms.tms_utime,
(Uint64)my_tms.tms_stime);
}
if (elapsed > 3 * theInterval)
{
shutdownSystem(last_stuck_action);
}
}
}
}

View file

@ -571,6 +571,18 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
"70",
STR_VALUE(MAX_INT_RNIL) },
{
CFG_DB_WATCHDOG_INTERVAL_INITIAL,
"TimeBetweenWatchDogCheckInitial",
DB_TOKEN,
"Time between execution checks inside a database node in the early start phases when memory is allocated",
ConfigInfo::CI_USED,
true,
ConfigInfo::CI_INT,
"6000",
"70",
STR_VALUE(MAX_INT_RNIL) },
{
CFG_DB_STOP_ON_ERROR,
"StopOnError",