mirror of
https://github.com/MariaDB/server.git
synced 2025-01-16 20:12:31 +01:00
Bug #28899 not possible to set separate watchdog timeout at startup
storage/ndb/include/mgmapi/mgmapi_config_parameters.h: add new configuration parameter TimeBetweenWatchDogCheckInitial storage/ndb/include/portlib/NdbTick.h: enable timing code storage/ndb/src/common/portlib/NdbTick.c: enable timing code storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp: read watchdog timeout to set it after malloc storage/ndb/src/kernel/vm/Configuration.cpp: read initial watchdog timeout and set it in the beginning storage/ndb/src/kernel/vm/Configuration.hpp: read initial watchdog timeout and set it in the beginning storage/ndb/src/kernel/vm/SimulatedBlock.cpp: introduce new state for "action" malloc of memory storage/ndb/src/kernel/vm/SimulatedBlock.hpp: introduce new state for "action" malloc of memory storage/ndb/src/kernel/vm/WatchDog.cpp: rewrite watchdog to check every 100ms for being stuck, but keep shutdown after 3 * interval; for "action" == 9 (malloc) keep old behavior and only output every interval storage/ndb/src/mgmsrv/ConfigInfo.cpp: add new configuration parameter TimeBetweenWatchDogCheckInitial
This commit is contained in:
parent
1a166bc4c9
commit
1182b801d4
10 changed files with 149 additions and 71 deletions
|
@ -81,6 +81,8 @@
|
|||
#define CFG_DB_BACKUP_WRITE_SIZE 136
|
||||
#define CFG_DB_BACKUP_MAX_WRITE_SIZE 139
|
||||
|
||||
#define CFG_DB_WATCHDOG_INTERVAL_INITIAL 141
|
||||
|
||||
#define CFG_LOG_DESTINATION 147
|
||||
|
||||
#define CFG_DB_DISCLESS 148
|
||||
|
|
|
@ -37,9 +37,6 @@ NDB_TICKS NdbTick_CurrentMillisecond(void);
|
|||
*/
|
||||
int NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros);
|
||||
|
||||
/*#define TIME_MEASUREMENT*/
|
||||
#ifdef TIME_MEASUREMENT
|
||||
|
||||
struct MicroSecondTimer {
|
||||
NDB_TICKS seconds;
|
||||
NDB_TICKS micro_seconds;
|
||||
|
@ -54,7 +51,6 @@ struct MicroSecondTimer {
|
|||
NDB_TICKS NdbTick_getMicrosPassed(struct MicroSecondTimer start,
|
||||
struct MicroSecondTimer stop);
|
||||
int NdbTick_getMicroTimer(struct MicroSecondTimer* time_now);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
|
||||
#include <ndb_global.h>
|
||||
#include "NdbTick.h"
|
||||
#include <NdbTick.h>
|
||||
|
||||
#define NANOSEC_PER_SEC 1000000000
|
||||
#define MICROSEC_PER_SEC 1000000
|
||||
|
@ -71,7 +71,6 @@ NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros){
|
|||
}
|
||||
|
||||
#endif
|
||||
#ifdef TIME_MEASUREMENT
|
||||
int
|
||||
NdbTick_getMicroTimer(struct MicroSecondTimer* input_timer)
|
||||
{
|
||||
|
@ -102,4 +101,3 @@ NdbTick_getMicrosPassed(struct MicroSecondTimer start,
|
|||
}
|
||||
return ret_value;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -277,6 +277,14 @@ void Ndbcntr::execSTTOR(Signal* signal)
|
|||
break;
|
||||
case ZSTART_PHASE_1:
|
||||
jam();
|
||||
{
|
||||
Uint32 db_watchdog_interval = 0;
|
||||
const ndb_mgm_configuration_iterator * p =
|
||||
m_ctx.m_config.getOwnConfigIterator();
|
||||
ndb_mgm_get_int_parameter(p, CFG_DB_WATCHDOG_INTERVAL, &db_watchdog_interval);
|
||||
ndbrequire(db_watchdog_interval);
|
||||
update_watch_dog_timer(db_watchdog_interval);
|
||||
}
|
||||
startPhase1Lab(signal);
|
||||
break;
|
||||
case ZSTART_PHASE_2:
|
||||
|
|
|
@ -443,6 +443,11 @@ Configuration::setupConfiguration(){
|
|||
"TimeBetweenWatchDogCheck missing");
|
||||
}
|
||||
|
||||
if(iter.get(CFG_DB_WATCHDOG_INTERVAL_INITIAL, &_timeBetweenWatchDogCheckInitial)){
|
||||
ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, "Invalid configuration fetched",
|
||||
"TimeBetweenWatchDogCheckInitial missing");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get paths
|
||||
*/
|
||||
|
@ -462,9 +467,12 @@ Configuration::setupConfiguration(){
|
|||
* Create the watch dog thread
|
||||
*/
|
||||
{
|
||||
Uint32 t = _timeBetweenWatchDogCheck;
|
||||
if (_timeBetweenWatchDogCheckInitial < _timeBetweenWatchDogCheck)
|
||||
_timeBetweenWatchDogCheckInitial = _timeBetweenWatchDogCheck;
|
||||
|
||||
Uint32 t = _timeBetweenWatchDogCheckInitial;
|
||||
t = globalEmulatorData.theWatchDog ->setCheckInterval(t);
|
||||
_timeBetweenWatchDogCheck = t;
|
||||
_timeBetweenWatchDogCheckInitial = t;
|
||||
}
|
||||
|
||||
ConfigValues* cf = ConfigValuesFactory::extractCurrentSection(iter.m_config);
|
||||
|
|
|
@ -84,6 +84,7 @@ private:
|
|||
Uint32 _maxErrorLogs;
|
||||
Uint32 _lockPagesInMainMemory;
|
||||
Uint32 _timeBetweenWatchDogCheck;
|
||||
Uint32 _timeBetweenWatchDogCheckInitial;
|
||||
|
||||
ndb_mgm_configuration * m_ownConfig;
|
||||
ndb_mgm_configuration * m_clusterConfig;
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <NdbOut.hpp>
|
||||
#include <GlobalData.hpp>
|
||||
#include <Emulator.hpp>
|
||||
#include <WatchDog.hpp>
|
||||
#include <ErrorHandlingMacros.hpp>
|
||||
#include <TimeQueue.hpp>
|
||||
#include <TransporterRegistry.hpp>
|
||||
|
@ -662,7 +663,7 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U
|
|||
void * p = NULL;
|
||||
size_t size = n*s;
|
||||
Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s);
|
||||
refresh_watch_dog();
|
||||
refresh_watch_dog(9);
|
||||
if (real_size > 0){
|
||||
#ifdef VM_TRACE_MEM
|
||||
ndbout_c("%s::allocRecord(%s, %u, %u) = %llu bytes",
|
||||
|
@ -696,12 +697,12 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U
|
|||
char * ptr = (char*)p;
|
||||
const Uint32 chunk = 128 * 1024;
|
||||
while(size > chunk){
|
||||
refresh_watch_dog();
|
||||
refresh_watch_dog(9);
|
||||
memset(ptr, 0, chunk);
|
||||
ptr += chunk;
|
||||
size -= chunk;
|
||||
}
|
||||
refresh_watch_dog();
|
||||
refresh_watch_dog(9);
|
||||
memset(ptr, 0, size);
|
||||
}
|
||||
}
|
||||
|
@ -720,9 +721,16 @@ SimulatedBlock::deallocRecord(void ** ptr,
|
|||
}
|
||||
|
||||
void
|
||||
SimulatedBlock::refresh_watch_dog()
|
||||
SimulatedBlock::refresh_watch_dog(Uint32 place)
|
||||
{
|
||||
globalData.incrementWatchDogCounter(1);
|
||||
globalData.incrementWatchDogCounter(place);
|
||||
}
|
||||
|
||||
/**
 * Pass a new check interval to the global watchdog by calling
 * setCheckInterval() on globalEmulatorData.theWatchDog.
 * Whatever setCheckInterval() returns is not used here.
 */
void
|
||||
SimulatedBlock::update_watch_dog_timer(Uint32 interval)
|
||||
{
|
||||
extern EmulatorData globalEmulatorData;
|
||||
globalEmulatorData.theWatchDog->setCheckInterval(interval);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@ -334,7 +334,8 @@ protected:
|
|||
* Refresh Watch Dog in initialising code
|
||||
*
|
||||
*/
|
||||
void refresh_watch_dog();
|
||||
void refresh_watch_dog(Uint32 place = 1);
|
||||
void update_watch_dog_timer(Uint32 interval);
|
||||
|
||||
/**
|
||||
* Prog error
|
||||
|
|
|
@ -25,6 +25,8 @@
|
|||
#include <ErrorHandlingMacros.hpp>
|
||||
#include <EventLogger.hpp>
|
||||
|
||||
#include <NdbTick.h>
|
||||
|
||||
extern EventLogger g_eventLogger;
|
||||
|
||||
extern "C"
|
||||
|
@ -72,73 +74,115 @@ WatchDog::doStop(){
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Translate a watchdog place value into a human-readable description.
 *
 * Values 1 through 9 each map to a fixed description (9 is
 * "Allocating memory"); every other value yields "Unknown place".
 * The returned pointer refers to a string literal, so the caller must
 * not modify or free it.
 */
const char *get_action(Uint32 IPValue)
|
||||
{
|
||||
const char *action;
|
||||
switch (IPValue) {
|
||||
case 1:
|
||||
action = "Job Handling";
|
||||
break;
|
||||
case 2:
|
||||
action = "Scanning Timers";
|
||||
break;
|
||||
case 3:
|
||||
action = "External I/O";
|
||||
break;
|
||||
case 4:
|
||||
action = "Print Job Buffers at crash";
|
||||
break;
|
||||
case 5:
|
||||
action = "Checking connections";
|
||||
break;
|
||||
case 6:
|
||||
action = "Performing Send";
|
||||
break;
|
||||
case 7:
|
||||
action = "Polling for Receive";
|
||||
break;
|
||||
case 8:
|
||||
action = "Performing Receive";
|
||||
break;
|
||||
case 9:
|
||||
action = "Allocating memory";
|
||||
break;
|
||||
default:
|
||||
action = "Unknown place";
|
||||
break;
|
||||
}//switch
|
||||
return action;
|
||||
}
|
||||
|
||||
void
|
||||
WatchDog::run(){
|
||||
unsigned int anIPValue;
|
||||
unsigned int alerts = 0;
|
||||
WatchDog::run()
|
||||
{
|
||||
unsigned int anIPValue, sleep_time;
|
||||
unsigned int oldIPValue = 0;
|
||||
|
||||
unsigned int theIntervalCheck = theInterval;
|
||||
struct MicroSecondTimer start_time, last_time, now;
|
||||
NdbTick_getMicroTimer(&start_time);
|
||||
last_time = start_time;
|
||||
|
||||
// WatchDog for the single threaded NDB
|
||||
while(!theStop){
|
||||
Uint32 tmp = theInterval / 500;
|
||||
tmp= (tmp ? tmp : 1);
|
||||
|
||||
while(!theStop && tmp > 0){
|
||||
NdbSleep_MilliSleep(500);
|
||||
tmp--;
|
||||
}
|
||||
|
||||
while (!theStop)
|
||||
{
|
||||
sleep_time= 100;
|
||||
|
||||
NdbSleep_MilliSleep(sleep_time);
|
||||
if(theStop)
|
||||
break;
|
||||
|
||||
NdbTick_getMicroTimer(&now);
|
||||
if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
|
||||
{
|
||||
struct tms my_tms;
|
||||
times(&my_tms);
|
||||
g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
|
||||
(Uint64)my_tms.tms_utime,
|
||||
(Uint64)my_tms.tms_stime);
|
||||
g_eventLogger.warning("Watchdog: Warning overslept %u ms, expected %u ms.",
|
||||
NdbTick_getMicrosPassed(last_time, now)/1000,
|
||||
sleep_time);
|
||||
}
|
||||
last_time = now;
|
||||
|
||||
// Verify that the IP thread is not stuck in a loop
|
||||
anIPValue = *theIPValue;
|
||||
if(anIPValue != 0) {
|
||||
if (anIPValue != 0)
|
||||
{
|
||||
oldIPValue = anIPValue;
|
||||
globalData.incrementWatchDogCounter(0);
|
||||
alerts = 0;
|
||||
} else {
|
||||
const char *last_stuck_action;
|
||||
alerts++;
|
||||
switch (oldIPValue) {
|
||||
case 1:
|
||||
last_stuck_action = "Job Handling";
|
||||
break;
|
||||
case 2:
|
||||
last_stuck_action = "Scanning Timers";
|
||||
break;
|
||||
case 3:
|
||||
last_stuck_action = "External I/O";
|
||||
break;
|
||||
case 4:
|
||||
last_stuck_action = "Print Job Buffers at crash";
|
||||
break;
|
||||
case 5:
|
||||
last_stuck_action = "Checking connections";
|
||||
break;
|
||||
case 6:
|
||||
last_stuck_action = "Performing Send";
|
||||
break;
|
||||
case 7:
|
||||
last_stuck_action = "Polling for Receive";
|
||||
break;
|
||||
case 8:
|
||||
last_stuck_action = "Performing Receive";
|
||||
break;
|
||||
default:
|
||||
last_stuck_action = "Unknown place";
|
||||
break;
|
||||
}//switch
|
||||
g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
|
||||
NdbTick_getMicroTimer(&start_time);
|
||||
theIntervalCheck = theInterval;
|
||||
}
|
||||
else
|
||||
{
|
||||
int warn = 1;
|
||||
Uint32 elapsed = NdbTick_getMicrosPassed(start_time, now)/1000;
|
||||
/*
|
||||
oldIPValue == 9 indicates malloc going on, this can take some time
|
||||
so only warn if we pass the watchdog interval
|
||||
*/
|
||||
if (oldIPValue == 9)
|
||||
if (elapsed < theIntervalCheck)
|
||||
warn = 0;
|
||||
else
|
||||
theIntervalCheck += theInterval;
|
||||
|
||||
if (warn)
|
||||
{
|
||||
struct tms my_tms;
|
||||
times(&my_tms);
|
||||
g_eventLogger.info("User time: %llu System time: %llu",
|
||||
(Uint64)my_tms.tms_utime,
|
||||
(Uint64)my_tms.tms_stime);
|
||||
}
|
||||
if(alerts == 3){
|
||||
shutdownSystem(last_stuck_action);
|
||||
const char *last_stuck_action = get_action(oldIPValue);
|
||||
g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
|
||||
{
|
||||
struct tms my_tms;
|
||||
times(&my_tms);
|
||||
g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
|
||||
(Uint64)my_tms.tms_utime,
|
||||
(Uint64)my_tms.tms_stime);
|
||||
}
|
||||
if (elapsed > 3 * theInterval)
|
||||
{
|
||||
shutdownSystem(last_stuck_action);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -571,6 +571,18 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
|
|||
"70",
|
||||
STR_VALUE(MAX_INT_RNIL) },
|
||||
|
||||
{
|
||||
CFG_DB_WATCHDOG_INTERVAL_INITIAL,
|
||||
"TimeBetweenWatchDogCheckInitial",
|
||||
DB_TOKEN,
|
||||
"Time between execution checks inside a database node in the early start phases when memory is allocated",
|
||||
ConfigInfo::CI_USED,
|
||||
true,
|
||||
ConfigInfo::CI_INT,
|
||||
"6000",
|
||||
"70",
|
||||
STR_VALUE(MAX_INT_RNIL) },
|
||||
|
||||
{
|
||||
CFG_DB_STOP_ON_ERROR,
|
||||
"StopOnError",
|
||||
|
|
Loading…
Reference in a new issue