mirror of
https://github.com/MariaDB/server.git
synced 2025-01-17 12:32:27 +01:00
ndb - bug#29331 (51)
Add better handling of GCP stop: only kill the "offending" node. storage/ndb/src/kernel/blocks/ERROR_codes.txt: add new error codes; storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp: add better GCP stop handling; storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: add better GCP stop handling.
This commit is contained in:
parent
ea88a770ea
commit
b4199e9106
3 changed files with 167 additions and 38 deletions
|
@ -5,7 +5,7 @@ Next DBACC 3002
|
|||
Next DBTUP 4029
|
||||
Next DBLQH 5045
|
||||
Next DBDICT 6007
|
||||
Next DBDIH 7183
|
||||
Next DBDIH 7186
|
||||
Next DBTC 8040
|
||||
Next CMVMI 9000
|
||||
Next BACKUP 10038
|
||||
|
@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs
|
|||
|
||||
7180: Crash master during master-take-over in execMASTER_LCPCONF
|
||||
|
||||
7184: Crash before starting next GCP after a node failure
|
||||
|
||||
7185: Don't reply to COPY_GCI_REQ where reason == GCP
|
||||
|
||||
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
|
||||
-----------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -899,7 +899,7 @@ private:
|
|||
void ndbsttorry10Lab(Signal *, Uint32 _line);
|
||||
void createMutexes(Signal* signal, Uint32 no);
|
||||
void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
|
||||
void crashSystemAtGcpStop(Signal *);
|
||||
void crashSystemAtGcpStop(Signal *, bool);
|
||||
void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
|
||||
void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
|
||||
void GCP_SAVEhandling(Signal *, Uint32 nodeId);
|
||||
|
|
|
@ -747,6 +747,13 @@ done:
|
|||
}
|
||||
ndbrequire(ok);
|
||||
|
||||
|
||||
if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
|
||||
{
|
||||
jam();
|
||||
return;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------- */
|
||||
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
|
||||
/* ----------------------------------------------------------------------- */
|
||||
|
@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
|
|||
CLEAR_ERROR_INSERT_VALUE;
|
||||
}
|
||||
|
||||
if (ERROR_INSERTED(7184))
|
||||
{
|
||||
SET_ERROR_INSERT_VALUE(7000);
|
||||
}
|
||||
|
||||
/*-------------------------------------------------------------------------*/
|
||||
// The first step is to convert from a bit mask to an array of failed nodes.
|
||||
/*-------------------------------------------------------------------------*/
|
||||
|
@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
|
|||
g_eventLogger.error("System crash due to GCP Stop in state = %u",
|
||||
(Uint32) cgcpStatus);
|
||||
#endif
|
||||
crashSystemAtGcpStop(signal);
|
||||
crashSystemAtGcpStop(signal, false);
|
||||
return;
|
||||
}//if
|
||||
} else {
|
||||
|
@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
|
|||
g_eventLogger.error("System crash due to GCP Stop in state = %u",
|
||||
(Uint32) cgcpStatus);
|
||||
#endif
|
||||
crashSystemAtGcpStop(signal);
|
||||
crashSystemAtGcpStop(signal, false);
|
||||
return;
|
||||
}//if
|
||||
} else {
|
||||
|
@ -11117,37 +11129,128 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
|
|||
* GCP stop detected,
|
||||
* send SYSTEM_ERROR to all other alive nodes
|
||||
*/
|
||||
void Dbdih::crashSystemAtGcpStop(Signal* signal)
|
||||
void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
|
||||
{
|
||||
if (local)
|
||||
goto dolocal;
|
||||
|
||||
switch(cgcpStatus){
|
||||
case GCP_PREPARE_SENT:
|
||||
{
|
||||
jam();
|
||||
/**
|
||||
* We're waiting for a GCP PREPARE CONF
|
||||
*/
|
||||
infoEvent("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_GCP_PREPARE_Counter.getText());
|
||||
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_GCP_PREPARE_Counter.getText());
|
||||
|
||||
{
|
||||
NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
|
||||
signal->theData[0] = 7022;
|
||||
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
|
||||
}
|
||||
|
||||
{
|
||||
NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
|
||||
SystemError * const sysErr = (SystemError*)&signal->theData[0];
|
||||
sysErr->errorCode = SystemError::GCPStopDetected;
|
||||
sysErr->errorRef = reference();
|
||||
sysErr->data1 = cgcpStatus;
|
||||
sysErr->data2 = cgcpOrderBlocked;
|
||||
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
|
||||
SystemError::SignalLength, JBA);
|
||||
}
|
||||
ndbrequire(!c_GCP_PREPARE_Counter.done());
|
||||
return;
|
||||
}
|
||||
case GCP_COMMIT_SENT:
|
||||
{
|
||||
jam();
|
||||
/**
|
||||
* We're waiting for a GCP_NODEFINISH
|
||||
*/
|
||||
infoEvent("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_GCP_COMMIT_Counter.getText());
|
||||
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_GCP_COMMIT_Counter.getText());
|
||||
|
||||
{
|
||||
NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
|
||||
signal->theData[0] = 7022;
|
||||
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
|
||||
}
|
||||
|
||||
{
|
||||
NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
|
||||
SystemError * const sysErr = (SystemError*)&signal->theData[0];
|
||||
sysErr->errorCode = SystemError::GCPStopDetected;
|
||||
sysErr->errorRef = reference();
|
||||
sysErr->data1 = cgcpStatus;
|
||||
sysErr->data2 = cgcpOrderBlocked;
|
||||
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
|
||||
SystemError::SignalLength, JBA);
|
||||
}
|
||||
ndbrequire(!c_GCP_COMMIT_Counter.done());
|
||||
return;
|
||||
}
|
||||
case GCP_NODE_FINISHED:
|
||||
{
|
||||
jam();
|
||||
/**
|
||||
* We're waiting for a GCP save conf
|
||||
*/
|
||||
ndbrequire(!c_GCP_SAVEREQ_Counter.done());
|
||||
NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
|
||||
signal->theData[0] = 2305;
|
||||
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
|
||||
|
||||
infoEvent("Detected GCP stop...sending kill to %s",
|
||||
c_GCP_SAVEREQ_Counter.getText());
|
||||
g_eventLogger.error("Detected GCP stop...sending kill to %s",
|
||||
c_GCP_SAVEREQ_Counter.getText());
|
||||
infoEvent("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
|
||||
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
|
||||
ndbrequire(!c_GCP_SAVEREQ_Counter.done());
|
||||
return;
|
||||
}
|
||||
case GCP_SAVE_LQH_FINISHED:
|
||||
g_eventLogger.error("m_copyReason: %d m_waiting: %d",
|
||||
c_copyGCIMaster.m_copyReason,
|
||||
c_copyGCIMaster.m_waiting);
|
||||
break;
|
||||
case GCP_READY: // shut up lint
|
||||
case GCP_PREPARE_SENT:
|
||||
case GCP_COMMIT_SENT:
|
||||
break;
|
||||
{
|
||||
jam();
|
||||
/**
|
||||
* We're waiting for a COPY_GCICONF
|
||||
*/
|
||||
infoEvent("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_COPY_GCIREQ_Counter.getText());
|
||||
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
|
||||
cgcpStatus, c_COPY_GCIREQ_Counter.getText());
|
||||
|
||||
{
|
||||
NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
|
||||
signal->theData[0] = 7022;
|
||||
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
|
||||
}
|
||||
|
||||
g_eventLogger.error("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
|
||||
{
|
||||
NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
|
||||
SystemError * const sysErr = (SystemError*)&signal->theData[0];
|
||||
sysErr->errorCode = SystemError::GCPStopDetected;
|
||||
sysErr->errorRef = reference();
|
||||
sysErr->data1 = cgcpStatus;
|
||||
sysErr->data2 = cgcpOrderBlocked;
|
||||
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
|
||||
SystemError::SignalLength, JBA);
|
||||
}
|
||||
ndbrequire(!c_COPY_GCIREQ_Counter.done());
|
||||
return;
|
||||
}
|
||||
case GCP_READY: (void)1;
|
||||
}
|
||||
|
||||
dolocal:
|
||||
ndbout_c("m_copyReason: %d m_waiting: %d",
|
||||
c_copyGCIMaster.m_copyReason,
|
||||
c_copyGCIMaster.m_waiting);
|
||||
|
||||
ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
|
||||
c_copyGCISlave.m_senderData,
|
||||
c_copyGCISlave.m_senderRef,
|
||||
c_copyGCISlave.m_copyReason,
|
||||
|
@ -11202,6 +11305,9 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
|
|||
c_TCGETOPSIZEREQ_Counter.getText());
|
||||
ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText());
|
||||
|
||||
if (local == false)
|
||||
{
|
||||
jam();
|
||||
NodeRecordPtr nodePtr;
|
||||
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
|
||||
jam();
|
||||
|
@ -11219,6 +11325,19 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
|
|||
SystemError::SignalLength, JBA);
|
||||
}//if
|
||||
}//for
|
||||
}
|
||||
else
|
||||
{
|
||||
jam();
|
||||
SystemError * const sysErr = (SystemError*)&signal->theData[0];
|
||||
sysErr->errorCode = SystemError::GCPStopDetected;
|
||||
sysErr->errorRef = reference();
|
||||
sysErr->data1 = cgcpStatus;
|
||||
sysErr->data2 = cgcpOrderBlocked;
|
||||
EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR,
|
||||
signal, SystemError::SignalLength);
|
||||
ndbrequire(false);
|
||||
}
|
||||
return;
|
||||
}//Dbdih::crashSystemAtGcpStop()
|
||||
|
||||
|
@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
|
|||
infoEvent(buf);
|
||||
}
|
||||
}
|
||||
|
||||
if (arg == 7022)
|
||||
{
|
||||
jam();
|
||||
crashSystemAtGcpStop(signal, true);
|
||||
}
|
||||
}//Dbdih::execDUMP_STATE_ORD()
|
||||
|
||||
void
|
||||
|
|
Loading…
Reference in a new issue