mirror of
https://github.com/MariaDB/server.git
synced 2026-05-09 16:44:29 +02:00
ndb - bug#16772
Don't allow a node to join the cluster until all nodes have completed failure handling. ndb/src/kernel/blocks/qmgr/QmgrMain.cpp: When getting CM_ADD for a node for which I haven't completed failure handling, do _not_ just override; instead set state and send CM_ACK_ADD on execCONNECT_REP (much later). ndb/test/ndbapi/testNodeRestart.cpp: Test case for bug#16772. ndb/test/run-test/daily-basic-tests.txt: Run the test in the basic suite.
This commit is contained in:
parent
6ac6b08c41
commit
3bfaf33392
3 changed files with 141 additions and 14 deletions
|
|
@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout)
|
|||
|
||||
void Qmgr::execCONNECT_REP(Signal* signal)
|
||||
{
|
||||
jamEntry();
|
||||
const Uint32 nodeId = signal->theData[0];
|
||||
c_connectedNodes.set(nodeId);
|
||||
NodeRecPtr nodePtr;
|
||||
|
|
@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal)
|
|||
ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
|
||||
switch(nodePtr.p->phase){
|
||||
case ZSTARTING:
|
||||
jam();
|
||||
break;
|
||||
case ZRUNNING:
|
||||
jam();
|
||||
if(!c_start.m_nodes.isWaitingFor(nodeId)){
|
||||
jam();
|
||||
return;
|
||||
}
|
||||
break;
|
||||
case ZPREPARE_FAIL:
|
||||
case ZFAIL_CLOSING:
|
||||
jam();
|
||||
|
|
@ -277,21 +282,28 @@ void Qmgr::execCONNECT_REP(Signal* signal)
|
|||
case ZAPI_INACTIVE:
|
||||
return;
|
||||
}
|
||||
|
||||
if(!c_start.m_nodes.isWaitingFor(nodeId)){
|
||||
jam();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
switch(c_start.m_gsn){
|
||||
case GSN_CM_REGREQ:
|
||||
jam();
|
||||
sendCmRegReq(signal, nodeId);
|
||||
return;
|
||||
case GSN_CM_NODEINFOREQ:{
|
||||
case GSN_CM_NODEINFOREQ:
|
||||
jam();
|
||||
sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
|
||||
return;
|
||||
case GSN_CM_ADD:{
|
||||
jam();
|
||||
|
||||
ndbrequire(getOwnNodeId() != cpresident);
|
||||
c_start.m_nodes.clearWaitingFor(nodeId);
|
||||
c_start.m_gsn = RNIL;
|
||||
|
||||
NodeRecPtr addNodePtr;
|
||||
addNodePtr.i = nodeId;
|
||||
ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
|
||||
cmAddPrepare(signal, addNodePtr, nodePtr.p);
|
||||
return;
|
||||
}
|
||||
default:
|
||||
return;
|
||||
|
|
@ -924,15 +936,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){
|
|||
return;
|
||||
case ZFAIL_CLOSING:
|
||||
jam();
|
||||
#ifdef VM_TRACE
|
||||
ndbout_c("Enabling communication to CM_ADD node state=%d",
|
||||
nodePtr.p->phase);
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
warningEvent("Recieved request to incorperate node %u, "
|
||||
"while error handling has not yet completed",
|
||||
nodePtr.i);
|
||||
|
||||
ndbrequire(getOwnNodeId() != cpresident);
|
||||
ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
|
||||
c_start.m_nodes.clearWaitingFor();
|
||||
c_start.m_nodes.setWaitingFor(nodePtr.i);
|
||||
c_start.m_gsn = GSN_CM_ADD;
|
||||
#else
|
||||
warningEvent("Enabling communication to CM_ADD node %u state=%d",
|
||||
nodePtr.i,
|
||||
nodePtr.p->phase);
|
||||
nodePtr.p->phase = ZSTARTING;
|
||||
nodePtr.p->failState = NORMAL;
|
||||
signal->theData[0] = 0;
|
||||
signal->theData[1] = nodePtr.i;
|
||||
sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
|
||||
#endif
|
||||
return;
|
||||
case ZSTARTING:
|
||||
break;
|
||||
|
|
@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal)
|
|||
|
||||
jamEntry();
|
||||
failedNodePtr.i = signal->theData[0];
|
||||
|
||||
if (ERROR_INSERTED(930))
|
||||
{
|
||||
CLEAR_ERROR_INSERT_VALUE;
|
||||
infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
|
||||
return;
|
||||
}
|
||||
|
||||
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
|
||||
if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
|
||||
failedNodePtr.p->failState = NORMAL;
|
||||
} else {
|
||||
jam();
|
||||
|
||||
char buf[100];
|
||||
BaseString::snprintf(buf, 100,
|
||||
"Received NDB_FAILCONF for node %u with state: %d %d",
|
||||
failedNodePtr.i,
|
||||
failedNodePtr.p->phase,
|
||||
failedNodePtr.p->failState);
|
||||
progError(__LINE__, 0, buf);
|
||||
systemErrorLab(signal, __LINE__);
|
||||
}//if
|
||||
if (cpresident == getOwnNodeId()) {
|
||||
|
|
@ -2077,10 +2117,42 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
|
|||
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
|
||||
if (failedNodePtr.i == getOwnNodeId()) {
|
||||
jam();
|
||||
systemErrorLab(signal, __LINE__);
|
||||
|
||||
const char * msg = 0;
|
||||
switch(aFailCause){
|
||||
case FailRep::ZOWN_FAILURE:
|
||||
msg = "Own failure";
|
||||
break;
|
||||
case FailRep::ZOTHER_NODE_WHEN_WE_START:
|
||||
case FailRep::ZOTHERNODE_FAILED_DURING_START:
|
||||
msg = "Other node died during start";
|
||||
break;
|
||||
case FailRep::ZIN_PREP_FAIL_REQ:
|
||||
msg = "Prep fail";
|
||||
break;
|
||||
case FailRep::ZSTART_IN_REGREQ:
|
||||
msg = "Start timeout";
|
||||
break;
|
||||
case FailRep::ZHEARTBEAT_FAILURE:
|
||||
msg = "Hearbeat failure";
|
||||
break;
|
||||
case FailRep::ZLINK_FAILURE:
|
||||
msg = "Connection failure";
|
||||
break;
|
||||
}
|
||||
|
||||
char buf[100];
|
||||
BaseString::snprintf(buf, 100,
|
||||
"We(%u) have been declared dead by %u reason: %s(%u)",
|
||||
getOwnNodeId(),
|
||||
refToNode(signal->getSendersBlockRef()),
|
||||
aFailCause,
|
||||
msg ? msg : "<Unknown>");
|
||||
|
||||
progError(__LINE__, 0, buf);
|
||||
return;
|
||||
}//if
|
||||
|
||||
|
||||
myNodePtr.i = getOwnNodeId();
|
||||
ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
|
||||
if (myNodePtr.p->phase != ZRUNNING) {
|
||||
|
|
@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal,
|
|||
cfailureNr = cprepareFailureNr;
|
||||
ctoFailureNr = 0;
|
||||
ctoStatus = Q_ACTIVE;
|
||||
c_start.reset(); // Don't take over nodes being started
|
||||
if (cnoCommitFailedNodes > 0) {
|
||||
jam();
|
||||
/**-----------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -535,6 +535,52 @@ err:
|
|||
return NDBT_FAILED;
|
||||
}
|
||||
|
||||
int
|
||||
runBug16772(NDBT_Context* ctx, NDBT_Step* step){
|
||||
|
||||
NdbRestarter restarter;
|
||||
if (restarter.getNumDbNodes() < 2)
|
||||
{
|
||||
ctx->stopTest();
|
||||
return NDBT_OK;
|
||||
}
|
||||
|
||||
int aliveNodeId = restarter.getRandomNotMasterNodeId(rand());
|
||||
int deadNodeId = aliveNodeId;
|
||||
while (deadNodeId == aliveNodeId)
|
||||
deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes());
|
||||
|
||||
if (restarter.insertErrorInNode(aliveNodeId, 930))
|
||||
return NDBT_FAILED;
|
||||
|
||||
if (restarter.restartOneDbNode(deadNodeId,
|
||||
/** initial */ false,
|
||||
/** nostart */ true,
|
||||
/** abort */ true))
|
||||
return NDBT_FAILED;
|
||||
|
||||
if (restarter.waitNodesNoStart(&deadNodeId, 1))
|
||||
return NDBT_FAILED;
|
||||
|
||||
if (restarter.startNodes(&deadNodeId, 1))
|
||||
return NDBT_FAILED;
|
||||
|
||||
// It should now be hanging since we throw away NDB_FAILCONF
|
||||
int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10);
|
||||
// So this should fail...i.e it should not reach startphase 3
|
||||
|
||||
// Now send a NDB_FAILCONF for deadNo
|
||||
int dump[] = { 7020, 323, 252, 0 };
|
||||
dump[3] = deadNodeId;
|
||||
if (restarter.dumpStateOneNode(aliveNodeId, dump, 4))
|
||||
return NDBT_FAILED;
|
||||
|
||||
if (restarter.waitNodesStarted(&deadNodeId, 1))
|
||||
return NDBT_FAILED;
|
||||
|
||||
return ret ? NDBT_OK : NDBT_FAILED;
|
||||
}
|
||||
|
||||
|
||||
NDBT_TESTSUITE(testNodeRestart);
|
||||
TESTCASE("NoLoad",
|
||||
|
|
@ -820,6 +866,10 @@ TESTCASE("Bug15685",
|
|||
STEP(runBug15685);
|
||||
FINALIZER(runClearTable);
|
||||
}
|
||||
// Registers test case "Bug16772" in the testNodeRestart suite. It consists
// of a single step, runBug16772, and declares no initializer or finalizer.
TESTCASE("Bug16772",
|
||||
"Test bug with restarting before NF handling is complete"){
|
||||
STEP(runBug16772);
|
||||
}
|
||||
NDBT_TESTSUITE_END(testNodeRestart);
|
||||
|
||||
int main(int argc, const char** argv){
|
||||
|
|
|
|||
|
|
@ -446,6 +446,10 @@ max-time: 500
|
|||
cmd: testNodeRestart
|
||||
args: -n Bug15685 T1
|
||||
|
||||
max-time: 500
|
||||
cmd: testNodeRestart
|
||||
args: -n Bug16772 T1
|
||||
|
||||
# OLD FLEX
|
||||
max-time: 500
|
||||
cmd: flexBench
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue