diff --git a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index 37eb54028a6..3436a609fe7 100644 --- a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -637,6 +637,7 @@ private: void execTCGETOPSIZECONF(Signal *); void execTC_CLOPSIZECONF(Signal *); + int handle_invalid_lcp_no(const class LcpFragRep*, ReplicaRecordPtr); void execLCP_FRAG_REP(Signal *); void execLCP_COMPLETE_REP(Signal *); void execSTART_LCP_REQ(Signal *); diff --git a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 50c7c5472ba..f9b7eb9d100 100644 --- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -4046,6 +4046,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal) Uint32 newMasterId = nodeFail->masterNodeId; const Uint32 noOfFailedNodes = nodeFail->noOfNodes; + if (ERROR_INSERTED(7179)) + { + CLEAR_ERROR_INSERT_VALUE; + } + /*-------------------------------------------------------------------------*/ // The first step is to convert from a bit mask to an array of failed nodes. 
/*-------------------------------------------------------------------------*/
@@ -10256,12 +10261,42 @@ void Dbdih::execLCP_FRAG_REP(Signal* signal) Uint32 fragId = lcpReport->fragId; jamEntry(); + + if (ERROR_INSERTED(7178) && nodeId != getOwnNodeId()) + { + jam(); + Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups); + Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups); + if (owng == nodeg) + { + jam(); + ndbout_c("throwing away LCP_FRAG_REP from (and killing) %d", nodeId); + SET_ERROR_INSERT_VALUE(7179); + signal->theData[0] = 9999; + sendSignal(numberToRef(CMVMI, nodeId), + GSN_NDB_TAMPER, signal, 1, JBA); + return; + } + } + if (ERROR_INSERTED(7179) && nodeId != getOwnNodeId()) + { + jam(); + Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups); + Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups); + if (owng == nodeg) + { + jam(); + ndbout_c("throwing away LCP_FRAG_REP from %d", nodeId); + return; + } + } + CRASH_INSERTION2(7025, isMaster()); CRASH_INSERTION2(7016, !isMaster()); - + bool fromTimeQueue = (signal->senderBlockRef() == reference()); - + TabRecordPtr tabPtr; tabPtr.i = tableId; ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
@@ -10463,6 +10498,44 @@ void Dbdih::findReplica(ReplicaRecordPtr& replicaPtr, ndbrequire(false); }//Dbdih::findReplica() + +/** + * Non-master handling of an LCP_FRAG_REP whose lcpNo differs from the + * replica's nextLcp: adopt the reported slot, mark it invalid and require + * that no other slot holds a valid LCP with an id >= the reported lcpId. + * Always returns 0; the caller treats a non-zero result as fatal. + */ +int +Dbdih::handle_invalid_lcp_no(const LcpFragRep* rep, + ReplicaRecordPtr replicaPtr) +{ + ndbrequire(!isMaster()); + Uint32 lcpNo = rep->lcpNo; + Uint32 lcpId = rep->lcpId; + /* lcpNo indexes lcpId[]/lcpStatus[] and is where the slot walk below stops (assumes nextLcpNo() cycles over the stored slots) */ + ndbrequire(lcpNo < MAX_LCP_STORED); + + warningEvent("Detected previous node failure of %d during lcp", + rep->nodeId); + replicaPtr.p->nextLcp = lcpNo; + replicaPtr.p->lcpId[lcpNo] = 0; + replicaPtr.p->lcpStatus[lcpNo] = ZINVALID; + + /* Begin one slot past lcpNo; beginning at lcpNo itself made the loop body unreachable */ + for (Uint32 i = nextLcpNo(lcpNo); i != lcpNo; i = nextLcpNo(i)) + { + jam(); + if (replicaPtr.p->lcpStatus[i] == ZVALID && + replicaPtr.p->lcpId[i] >= lcpId) + { + ndbout_c("i: %d lcpId: %d", i, replicaPtr.p->lcpId[i]); + ndbrequire(false); + } + } + + return 0; +} + /** * Return true if table is all fragment replicas have been checkpointed * to disk (in all LQHs)
@@ -10491,9 +10564,12 @@ Dbdih::reportLcpCompletion(const LcpFragRep* lcpReport) ndbrequire(replicaPtr.p->lcpOngoingFlag == true); if(lcpNo != replicaPtr.p->nextLcp){ - ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d", - lcpNo, replicaPtr.p->nextLcp); - ndbrequire(false); + if (handle_invalid_lcp_no(lcpReport, replicaPtr)) + { + ndbout_c("lcpNo = %d replicaPtr.p->nextLcp = %d", + lcpNo, replicaPtr.p->nextLcp); + ndbrequire(false); + } } ndbrequire(lcpNo == replicaPtr.p->nextLcp); ndbrequire(lcpNo < MAX_LCP_STORED);
diff --git a/storage/ndb/test/ndbapi/testNodeRestart.cpp b/storage/ndb/test/ndbapi/testNodeRestart.cpp index 0ceb3b5d6f5..92e59b92c5a 100644 --- a/storage/ndb/test/ndbapi/testNodeRestart.cpp +++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp
@@ -1073,6 +1073,63 @@ int runBug25364(NDBT_Context* ctx, NDBT_Step* step){ return NDBT_OK; } +int runBug25468(NDBT_Context* ctx, NDBT_Step* step){ + + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + NdbRestarter restarter; + + for (int i = 0; i<loops; i++) + { + int master = restarter.getMasterNodeId(); + int node1, node2; + switch(i % 5){ + case 0: + node1 = master; + node2 = restarter.getRandomNodeSameNodeGroup(master, rand()); + break; + case 1: + node1 = restarter.getRandomNodeSameNodeGroup(master, rand()); + node2 = master; + break; + case 2: + case 3: + case 4: + node1 = restarter.getRandomNodeOtherNodeGroup(master, rand()); + if (node1 == -1) + node1 = master; + node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); + break; + } + + ndbout_c("node1: %d node2: %d master: %d", node1, node2, master); + + int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 }; + + if (restarter.dumpStateOneNode(node2, val2, 2)) + return NDBT_FAILED; + + if (restarter.insertErrorInNode(node1, 7178)) + return NDBT_FAILED; + + int val1 = 7099; + if (restarter.dumpStateOneNode(master, &val1, 1)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&node2, 1)) + return NDBT_FAILED; + + if (restarter.startAll()) + return NDBT_FAILED; + + if (restarter.waitClusterStarted()) + return NDBT_FAILED; + } + + return NDBT_OK; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad",
@@ -1403,6 +1460,9 @@ TESTCASE("Bug24717", ""){ TESTCASE("Bug25364", ""){ INITIALIZER(runBug25364); } +TESTCASE("Bug25468", ""){ + INITIALIZER(runBug25468); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){
diff --git a/storage/ndb/test/run-test/daily-basic-tests.txt b/storage/ndb/test/run-test/daily-basic-tests.txt index 3f869d92a35..a55f52e80f9 100644 --- a/storage/ndb/test/run-test/daily-basic-tests.txt +++ b/storage/ndb/test/run-test/daily-basic-tests.txt
@@ -768,6 +768,10 @@ max-time: 1500 cmd: testSystemRestart args: -n Bug24664 +max-time: 1000 +cmd: testNodeRestart +args: -n Bug25468 T1 + # OLD FLEX max-time: 500 cmd: flexBench