diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index bde690e056d..4dd22cf5092 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -78,6 +78,8 @@ public: LqhDumpAllScanRec = 2301, LqhDumpAllActiveScanRec = 2302, LqhDumpLcpState = 2303, + LqhErrorInsert5042 = 2315, + AccDumpOneScanRec = 2400, AccDumpAllScanRec = 2401, AccDumpAllActiveScanRec = 2402, diff --git a/ndb/include/ndb_version.h.in b/ndb/include/ndb_version.h.in index 826f5124407..38b72306d03 100644 --- a/ndb/include/ndb_version.h.in +++ b/ndb/include/ndb_version.h.in @@ -57,5 +57,8 @@ char ndb_version_string_buf[NDB_VERSION_STRING_BUF_SZ]; */ /*#define NDB_VERSION_ID 0*/ +#define NDBD_INCL_NODECONF_VERSION_4 MAKE_VERSION(4,1,17) +#define NDBD_INCL_NODECONF_VERSION_5 MAKE_VERSION(5,0,18) + #endif diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt index 1a72537a77e..0be5e91cd71 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -61,6 +61,8 @@ Insert system error in GCP participant when receiving GCP_SAVEREQ. 5007: Delay GCP_SAVEREQ by 10 secs +7165: Delay INCL_NODE_REQ in starting node yeilding error in GCP_PREPARE + ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ----------------------------------------------------------------- @@ -155,11 +157,15 @@ Insert node failure handling when receiving COMPLETEREQ. 5006: Insert node failure handling when receiving ABORTREQ. +5042: +As 5002, but with specified table (see DumpStateOrd) + These error code can be combined with error codes for testing time-out handling in DBTC to ensure that node failures are also well handled in time-out handling. They can also be used to test multiple node failure handling. + ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBLQH ------------------------------------------------- 5011: @@ -198,6 +204,9 @@ Delay execution of ABORTREQ signal 2 seconds to generate time-out. 8050: Send TCKEYREF is operation is non local +5100,5101: Drop ABORT req in primary replica + Crash on "next" ABORT + ERROR CODES FOR TESTING TIME-OUT HANDLING IN DBTC ------------------------------------------------- 8040: diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 29de0368212..c1e90dccf12 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -215,7 +215,7 @@ void Dbdih::sendINCL_NODEREQ(Signal* signal, Uint32 nodeId) signal->theData[2] = c_nodeStartMaster.failNr; signal->theData[3] = 0; signal->theData[4] = currentgcp; - sendSignal(nodeDihRef, GSN_INCL_NODEREQ, signal, 5, JBB); + sendSignal(nodeDihRef, GSN_INCL_NODEREQ, signal, 5, JBA); }//Dbdih::sendINCL_NODEREQ() void Dbdih::sendMASTER_GCPREQ(Signal* signal, Uint32 nodeId) @@ -1863,6 +1863,14 @@ void Dbdih::gcpBlockedLab(Signal* signal) // global checkpoint id and the correct state. We do not wait for any reply // since the starting node will not send any. /*-------------------------------------------------------------------------*/ + Uint32 startVersion = getNodeInfo(c_nodeStartMaster.startNode).m_version; + + if ((getMajor(startVersion) == 4 && startVersion >= NDBD_INCL_NODECONF_VERSION_4) || + (getMajor(startVersion) == 5 && startVersion >= NDBD_INCL_NODECONF_VERSION_5)) + { + c_INCL_NODEREQ_Counter.setWaitingFor(c_nodeStartMaster.startNode); + } + sendINCL_NODEREQ(signal, c_nodeStartMaster.startNode); }//Dbdih::gcpBlockedLab() @@ -2065,6 +2073,13 @@ void Dbdih::execINCL_NODEREQ(Signal* signal) jamEntry(); Uint32 retRef = signal->theData[0]; Uint32 nodeId = signal->theData[1]; + if (nodeId == getOwnNodeId() && ERROR_INSERTED(7165)) + { + CLEAR_ERROR_INSERT_VALUE; + sendSignalWithDelay(reference(), GSN_INCL_NODEREQ, signal, 5000, signal->getLength()); + return; + } + Uint32 tnodeStartFailNr = signal->theData[2]; currentgcp = signal->theData[4]; CRASH_INSERTION(7127); @@ -2092,6 +2107,15 @@ void Dbdih::execINCL_NODEREQ(Signal* signal) // id's and the lcp status. /*-----------------------------------------------------------------------*/ CRASH_INSERTION(7171); + Uint32 masterVersion = getNodeInfo(refToNode(cmasterdihref)).m_version; + + if ((NDB_VERSION_MAJOR == 4 && masterVersion >= NDBD_INCL_NODECONF_VERSION_4) || + (NDB_VERSION_MAJOR == 5 && masterVersion >= NDBD_INCL_NODECONF_VERSION_5)) + { + signal->theData[0] = getOwnNodeId(); + signal->theData[1] = getOwnNodeId(); + sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB); + } return; }//if if (getNodeStatus(nodeId) != NodeRecord::STARTING) { @@ -3741,8 +3765,16 @@ void Dbdih::execNODE_FAILREP(Signal* signal) /*------------------------------------------------------------------------*/ // Verify that a starting node has also crashed. Reset the node start record. /*-------------------------------------------------------------------------*/ - if (c_nodeStartMaster.startNode != RNIL) { - ndbrequire(getNodeStatus(c_nodeStartMaster.startNode)!= NodeRecord::ALIVE); + if (false && c_nodeStartMaster.startNode != RNIL && getNodeStatus(c_nodeStartMaster.startNode) == NodeRecord::ALIVE) + { + BlockReference cntrRef = calcNdbCntrBlockRef(c_nodeStartMaster.startNode); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::StartInProgressError; + sysErr->errorRef = reference(); + sysErr->data1= 0; + sysErr->data2= __LINE__; + sendSignal(cntrRef, GSN_SYSTEM_ERROR, signal, SystemError::SignalLength, JBA); + nodeResetStart(); }//if /*--------------------------------------------------*/ @@ -5189,15 +5221,16 @@ void Dbdih::removeNodeFromTable(Signal* signal, /** * For each of replica record */ - Uint32 replicaNo = 0; + bool found = false; ReplicaRecordPtr replicaPtr; for(replicaPtr.i = fragPtr.p->storedReplicas; replicaPtr.i != RNIL; - replicaPtr.i = replicaPtr.p->nextReplica, replicaNo++) { + replicaPtr.i = replicaPtr.p->nextReplica) { jam(); ptrCheckGuard(replicaPtr, creplicaFileSize, replicaRecord); if(replicaPtr.p->procNode == nodeId){ jam(); + found = true; noOfRemovedReplicas++; removeNodeFromStored(nodeId, fragPtr, replicaPtr); if(replicaPtr.p->lcpOngoingFlag){ @@ -5213,6 +5246,15 @@ void Dbdih::removeNodeFromTable(Signal* signal, } } } + if (!found) + { + jam(); + /** + * Run updateNodeInfo to remove any dead nodes from list of activeNodes + * see bug#15587 + */ + updateNodeInfo(fragPtr); + } noOfRemainingLcpReplicas += fragPtr.p->noLcpReplicas; } diff --git a/ndb/src/kernel/blocks/dblqh/Dblqh.hpp b/ndb/src/kernel/blocks/dblqh/Dblqh.hpp index 94a40adcd4a..1ed383853ba 100644 --- a/ndb/src/kernel/blocks/dblqh/Dblqh.hpp +++ b/ndb/src/kernel/blocks/dblqh/Dblqh.hpp @@ -2885,6 +2885,7 @@ private: UintR ctransidHash[1024]; Uint32 c_diskless; + Uint32 c_error_insert_table_id; public: /** diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index 2170f890f35..2e50c0f8f73 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -3543,6 +3543,7 @@ void Dblqh::execLQHKEYREQ(Signal* signal) jam(); regTcPtr->activeCreat = ZTRUE; CRASH_INSERTION(5002); + CRASH_INSERTION2(5042, tabptr.i == c_error_insert_table_id); } else { regTcPtr->activeCreat = ZFALSE; }//if @@ -5880,12 +5881,21 @@ void Dblqh::execABORT(Signal* signal) warningReport(signal, 8); return; }//if + + TcConnectionrec * const regTcPtr = tcConnectptr.p; + + if (ERROR_INSERTED(5100)) + { + SET_ERROR_INSERT_VALUE(5101); + return; + } + CRASH_INSERTION2(5101, regTcPtr->nextReplica != ZNIL); + /* ------------------------------------------------------------------------- */ /*A GUIDING DESIGN PRINCIPLE IN HANDLING THESE ERROR SITUATIONS HAVE BEEN */ /*KEEP IT SIMPLE. THUS WE RATHER INSERT A WAIT AND SET THE ABORT_STATE TO */ /*ACTIVE RATHER THAN WRITE NEW CODE TO HANDLE EVERY SPECIAL SITUATION. */ /* ------------------------------------------------------------------------- */ - TcConnectionrec * const regTcPtr = tcConnectptr.p; if (regTcPtr->nextReplica != ZNIL) { /* ------------------------------------------------------------------------- */ // We will immediately send the ABORT message also to the next LQH node in line. @@ -18522,6 +18532,12 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal) } } + + if (dumpState->args[0] == DumpStateOrd::LqhErrorInsert5042 && signal->getLength() == 2) + { + c_error_insert_table_id = dumpState->args[1]; + SET_ERROR_INSERT_VALUE(5042); + } }//Dblqh::execDUMP_STATE_ORD() void Dblqh::execSET_VAR_REQ(Signal* signal) diff --git a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp index f2646fd4176..d88ffae1d85 100644 --- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp +++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp @@ -6183,7 +6183,6 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) << " - place: " << c_apiConTimer_line[apiConnectptr.i]); switch (apiConnectptr.p->apiConnectstate) { case CS_STARTED: - ndbrequire(c_apiConTimer_line[apiConnectptr.i] != 3615); if(apiConnectptr.p->lqhkeyreqrec == apiConnectptr.p->lqhkeyconfrec){ jam(); /* @@ -6443,8 +6442,8 @@ void Dbtc::sendAbortedAfterTimeout(Signal* signal, int Tcheck) warningEvent(buf); ndbout_c(buf); ndbrequire(false); + releaseAbortResources(signal); } - releaseAbortResources(signal); return; }//if TloopCount++; diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index 6ef3da2d760..92d6c1830ef 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -21,6 +21,7 @@ #include #include #include +#include int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){ @@ -409,6 +410,132 @@ int runLateCommit(NDBT_Context* ctx, NDBT_Step* step){ return NDBT_OK; } +int runBug15587(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + NdbRestarter restarter; + + Uint32 tableId = ctx->getTab()->getTableId(); + int dump[2] = { DumpStateOrd::LqhErrorInsert5042, 0 }; + dump[1] = tableId; + + int nodeId = restarter.getDbNodeId(1); + + ndbout << "Restart node " << nodeId << endl; + + if (restarter.restartOneDbNode(nodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.dumpStateOneNode(nodeId, dump, 2)) + return NDBT_FAILED; + + if (restarter.startNodes(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&nodeId, 1)) + return NDBT_FAILED; + + ctx->stopTest(); + return NDBT_OK; +} + +int runBug15632(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int records = ctx->getNumRecords(); + NdbRestarter restarter; + + int nodeId = restarter.getDbNodeId(1); + + ndbout << "Restart node " << nodeId << endl; + + if (restarter.restartOneDbNode(nodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.insertErrorInNode(nodeId, 7165)) + return NDBT_FAILED; + + if (restarter.startNodes(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(nodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.insertErrorInNode(nodeId, 7171)) + return NDBT_FAILED; + + if (restarter.startNodes(&nodeId, 1)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&nodeId, 1)) + return NDBT_FAILED; + + ctx->stopTest(); + return NDBT_OK; +} + +int runBug15685(NDBT_Context* ctx, NDBT_Step* step){ + + Ndb* pNdb = GETNDB(step); + HugoOperations hugoOps(*ctx->getTab()); + NdbRestarter restarter; + + HugoTransactions hugoTrans(*ctx->getTab()); + if (hugoTrans.loadTable(GETNDB(step), 10) != 0){ + return NDBT_FAILED; + } + + if(hugoOps.startTransaction(pNdb) != 0) + goto err; + + if(hugoOps.pkUpdateRecord(pNdb, 0, 1, rand()) != 0) + goto err; + + if(hugoOps.execute_NoCommit(pNdb) != 0) + goto err; + + if (restarter.insertErrorInAllNodes(5100)) + return NDBT_FAILED; + + hugoOps.execute_Rollback(pNdb); + + if (restarter.waitClusterStarted() != 0) + goto err; + + if (restarter.insertErrorInAllNodes(0)) + return NDBT_FAILED; + + ctx->stopTest(); + return NDBT_OK; + +err: + ctx->stopTest(); + return NDBT_FAILED; +} + + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -558,6 +685,8 @@ TESTCASE("RestartNFDuringNR", INITIALIZER(runCheckAllNodesStarted); INITIALIZER(runLoadTable); STEP(runRestarts); + STEP(runPkUpdateUntilStopped); + STEP(runScanUpdateUntilStopped); FINALIZER(runScanReadVerify); FINALIZER(runClearTable); } @@ -647,6 +776,8 @@ TESTCASE("RestartNodeDuringLCP", INITIALIZER(runCheckAllNodesStarted); INITIALIZER(runLoadTable); STEP(runRestarts); + STEP(runPkUpdateUntilStopped); + STEP(runScanUpdateUntilStopped); FINALIZER(runScanReadVerify); FINALIZER(runClearTable); } @@ -671,6 +802,24 @@ TESTCASE("LateCommit", STEP(runLateCommit); FINALIZER(runClearTable); } +TESTCASE("Bug15587", + "Test bug with NF during NR"){ + INITIALIZER(runLoadTable); + STEP(runScanUpdateUntilStopped); + STEP(runBug15587); + FINALIZER(runClearTable); +} +TESTCASE("Bug15632", + "Test bug with NF during NR"){ + INITIALIZER(runLoadTable); + STEP(runBug15632); + FINALIZER(runClearTable); +} +TESTCASE("Bug15685", + "Test bug with NF during abort"){ + STEP(runBug15685); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 8b44594a9b5..59f51044b51 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -413,6 +413,27 @@ max-time: 500 cmd: testScan args: -n ScanParallelism +max-time: 500 +cmd: testNodeRestart +args: -n Bug15587 T1 + +max-time: 500 +cmd: testNodeRestart +args: -n Bug15632 T1 + +max-time: 500 +cmd: testNodeRestart +args: -n Bug15685 T1 + +# OLD FLEX +max-time: 500 +cmd: flexBench +args: -c 25 -t 10 + +max-time: 500 +cmd: flexHammer +args: -r 5 -t 32 + # # DICT TESTS max-time: 1500