ndb - bug#27003

Handle random (out-of-order) LQHKEYREQ failures during node restart


ndb/src/kernel/blocks/ERROR_codes.txt:
  Document new error codes
ndb/src/kernel/blocks/dblqh/DblqhMain.cpp:
  Handle random (out-of-order) LQHKEYREQ failures during node restart
ndb/src/kernel/blocks/dbtup/DbtupExecQuery.cpp:
  Error codes for various oom problems
ndb/src/kernel/blocks/dbtup/DbtupGen.cpp:
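  Move CLEAR_ERROR_INSERT_VALUE from execSTTOR (start phase 1) to initData(), i.e. to
  the constructor, so that an error insert set before the node starts is not cleared
  during startup and can reasonably be used for restart testing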
ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
  Add error insert for CopyFragRef
ndb/test/ndbapi/testNodeRestart.cpp:
  Test program for bug#27003
ndb/test/run-test/daily-basic-tests.txt:
  Add the bug#27003 test program to the daily basic tests
This commit is contained in:
unknown 2007-03-13 11:29:14 +01:00
parent b5b3c616d5
commit a609410829
7 changed files with 114 additions and 4 deletions

View file

@ -489,3 +489,15 @@ Dbdict:
6003 Crash in participant @ CreateTabReq::Prepare
6004 Crash in participant @ CreateTabReq::Commit
6005 Crash in participant @ CreateTabReq::CreateDrop
TUP:
----
4025: Fail all inserts with out of memory
4026: Fail one insert with oom
4027: Fail inserts randomly with oom
4028: Fail one random insert with oom
NDBCNTR:
1000: Crash insertion on SystemError::CopyFragRef

View file

@ -9641,6 +9641,15 @@ void Dblqh::copyCompletedLab(Signal* signal)
closeCopyLab(signal);
return;
}//if
if (scanptr.p->scanState == ScanRecord::WAIT_LQHKEY_COPY &&
scanptr.p->scanErrorCounter)
{
jam();
closeCopyLab(signal);
return;
}
if (scanptr.p->scanState == ScanRecord::WAIT_LQHKEY_COPY) {
jam();
/*---------------------------------------------------------------------------*/
@ -9717,13 +9726,16 @@ void Dblqh::continueCopyAfterBlockedLab(Signal* signal)
// Handles a refused LQHKEYREQ that belongs to a fragment copy (copy scan):
// records the error on the scan and on the TC connect record.
//
// NOTE(review): this listing appears to be a diff hunk whose +/- markers were
// stripped (hunk header says 13 old lines / 16 new lines), so removed and added
// lines seem to be shown interleaved. Presumably the `copyCountWords -= ...`
// line and the `closeCopyLab(signal); return;` pair are the REMOVED lines, and
// the `copyWords` local plus the LqhKeyConf block are the ADDED lines -- confirm
// against the real file before treating the code after `return;` as dead code.
void Dblqh::copyLqhKeyRefLab(Signal* signal)
{
// The failing request must belong to the transaction of this copy operation.
ndbrequire(tcConnectptr.p->transid[1] == signal->theData[4]);
tcConnectptr.p->copyCountWords -= signal->theData[3];
// presumably the number of words sent for the failed row -- TODO confirm
Uint32 copyWords = signal->theData[3];
// Look up the scan record driving this copy and count the failure on it.
scanptr.i = tcConnectptr.p->tcScanRec;
c_scanRecordPool.getPtr(scanptr);
scanptr.p->scanErrorCounter++;
// Remember the error code on the connect record.
tcConnectptr.p->errorCode = terrorCode;
closeCopyLab(signal);
return;
// Rewrite the signal area as an LqhKeyConf and continue through
// copyCompletedLab(), i.e. the failure is fed through the same path as a
// completed row. NOTE(review): transId1 is loaded with the word count, not a
// transaction id -- presumably copyCompletedLab() reads it that way; verify.
LqhKeyConf* conf = (LqhKeyConf*)signal->getDataPtrSend();
conf->transId1 = copyWords;
conf->transId2 = tcConnectptr.p->transid[1];
copyCompletedLab(signal);
}//Dblqh::copyLqhKeyRefLab()
void Dblqh::closeCopyLab(Signal* signal)
@ -9734,6 +9746,7 @@ void Dblqh::closeCopyLab(Signal* signal)
// Wait until all of those have arrived until we start the
// close process.
/*---------------------------------------------------------------------------*/
scanptr.p->scanState = ScanRecord::WAIT_LQHKEY_COPY;
jam();
return;
}//if

View file

@ -213,6 +213,30 @@ void Dbtup::execTUP_ALLOCREQ(Signal* signal)
//---------------------------------------------------
PagePtr pagePtr;
Uint32 pageOffset;
if (ERROR_INSERTED(4025))
{
signal->theData[0] = 827;
return;
}
if (ERROR_INSERTED(4026))
{
CLEAR_ERROR_INSERT_VALUE;
signal->theData[0] = 827;
return;
}
if (ERROR_INSERTED(4027) && (rand() % 100) > 25)
{
signal->theData[0] = 827;
return;
}
if (ERROR_INSERTED(4028) && (rand() % 100) > 25)
{
CLEAR_ERROR_INSERT_VALUE;
signal->theData[0] = 827;
return;
}
if (!allocTh(regFragPtr.p,
regTabPtr.p,
NORMAL_PAGE,

View file

@ -66,6 +66,7 @@ void Dbtup::initData()
undoPage = 0;
totNoOfPagesAllocated = 0;
cnoOfAllocatedPages = 0;
CLEAR_ERROR_INSERT_VALUE;
// Records with constant sizes
}//Dbtup::initData()
@ -570,7 +571,6 @@ void Dbtup::execSTTOR(Signal* signal)
switch (startPhase) {
case ZSTARTPHASE1:
ljam();
CLEAR_ERROR_INSERT_VALUE;
cownref = calcTupBlockRef(0);
break;
default:

View file

@ -180,6 +180,7 @@ void Ndbcntr::execSYSTEM_ERROR(Signal* signal)
break;
case SystemError::CopyFragRefError:
CRASH_INSERTION(1000);
BaseString::snprintf(buf, sizeof(buf),
"Killed by node %d as "
"copyfrag failed, error: %u",

View file

@ -1125,6 +1125,59 @@ runBug26481(NDBT_Context* ctx, NDBT_Step* step)
return NDBT_OK;
}
/**
 * Test case for bug#27003: LQHKEYREQ failures during node restart.
 *
 * Picks a random non-master data node and, for every loop requested by the
 * test context, restarts it once per TUP out-of-memory error insert
 * (4025..4028). Error insert 1000 (crash insertion on
 * SystemError::CopyFragRef in NDBCNTR) is set alongside each one. Finally the
 * node is started without error inserts and the whole cluster must come up.
 *
 * @param ctx  test context; only the loop count is read from it
 * @param step test step (unused)
 * @return NDBT_OK on success, NDBT_FAILED as soon as a checked restarter
 *         call reports failure
 */
int
runBug27003(NDBT_Context* ctx, NDBT_Step* step)
{
  // Kept signed so that it is compared against a signed loop index below; a
  // negative loop count then yields zero iterations rather than being
  // converted to a huge unsigned bound.
  const int loops = ctx->getNumLoops();
  NdbRestarter res;

  // Error inserts to exercise, terminated by 0.
  static const int errnos[] = { 4025, 4026, 4027, 4028, 0 };

  int node = res.getRandomNotMasterNodeId(rand());
  ndbout_c("node: %d", node);

  // presumably (initial=false, nostart=true, abort=true) -- the loop below
  // starts by waiting for the node to reach the "no start" state.
  if (res.restartOneDbNode(node, false, true, true))
    return NDBT_FAILED;

  for (int i = 0; i < loops; i++)
  {
    for (Uint32 pos = 0; errnos[pos] != 0; pos++)
    {
      ndbout_c("Tesing err: %d", errnos[pos]);

      if (res.waitNodesNoStart(&node, 1))
        return NDBT_FAILED;

      if (res.insertErrorInNode(node, 1000))
        return NDBT_FAILED;

      if (res.insertErrorInNode(node, errnos[pos]))
        return NDBT_FAILED;

      int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
      if (res.dumpStateOneNode(node, val2, 2))
        return NDBT_FAILED;

      // Results deliberately not checked (as in the original): the node is
      // expected to go down again because of the error inserts, which the
      // next waitNodesNoStart() call verifies.
      res.startNodes(&node, 1);
      res.waitNodesStartPhase(&node, 1, 2);
    }
  }

  // Final start without error inserts: the cluster must become fully started.
  if (res.waitNodesNoStart(&node, 1))
    return NDBT_FAILED;

  res.startNodes(&node, 1);

  if (res.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
@ -1452,6 +1505,9 @@ TESTCASE("Bug26457", ""){
TESTCASE("Bug26481", ""){
INITIALIZER(runBug26481);
}
// Registers runBug27003 in the testNodeRestart suite under the name
// "Bug27003" (the name passed with `-n` when running the test program).
// The second TESTCASE argument (description text) is left empty.
TESTCASE("Bug27003", ""){
INITIALIZER(runBug27003);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){

View file

@ -425,6 +425,10 @@ max-time: 500
cmd: testScan
args: -n Bug24447 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug27003 T1
max-time: 500
cmd: testNodeRestart
args: -n Bug15587 T1