mirror of
https://github.com/MariaDB/server.git
synced 2025-01-19 05:22:25 +01:00
ndb - bug#27466: node failure (nf) during node restart (nr) can leave the cluster in an inconsistent state (recommit in 5.1)
Fix race condition between NODE_FAILREP and the local INCL_NODEREQ loop. Also retry on ZNODE_START_DISALLOWED_ERROR.
This commit is contained in:
parent
0e1974d2bf
commit
e5faf16bc0
8 changed files with 130 additions and 51 deletions
|
@ -67,6 +67,7 @@ private:
|
|||
enum ErrorCode
|
||||
{
|
||||
ZNODE_ALREADY_STARTING_ERROR = 305,
|
||||
ZNODE_START_DISALLOWED_ERROR = 309,
|
||||
InitialStartRequired = 320
|
||||
};
|
||||
};
|
||||
|
|
|
@ -6,7 +6,7 @@ Next DBTUP 4029
|
|||
Next DBLQH 5045
|
||||
Next DBDICT 6007
|
||||
Next DBDIH 7183
|
||||
Next DBTC 8039
|
||||
Next DBTC 8040
|
||||
Next CMVMI 9000
|
||||
Next BACKUP 10038
|
||||
Next DBUTIL 11002
|
||||
|
@ -327,6 +327,8 @@ Test Crashes in handling node restarts
|
|||
|
||||
7170: Crash when receiving START_PERMREF (InitialStartRequired)
|
||||
|
||||
8039: DBTC delay INCL_NODECONF and kill starting node
|
||||
|
||||
7174: Crash starting node before sending DICT_LOCK_REQ
|
||||
7175: Master sends one fake START_PERMREF (ZNODE_ALREADY_STARTING_ERROR)
|
||||
7176: Slave NR pretends master does not support DICT lock (rolling upgrade)
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
#define ZWRONG_FAILURE_NUMBER_ERROR 302
|
||||
#define ZWRONG_START_NODE_ERROR 303
|
||||
#define ZNO_REPLICA_FOUND_ERROR 304
|
||||
#define ZNODE_START_DISALLOWED_ERROR 309
|
||||
|
||||
// --------------------------------------
|
||||
// Codes from LQH
|
||||
|
|
|
@ -1709,7 +1709,8 @@ void Dbdih::execSTART_PERMREF(Signal* signal)
|
|||
{
|
||||
jamEntry();
|
||||
Uint32 errorCode = signal->theData[1];
|
||||
if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
|
||||
if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR ||
|
||||
errorCode == StartPermRef::ZNODE_START_DISALLOWED_ERROR) {
|
||||
jam();
|
||||
/*-----------------------------------------------------------------------*/
|
||||
// The master was busy adding another node. We will wait for a second and
|
||||
|
@ -2056,49 +2057,49 @@ void Dbdih::execINCL_NODECONF(Signal* signal)
|
|||
TstartNode_or_blockref = signal->theData[0];
|
||||
TsendNodeId = signal->theData[1];
|
||||
|
||||
if (TstartNode_or_blockref == clocallqhblockref) {
|
||||
jam();
|
||||
/*-----------------------------------------------------------------------*/
|
||||
// THIS SIGNAL CAME FROM THE LOCAL LQH BLOCK.
|
||||
// WE WILL NOW SEND INCLUDE TO THE TC BLOCK.
|
||||
/*-----------------------------------------------------------------------*/
|
||||
signal->theData[0] = reference();
|
||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
||||
sendSignal(clocaltcblockref, GSN_INCL_NODEREQ, signal, 2, JBB);
|
||||
return;
|
||||
}//if
|
||||
if (TstartNode_or_blockref == clocaltcblockref) {
|
||||
jam();
|
||||
/*----------------------------------------------------------------------*/
|
||||
// THIS SIGNAL CAME FROM THE LOCAL LQH BLOCK.
|
||||
// WE WILL NOW SEND INCLUDE TO THE DICT BLOCK.
|
||||
/*----------------------------------------------------------------------*/
|
||||
signal->theData[0] = reference();
|
||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
||||
sendSignal(cdictblockref, GSN_INCL_NODEREQ, signal, 2, JBB);
|
||||
return;
|
||||
}//if
|
||||
if (TstartNode_or_blockref == cdictblockref) {
|
||||
jam();
|
||||
/*-----------------------------------------------------------------------*/
|
||||
// THIS SIGNAL CAME FROM THE LOCAL DICT BLOCK. WE WILL NOW SEND CONF TO THE
|
||||
// BACKUP.
|
||||
/*-----------------------------------------------------------------------*/
|
||||
signal->theData[0] = reference();
|
||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
||||
sendSignal(BACKUP_REF, GSN_INCL_NODEREQ, signal, 2, JBB);
|
||||
|
||||
// Suma will not send response to this for now, later...
|
||||
sendSignal(SUMA_REF, GSN_INCL_NODEREQ, signal, 2, JBB);
|
||||
return;
|
||||
}//if
|
||||
if (TstartNode_or_blockref == numberToRef(BACKUP, getOwnNodeId())){
|
||||
jam();
|
||||
signal->theData[0] = c_nodeStartSlave.nodeId;
|
||||
signal->theData[1] = cownNodeId;
|
||||
sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
|
||||
c_nodeStartSlave.nodeId = 0;
|
||||
return;
|
||||
static Uint32 blocklist[] = {
|
||||
clocallqhblockref,
|
||||
clocaltcblockref,
|
||||
cdictblockref,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
};
|
||||
blocklist[3] = numberToRef(BACKUP, getOwnNodeId());
|
||||
blocklist[4] = numberToRef(SUMA, getOwnNodeId());
|
||||
|
||||
Uint32 i = 0;
|
||||
for (Uint32 i = 0; blocklist[i] != 0; i++)
|
||||
{
|
||||
if (TstartNode_or_blockref == blocklist[i])
|
||||
{
|
||||
jam();
|
||||
if (getNodeStatus(c_nodeStartSlave.nodeId) == NodeRecord::ALIVE &&
|
||||
blocklist[i+1] != 0)
|
||||
{
|
||||
/**
|
||||
* Send to next in block list
|
||||
*/
|
||||
jam();
|
||||
signal->theData[0] = reference();
|
||||
signal->theData[1] = c_nodeStartSlave.nodeId;
|
||||
sendSignal(blocklist[i+1], GSN_INCL_NODEREQ, signal, 2, JBB);
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
/**
|
||||
* All done, reply to master
|
||||
*/
|
||||
jam();
|
||||
signal->theData[0] = c_nodeStartSlave.nodeId;
|
||||
signal->theData[1] = cownNodeId;
|
||||
sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
|
||||
|
||||
c_nodeStartSlave.nodeId = 0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ndbrequire(cmasterdihref = reference());
|
||||
|
@ -2217,7 +2218,7 @@ void Dbdih::execSTART_INFOREQ(Signal* signal)
|
|||
StartInfoRef *const ref =(StartInfoRef*)&signal->theData[0];
|
||||
ref->startingNodeId = startNode;
|
||||
ref->sendingNodeId = cownNodeId;
|
||||
ref->errorCode = ZNODE_START_DISALLOWED_ERROR;
|
||||
ref->errorCode = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
|
||||
sendSignal(cmasterdihref, GSN_START_INFOREF, signal,
|
||||
StartInfoRef::SignalLength, JBB);
|
||||
return;
|
||||
|
|
|
@ -311,6 +311,19 @@ void Dbtc::execINCL_NODEREQ(Signal* signal)
|
|||
hostptr.p->hostStatus = HS_ALIVE;
|
||||
signal->theData[0] = cownref;
|
||||
c_alive_nodes.set(hostptr.i);
|
||||
|
||||
if (ERROR_INSERTED(8039))
|
||||
{
|
||||
CLEAR_ERROR_INSERT_VALUE;
|
||||
Uint32 save = signal->theData[0];
|
||||
signal->theData[0] = 9999;
|
||||
sendSignal(numberToRef(CMVMI, hostptr.i),
|
||||
GSN_NDB_TAMPER, signal, 1, JBB);
|
||||
signal->theData[0] = save;
|
||||
sendSignalWithDelay(tblockref, GSN_INCL_NODECONF, signal, 5000, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB);
|
||||
}
|
||||
|
||||
|
|
|
@ -813,17 +813,14 @@ void
|
|||
Suma::execINCL_NODEREQ(Signal* signal){
|
||||
jamEntry();
|
||||
|
||||
//const Uint32 senderRef = signal->theData[0];
|
||||
const Uint32 senderRef = signal->theData[0];
|
||||
const Uint32 nodeId = signal->theData[1];
|
||||
|
||||
ndbrequire(!c_alive_nodes.get(nodeId));
|
||||
c_alive_nodes.set(nodeId);
|
||||
|
||||
#if 0 // if we include this DIH's got to be prepared, later if needed...
|
||||
signal->theData[0] = reference();
|
||||
|
||||
sendSignal(senderRef, GSN_INCL_NODECONF, signal, 1, JBB);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -953,6 +950,15 @@ Suma::execDUMP_STATE_ORD(Signal* signal){
|
|||
CLEAR_ERROR_INSERT_VALUE;
|
||||
}
|
||||
|
||||
if (tCase == 8010)
|
||||
{
|
||||
char buf1[255], buf2[255];
|
||||
c_subscriber_nodes.getText(buf1);
|
||||
c_connected_nodes.getText(buf2);
|
||||
infoEvent("c_subscriber_nodes: %s", buf1);
|
||||
infoEvent("c_connected_nodes: %s", buf2);
|
||||
}
|
||||
|
||||
if (tCase == 8009)
|
||||
{
|
||||
if (ERROR_INSERTED(13030))
|
||||
|
|
|
@ -1423,6 +1423,56 @@ runBug27283(NDBT_Context* ctx, NDBT_Step* step)
|
|||
return NDBT_OK;
|
||||
}
|
||||
|
||||
int
|
||||
runBug27466(NDBT_Context* ctx, NDBT_Step* step)
|
||||
{
|
||||
int result = NDBT_OK;
|
||||
int loops = ctx->getNumLoops();
|
||||
int records = ctx->getNumRecords();
|
||||
NdbRestarter res;
|
||||
|
||||
if (res.getNumDbNodes() < 2)
|
||||
{
|
||||
return NDBT_OK;
|
||||
}
|
||||
|
||||
Uint32 pos = 0;
|
||||
for (Uint32 i = 0; i<loops; i++)
|
||||
{
|
||||
int node1 = res.getDbNodeId(rand() % res.getNumDbNodes());
|
||||
int node2 = node1;
|
||||
while (node1 == node2)
|
||||
{
|
||||
node2 = res.getDbNodeId(rand() % res.getNumDbNodes());
|
||||
}
|
||||
|
||||
if (res.restartOneDbNode(node1, false, true, true))
|
||||
return NDBT_FAILED;
|
||||
|
||||
if (res.waitNodesNoStart(&node1, 1))
|
||||
return NDBT_FAILED;
|
||||
|
||||
int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
|
||||
if (res.dumpStateOneNode(node1, val2, 2))
|
||||
return NDBT_FAILED;
|
||||
|
||||
if (res.insertErrorInNode(node2, 8039))
|
||||
return NDBT_FAILED;
|
||||
|
||||
res.startNodes(&node1, 1);
|
||||
NdbSleep_SecSleep(3);
|
||||
if (res.waitNodesNoStart(&node1, 1))
|
||||
return NDBT_FAILED;
|
||||
NdbSleep_SecSleep(5); // Wait for delayed INCL_NODECONF to arrive
|
||||
|
||||
res.startNodes(&node1, 1);
|
||||
if (res.waitClusterStarted())
|
||||
return NDBT_FAILED;
|
||||
}
|
||||
|
||||
return NDBT_OK;
|
||||
}
|
||||
|
||||
NDBT_TESTSUITE(testNodeRestart);
|
||||
TESTCASE("NoLoad",
|
||||
"Test that one node at a time can be stopped and then restarted "\
|
||||
|
@ -1774,6 +1824,9 @@ TESTCASE("Bug27003", ""){
|
|||
TESTCASE("Bug27283", ""){
|
||||
INITIALIZER(runBug27283);
|
||||
}
|
||||
TESTCASE("Bug27466", ""){
|
||||
INITIALIZER(runBug27466);
|
||||
}
|
||||
NDBT_TESTSUITE_END(testNodeRestart);
|
||||
|
||||
int main(int argc, const char** argv){
|
||||
|
|
|
@ -792,6 +792,10 @@ max-time: 1000
|
|||
cmd: testNodeRestart
|
||||
args: -n Bug25468 T1
|
||||
|
||||
max-time: 1000
|
||||
cmd: testNodeRestart
|
||||
args: -n Bug27466 T1
|
||||
|
||||
max-time: 1000
|
||||
cmd: test_event
|
||||
args: -l 10 -n Bug27169 T1
|
||||
|
|
Loading…
Reference in a new issue