From aa43e56b112b8afde201330af8a4fce2c89fed37 Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Thu, 30 Mar 2006 14:20:54 +0200
Subject: [PATCH 01/10] ndb - bug#15695 bug#16447 bug#18612   For various
 reasons a partitioned cluster may have been created.   This patch makes sure
 that when the partitions connect   1) it's detected   2) shutdown is forced

---
 ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp   |  66 ++++--
 ndb/src/kernel/blocks/qmgr/Qmgr.hpp     |  19 +-
 ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 283 +++++++++++++++++++++---
 3 files changed, 322 insertions(+), 46 deletions(-)

diff --git a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
index 04761cb67a8..d017705395c 100644
--- a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
+++ b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
@@ -133,6 +133,9 @@ Cmvmi::~Cmvmi()
 {
 }
 
+#ifdef ERROR_INSERT
+NodeBitmask c_error_9000_nodes_mask;
+#endif
 
 void Cmvmi::execNDB_TAMPER(Signal* signal) 
 {
@@ -390,21 +393,33 @@ void Cmvmi::execOPEN_COMREQ(Signal* signal)
 
   const Uint32 len = signal->getLength();
   if(len == 2){
-    globalTransporterRegistry.do_connect(tStartingNode);
-    globalTransporterRegistry.setIOState(tStartingNode, HaltIO);
 
-    //-----------------------------------------------------
-    // Report that the connection to the node is opened
-    //-----------------------------------------------------
-    signal->theData[0] = EventReport::CommunicationOpened;
-    signal->theData[1] = tStartingNode;
-    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
-    //-----------------------------------------------------
+#ifdef ERROR_INSERT
+    if (! (ERROR_INSERTED(9000) && c_error_9000_nodes_mask.get(tStartingNode)))
+#endif
+    {
+      globalTransporterRegistry.do_connect(tStartingNode);
+      globalTransporterRegistry.setIOState(tStartingNode, HaltIO);
+      
+      //-----------------------------------------------------
+      // Report that the connection to the node is opened
+      //-----------------------------------------------------
+      signal->theData[0] = EventReport::CommunicationOpened;
+      signal->theData[1] = tStartingNode;
+      sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
+      //-----------------------------------------------------
+    }
   } else {
     for(unsigned int i = 1; i < MAX_NODES; i++ ) {
       jam();
       if (i != getOwnNodeId() && getNodeInfo(i).m_type == tData2){
 	jam();
+
+#ifdef ERROR_INSERT
+	if (ERROR_INSERTED(9000) && c_error_9000_nodes_mask.get(i))
+	  continue;
+#endif
+	
 	globalTransporterRegistry.do_connect(i);
 	globalTransporterRegistry.setIOState(i, HaltIO);
 	
@@ -1010,7 +1025,8 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
   }
 
   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
-  if (dumpState->args[0] == DumpStateOrd::CmvmiDumpConnections){
+  Uint32 arg = dumpState->args[0];
+  if (arg == DumpStateOrd::CmvmiDumpConnections){
     for(unsigned int i = 1; i < MAX_NODES; i++ ){
       const char* nodeTypeStr = "";
       switch(getNodeInfo(i).m_type){
@@ -1043,13 +1059,13 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
     }
   }
   
-  if (dumpState->args[0] == DumpStateOrd::CmvmiDumpLongSignalMemory){
+  if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
     infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
 	      g_sectionSegmentPool.getSize(),
 	      g_sectionSegmentPool.getNoOfFree());
   }
   
-  if (dumpState->args[0] == DumpStateOrd::CmvmiSetRestartOnErrorInsert)
+  if (arg == DumpStateOrd::CmvmiSetRestartOnErrorInsert)
   {
     if(signal->getLength() == 1)
     {
@@ -1069,7 +1085,7 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
     }
   }
 
-  if (dumpState->args[0] == DumpStateOrd::CmvmiTestLongSigWithDelay) {
+  if (arg == DumpStateOrd::CmvmiTestLongSigWithDelay) {
     unsigned i;
     Uint32 loopCount = dumpState->args[1];
     const unsigned len0 = 11;
@@ -1097,6 +1113,30 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
     sendSignal(reference(), GSN_TESTSIG, signal, 8, JBB, ptr, 2);
   }
 
+#ifdef ERROR_INSERT
+  if (arg == 9000)
+  {
+    SET_ERROR_INSERT_VALUE(9000);
+    for (Uint32 i = 1; i<signal->getLength(); i++)
+      c_error_9000_nodes_mask.set(signal->theData[i]);
+  }
+  
+  if (arg == 9001)
+  {
+    CLEAR_ERROR_INSERT_VALUE;
+    for (Uint32 i = 0; i<MAX_NODES; i++)
+    {
+      if (c_error_9000_nodes_mask.get(i))
+      {
+	signal->theData[0] = 0;
+	signal->theData[1] = i;
+	EXECUTE_DIRECT(CMVMI, GSN_OPEN_COMREQ, signal, 2);
+      }
+    }
+    c_error_9000_nodes_mask.clear();
+  }
+#endif
+
 #ifdef VM_TRACE
 #if 0
   {
diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
index f6fafdae594..efcb8a30721 100644
--- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
+++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
@@ -100,7 +100,12 @@ public:
   };
 
   struct StartRecord {
-    void reset(){ m_startKey++; m_startNode = 0;}
+    void reset(){ 
+      m_startKey++; 
+      m_startNode = 0; 
+      m_gsn = RNIL; 
+      m_nodes.clearWaitingFor();
+    }
     Uint32 m_startKey;
     Uint32 m_startNode;
     Uint64 m_startTimeout;
@@ -112,6 +117,14 @@ public:
   NdbNodeBitmask c_definedNodes; // DB nodes in config
   NdbNodeBitmask c_clusterNodes; // DB nodes in cluster
   NodeBitmask c_connectedNodes;  // All kinds of connected nodes
+
+  /**
+   * Nodes which we're checking for partitioned cluster
+   *
+   * i.e. nodes that connect to us when we have already elected a president
+   */
+  NdbNodeBitmask c_cmregreq_nodes;
+  
   Uint32 c_maxDynamicId;
   
   // Records
@@ -251,8 +264,10 @@ private:
 
   // Generated statement blocks
   void startphase1(Signal* signal);
-  void electionWon();
+  void electionWon(Signal* signal);
   void cmInfoconf010Lab(Signal* signal);
+  bool check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
+  
   void apiHbHandlingLab(Signal* signal);
   void timerHandlingLab(Signal* signal);
   void hbReceivedLab(Signal* signal);
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
index 70084e6b171..30e7f3f36a7 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
@@ -56,6 +56,33 @@
 #define DEBUG_START3(signal, msg)
 #endif
 
+/**
+ * c_start.m_gsn = GSN_CM_REGREQ
+ *   Possible for all nodes
+ *   c_start.m_nodes contains all nodes in config
+ *
+ * c_start.m_gsn = GSN_CM_NODEINFOREQ;
+ *   Set when receiving CM_REGCONF
+ *   State possible for starting node only (not in cluster)
+ *
+ *   c_start.m_nodes contains all nodes in alive cluster
+ *                   that have not replied to GSN_CM_NODEINFOREQ
+ *                   passed by president in GSN_CM_REGCONF
+ *
+ * c_start.m_gsn = GSN_CM_ADD
+ *   Possible for president only
+ *   Set when receiving and accepting CM_REGREQ (to include node)
+ *
+ *   c_start.m_nodes contains all nodes in alive cluster + starting node
+ *                   that has not replied to GSN_CM_ADD
+ *                   by sending GSN_CM_ACKADD
+ *
+ * c_start.m_gsn = GSN_CM_NODEINFOCONF
+ *   Possible for non presidents only
+ *     c_start.m_nodes contains a node that has been accepted by president
+ *     but has not connected to us yet
+ */
+
 // Signal entries and statement blocks
 /* 4  P R O G R A M        */
 /*******************************/
@@ -259,18 +286,24 @@ void Qmgr::execCONNECT_REP(Signal* signal)
 {
   jamEntry();
   const Uint32 nodeId = signal->theData[0];
+
+  if (ERROR_INSERTED(931))
+  {
+    jam();
+    ndbout_c("Discarding CONNECT_REP(%d)", nodeId);
+    infoEvent("Discarding CONNECT_REP(%d)", nodeId);
+    return;
+  }
+  
   c_connectedNodes.set(nodeId);
   NodeRecPtr nodePtr;
   nodePtr.i = getOwnNodeId();
   ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
   switch(nodePtr.p->phase){
-  case ZSTARTING:
   case ZRUNNING:
+    ndbrequire(!c_clusterNodes.get(nodeId));
+  case ZSTARTING:
     jam();
-    if(!c_start.m_nodes.isWaitingFor(nodeId)){
-      jam();
-      return;
-    }
     break;
   case ZPREPARE_FAIL:
   case ZFAIL_CLOSING:
@@ -282,32 +315,64 @@ void Qmgr::execCONNECT_REP(Signal* signal)
   case ZAPI_INACTIVE:
     return;
   }
-  
+
+  if (getNodeInfo(nodeId).getType() != NodeInfo::DB)
+  {
+    jam();
+    return;
+  }
+
   switch(c_start.m_gsn){
   case GSN_CM_REGREQ:
     jam();
     sendCmRegReq(signal, nodeId);
+
+    /**
+     * We're waiting for CM_REGCONF c_start.m_nodes contains all configured
+     *   nodes
+     */
+    ndbrequire(nodePtr.p->phase == ZSTARTING);
+    ndbrequire(c_start.m_nodes.isWaitingFor(nodeId));
     return;
   case GSN_CM_NODEINFOREQ:
     jam();
-    sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
-    return;
-  case GSN_CM_ADD:{
-    jam();
 
-    ndbrequire(getOwnNodeId() != cpresident);
-    c_start.m_nodes.clearWaitingFor(nodeId);
-    c_start.m_gsn = RNIL;
-    
-    NodeRecPtr addNodePtr;
-    addNodePtr.i = nodeId;
-    ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
-    cmAddPrepare(signal, addNodePtr, nodePtr.p);
+    if (c_start.m_nodes.isWaitingFor(nodeId))
+    {
+      jam();
+      ndbrequire(getOwnNodeId() != cpresident);
+      ndbrequire(nodePtr.p->phase == ZSTARTING);
+      sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
+      return;
+    }
     return;
+  case GSN_CM_NODEINFOCONF:{
+    jam();
+    
+    ndbrequire(getOwnNodeId() != cpresident);
+    ndbrequire(nodePtr.p->phase == ZRUNNING);
+    if (c_start.m_nodes.isWaitingFor(nodeId))
+    {
+      jam();
+      c_start.m_nodes.clearWaitingFor(nodeId);
+      c_start.m_gsn = RNIL;
+      
+      NodeRecPtr addNodePtr;
+      addNodePtr.i = nodeId;
+      ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
+      cmAddPrepare(signal, addNodePtr, nodePtr.p);
+      return;
+    }
   }
   default:
-    return;
+    (void)1;
   }
+  
+  ndbrequire(!c_start.m_nodes.isWaitingFor(nodeId));
+  ndbrequire(!c_cmregreq_nodes.get(nodeId));
+  c_cmregreq_nodes.set(nodeId);
+  sendCmRegReq(signal, nodeId);  
+  c_regReqReqSent--;
   return;
 }//Qmgr::execCONNECT_REP()
 
@@ -601,22 +666,39 @@ void Qmgr::execCM_REGCONF(Signal* signal)
   jamEntry();
 
   const CmRegConf * const cmRegConf = (CmRegConf *)&signal->theData[0];
+  Uint32 presidentNodeId = cmRegConf->presidentNodeId;
+
+  if (check_cmregreq_reply(signal, presidentNodeId, GSN_CM_REGCONF))
+  {
+    jam();
+    return;
+  }
 
   if (!ndbCompatible_ndb_ndb(NDB_VERSION, cmRegConf->presidentVersion)) {
     jam();
     char buf[128];
-    BaseString::snprintf(buf,sizeof(buf),"incompatible version own=0x%x other=0x%x, shutting down", NDB_VERSION, cmRegConf->presidentVersion);
+    BaseString::snprintf(buf,sizeof(buf), 
+			 "incompatible version own=0x%x other=0x%x, "
+			 " shutting down", 
+			 NDB_VERSION, cmRegConf->presidentVersion);
     systemErrorLab(signal, __LINE__, buf);
     return;
   }
 
-
+  myNodePtr.i = getOwnNodeId();
+  ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
+  
+  ndbrequire(c_start.m_gsn == GSN_CM_REGREQ);
+  ndbrequire(myNodePtr.p->phase == ZSTARTING);
+  
   cpdistref    = cmRegConf->presidentBlockRef;
   cpresident   = cmRegConf->presidentNodeId;
   UintR TdynamicId   = cmRegConf->dynamicId;
   c_maxDynamicId = TdynamicId;
   c_clusterNodes.assign(NdbNodeBitmask::Size, cmRegConf->allNdbNodes);
 
+  myNodePtr.p->ndynamicId = TdynamicId;
+  
 /*--------------------------------------------------------------*/
 // Send this as an EVENT REPORT to inform about hearing about
 // other NDB node proclaiming to be president.
@@ -627,10 +709,6 @@ void Qmgr::execCM_REGCONF(Signal* signal)
   signal->theData[3] = TdynamicId;
   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);
 
-  myNodePtr.i = getOwnNodeId();
-  ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
-  myNodePtr.p->ndynamicId = TdynamicId;
-
   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
     jam();
     if (c_clusterNodes.get(nodePtr.i)){
@@ -653,6 +731,134 @@ void Qmgr::execCM_REGCONF(Signal* signal)
   return;
 }//Qmgr::execCM_REGCONF()
 
+bool
+Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
+{
+  NodeRecPtr myNodePtr;
+  myNodePtr.i = getOwnNodeId();
+  ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec);
+  
+  NodeRecPtr nodePtr;
+  nodePtr.i = nodeId;
+  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
+  
+  /**
+   * Try to decide if replying node
+   *   knows who is president
+   */
+  Uint32 president_reply = RNIL;
+  switch(gsn){
+  case GSN_CM_REGREF:{
+    jam();
+    CmRegRef* ref = (CmRegRef*)signal->getDataPtr();
+    switch(ref->errorCode){
+    case CmRegRef::ZBUSY:
+    case CmRegRef::ZBUSY_PRESIDENT:
+    case CmRegRef::ZBUSY_TO_PRES:
+      jam();
+      /**
+       * Only president replies this
+       */
+      ndbrequire(nodeId == ref->presidentCandidate);
+      president_reply = nodeId;
+      break;
+    case CmRegRef::ZNOT_PRESIDENT:
+      jam();
+      president_reply = ref->presidentCandidate;
+      break;
+    case CmRegRef::ZNOT_IN_CFG:
+    case CmRegRef::ZNOT_DEAD:
+    case CmRegRef::ZELECTION:
+      // None of these replies gives certain knowledge of the president
+      jam();
+    }
+    break;
+  }
+  case GSN_CM_REGCONF:
+    jam();
+    president_reply = nodeId;
+    break;
+  }
+  
+  char buf[256];
+  switch(c_start.m_gsn){
+  case GSN_CM_REGREQ:
+    jam();
+    ndbrequire(c_start.m_nodes.isWaitingFor(nodeId));
+    ndbrequire(c_cmregreq_nodes.isclear());    
+    ndbrequire(myNodePtr.p->phase == ZSTARTING);
+    return false;
+  case GSN_CM_NODEINFOREQ:
+    jam();
+
+    ndbrequire(myNodePtr.p->phase == ZSTARTING);
+    if (c_start.m_nodes.isWaitingFor(nodeId))
+    {
+      jam();
+      /**
+       * We're waiting for CM_NODEINFO
+       */
+      if (gsn == GSN_CM_REGREF)
+      {
+	jam();
+	return false;
+      }
+      
+      jam();
+      BaseString::snprintf(buf, sizeof(buf), 
+			   "Partitioned cluster! check StartPartialTimeout, "
+			   " received CM_REGCONF from %d"
+			   " while waiting for GSN_CM_NODEINFOCONF."
+			   " president=%d", 
+			   nodeId, cpresident);
+      goto die_direct;
+    }
+    
+    goto check_reply;
+  default:
+  case GSN_CM_NODEINFOCONF:
+    jam();
+    ndbrequire(myNodePtr.p->phase == ZRUNNING);
+    goto check_reply;
+  }
+  
+check_reply:
+  jam();
+  c_cmregreq_nodes.clear(nodeId);
+  
+  if (gsn == GSN_CM_REGCONF)
+  {
+    jam();
+    BaseString::snprintf(buf, sizeof(buf),
+			 "Partitioned cluster! check StartPartialTimeout, "
+			 " received CM_REGCONF"
+			 " from %d I think president: %d",
+			 nodeId, cpresident);
+    goto die_direct;
+  }
+  
+  if (president_reply != RNIL && president_reply != cpresident)
+  {
+    jam();
+    BaseString::snprintf(buf, sizeof(buf),
+			 "Partitioned cluster! check StartPartialTimeout, "
+			 " received CM_REGREF from %d specifying president as"
+			 " %d, president: %d",
+			 nodeId, president_reply, cpresident);
+    goto die_direct;
+  }
+  
+  return false;
+
+die_direct:
+  ndbout_c(buf);
+  progError(__LINE__, 
+	    ERR_ARBIT_SHUTDOWN, 
+	    buf);
+  
+  ndbrequire(false);
+}
+
 void
 Qmgr::sendCmNodeInfoReq(Signal* signal, Uint32 nodeId, const NodeRec * self){
   CmNodeInfoReq * const req = (CmNodeInfoReq*)signal->getDataPtrSend();
@@ -685,13 +891,21 @@ Qmgr::sendCmNodeInfoReq(Signal* signal, Uint32 nodeId, const NodeRec * self){
 void Qmgr::execCM_REGREF(Signal* signal) 
 {
   jamEntry();
-  c_regReqReqRecv++;
 
-  // Ignore block reference in data[0]
   UintR TaddNodeno = signal->theData[1];
   UintR TrefuseReason = signal->theData[2];
   Uint32 candidate = signal->theData[3];
   DEBUG_START3(signal, TrefuseReason);
+
+  if (check_cmregreq_reply(signal, TaddNodeno, GSN_CM_REGREF))
+  {
+    jam();
+    return;
+  }
+
+  c_regReqReqRecv++;
+
+  // Ignore block reference in data[0]
   
   if(candidate != cpresidentCandidate){
     jam();
@@ -779,7 +993,7 @@ void Qmgr::execCM_REGREF(Signal* signal)
   Uint64 now = NdbTick_CurrentMillisecond();
   if((c_regReqReqRecv == cnoOfNodes) || now > c_stopElectionTime){
     jam();
-    electionWon();
+    electionWon(signal);
     sendSttorryLab(signal);
     
     /**
@@ -793,7 +1007,7 @@ void Qmgr::execCM_REGREF(Signal* signal)
 }//Qmgr::execCM_REGREF()
 
 void
-Qmgr::electionWon(){
+Qmgr::electionWon(Signal* signal){
   NodeRecPtr myNodePtr;
   cpresident = getOwnNodeId(); /* This node becomes president. */
   myNodePtr.i = getOwnNodeId();
@@ -812,6 +1026,12 @@ Qmgr::electionWon(){
   cpresidentAlive = ZTRUE;
   c_stopElectionTime = ~0;
   c_start.reset();
+
+  signal->theData[0] = EventReport::CM_REGCONF;
+  signal->theData[1] = getOwnNodeId();
+  signal->theData[2] = cpresident;
+  signal->theData[3] = 1;
+  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);
 }
 
 /*
@@ -946,7 +1166,7 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){
     ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
     c_start.m_nodes.clearWaitingFor();
     c_start.m_nodes.setWaitingFor(nodePtr.i);
-    c_start.m_gsn = GSN_CM_ADD;
+    c_start.m_gsn = GSN_CM_NODEINFOCONF;
 #else
     warningEvent("Enabling communication to CM_ADD node %u state=%d", 
 		 nodePtr.i,
@@ -1847,7 +2067,8 @@ void Qmgr::execDISCONNECT_REP(Signal* signal)
   const DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
   const Uint32 nodeId = rep->nodeId;
   c_connectedNodes.clear(nodeId);
-
+  c_cmregreq_nodes.clear(nodeId);
+  
   NodeRecPtr nodePtr;
   nodePtr.i = getOwnNodeId();
   ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);

From 96075f47f602c87d8db92e33c789013ca3d10c83 Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Fri, 31 Mar 2006 11:39:35 +0200
Subject: [PATCH 02/10] ndb - bug#16447   correct return value in
 check_cmregreq_reply

---
 ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 10 ++++++++--
 ndb/test/src/NdbRestarts.cpp            |  3 +--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
index 30e7f3f36a7..991e60a3efd 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
@@ -848,10 +848,12 @@ check_reply:
     goto die_direct;
   }
   
-  return false;
+  return true;
 
 die_direct:
   ndbout_c(buf);
+  CRASH_INSERTION(932);
+  
   progError(__LINE__, 
 	    ERR_ARBIT_SHUTDOWN, 
 	    buf);
@@ -2082,9 +2084,13 @@ void Qmgr::execDISCONNECT_REP(Signal* signal)
   case ZFAIL_CLOSING:
   case ZAPI_ACTIVE:
   case ZAPI_INACTIVE:
+  {
+    char buf[100];
+    BaseString::snprintf(buf, 100, "Node %u disconected", nodeId);    
+    progError(__LINE__, ERR_SR_OTHERNODEFAILED, buf);
     ndbrequire(false);
   }
-
+  }
   node_failed(signal, nodeId);
 }//DISCONNECT_REP
 
diff --git a/ndb/test/src/NdbRestarts.cpp b/ndb/test/src/NdbRestarts.cpp
index eea4af437c4..8465caaab48 100644
--- a/ndb/test/src/NdbRestarts.cpp
+++ b/ndb/test/src/NdbRestarts.cpp
@@ -445,8 +445,7 @@ int twoNodeFailure(NdbRestarter& _restarter,
 	 << ") secs " << endl;
   NdbSleep_SecSleep(seconds);
 
-  randomId = (rand() % _restarter.getNumDbNodes());
-  nodeId = _restarter.getDbNodeId(randomId);  
+  nodeId = _restarter.getRandomNodeOtherNodeGroup(nodeId, rand());
   g_info << _restart->m_name << ": node = "<< nodeId << endl;
 
   CHECK(_restarter.insertErrorInNode(nodeId, 9999) == 0,

From 6780538b261059b7e95511e4975b1149e702bf46 Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Fri, 31 Mar 2006 16:36:43 +0200
Subject: [PATCH 03/10] ndb - add support for blocking/unblocking GCP using
 WAIT_GCP_REQ

---
 ndb/include/kernel/signaldata/WaitGCP.hpp |  7 ++++--
 ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 28 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/ndb/include/kernel/signaldata/WaitGCP.hpp b/ndb/include/kernel/signaldata/WaitGCP.hpp
index ebed28714d2..be2a5b9d5f0 100644
--- a/ndb/include/kernel/signaldata/WaitGCP.hpp
+++ b/ndb/include/kernel/signaldata/WaitGCP.hpp
@@ -46,7 +46,9 @@ public:
     Complete = 1,           ///< Wait for a GCP to complete
     CompleteForceStart = 2, ///< Wait for a GCP to complete start one if needed
     CompleteIfRunning = 3,  ///< Wait for ongoing GCP
-    CurrentGCI        = 8   ///< Immediately return current GCI
+    CurrentGCI        = 8,  ///< Immediately return current GCI
+    BlockStartGcp     = 9,
+    UnblockStartGcp   = 10
   };
 
   Uint32 senderRef;
@@ -70,11 +72,12 @@ class WaitGCPConf {
   //friend class Grep::PSCoord;
 
 public:
-  STATIC_CONST( SignalLength = 2 );
+  STATIC_CONST( SignalLength = 3 );
   
 public:
   Uint32 senderData;
   Uint32 gcp;
+  Uint32 blockStatus;
 };
 
 class WaitGCPRef {
diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
index de35ce5c275..3bbf1c76644 100644
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
@@ -14160,11 +14160,36 @@ void Dbdih::execWAIT_GCP_REQ(Signal* signal)
     jam();
     conf->senderData = senderData;
     conf->gcp = cnewgcp;
+    conf->blockStatus = cgcpOrderBlocked;
     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal, 
 	       WaitGCPConf::SignalLength, JBB);
     return;
   }//if
 
+  if (requestType == WaitGCPReq::BlockStartGcp)
+  {
+    jam();
+    conf->senderData = senderData;
+    conf->gcp = cnewgcp;
+    conf->blockStatus = cgcpOrderBlocked;
+    sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal, 
+	       WaitGCPConf::SignalLength, JBB);
+    cgcpOrderBlocked = 1;
+    return;
+  }
+
+  if (requestType == WaitGCPReq::UnblockStartGcp)
+  {
+    jam();
+    conf->senderData = senderData;
+    conf->gcp = cnewgcp;
+    conf->blockStatus = cgcpOrderBlocked;
+    sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal, 
+	       WaitGCPConf::SignalLength, JBB);
+    cgcpOrderBlocked = 0;
+    return;
+  }
+  
   if(isMaster()) {
     /**
      * Master
@@ -14176,6 +14201,7 @@ void Dbdih::execWAIT_GCP_REQ(Signal* signal)
       jam();
       conf->senderData = senderData;
       conf->gcp = coldgcp;
+      conf->blockStatus = cgcpOrderBlocked;
       sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal, 
 		 WaitGCPConf::SignalLength, JBB);
       return;
@@ -14262,6 +14288,7 @@ void Dbdih::execWAIT_GCP_CONF(Signal* signal)
 
   conf->senderData = ptr.p->clientData;
   conf->gcp = gcp;
+  conf->blockStatus = cgcpOrderBlocked;
   sendSignal(ptr.p->clientRef, GSN_WAIT_GCP_CONF, signal,
 	     WaitGCPConf::SignalLength, JBB);
   
@@ -14329,6 +14356,7 @@ void Dbdih::emptyWaitGCPMasterQueue(Signal* signal)
 
     c_waitGCPMasterList.next(ptr);    
     conf->senderData = clientData;
+    conf->blockStatus = cgcpOrderBlocked;
     sendSignal(clientRef, GSN_WAIT_GCP_CONF, signal,
 	       WaitGCPConf::SignalLength, JBB);
     

From bde890effd37961e5e42498a15dedba1f0fc7998 Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Fri, 31 Mar 2006 16:46:28 +0200
Subject: [PATCH 04/10] ndb - bug#18612 (detection of partitioned cluster)  
 this also impl. gcp safe multi node shutdown   1) block gcp   2) wait for
 ongoing gcp   3) inform all stopping QMGR's (so that they don't start with
 error handler)   4) wait for all QMGR's to reply   5) broadcast failrep for
 stopping nodes   6) (if !master died) unblock gcp

---
 .../kernel/signaldata/DumpStateOrd.hpp        |   1 +
 ndb/include/kernel/signaldata/FailRep.hpp     |   6 +-
 ndb/include/kernel/signaldata/StopReq.hpp     |  44 ++-
 ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp     |  11 +
 ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp |   1 +
 ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp | 351 +++++++++++++++---
 ndb/src/kernel/blocks/qmgr/Qmgr.hpp           |   6 +-
 ndb/src/kernel/blocks/qmgr/QmgrInit.cpp       |   2 +
 ndb/src/kernel/blocks/qmgr/QmgrMain.cpp       |  46 ++-
 ndb/test/ndbapi/testNodeRestart.cpp           | 112 +++++-
 10 files changed, 506 insertions(+), 74 deletions(-)

diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp
index b42b930711c..a2993ad5d03 100644
--- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp
+++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp
@@ -64,6 +64,7 @@ public:
     // 19 NDBFS Fipple with O_SYNC, O_CREATE etc.
     // 20-24 BACKUP
     NdbcntrTestStopOnError = 25,
+    NdbcntrStopNodes = 70,
     // 100-105 TUP and ACC  
     // 200-240 UTIL
     // 300-305 TRIX
diff --git a/ndb/include/kernel/signaldata/FailRep.hpp b/ndb/include/kernel/signaldata/FailRep.hpp
index 44577f07fdc..b1c16294e70 100644
--- a/ndb/include/kernel/signaldata/FailRep.hpp
+++ b/ndb/include/kernel/signaldata/FailRep.hpp
@@ -27,6 +27,7 @@ class FailRep {
    * Sender(s) & Reciver(s)
    */
   friend class Qmgr;
+  friend class Ndbcntr;
   
   /**
    * For printing
@@ -43,9 +44,10 @@ public:
     ZSTART_IN_REGREQ=3,
     ZHEARTBEAT_FAILURE=4,
     ZLINK_FAILURE=5,
-    ZOTHERNODE_FAILED_DURING_START=6
+    ZOTHERNODE_FAILED_DURING_START=6,
+    ZMULTI_NODE_SHUTDOWN = 7
   };
-
+  
 private:
   
   Uint32 failNodeId;
diff --git a/ndb/include/kernel/signaldata/StopReq.hpp b/ndb/include/kernel/signaldata/StopReq.hpp
index 8e6a0b90a91..8a9fde75b6c 100644
--- a/ndb/include/kernel/signaldata/StopReq.hpp
+++ b/ndb/include/kernel/signaldata/StopReq.hpp
@@ -32,7 +32,7 @@ class StopReq
   friend class MgmtSrvr;
 
 public:
-  STATIC_CONST( SignalLength = 9 );
+  STATIC_CONST( SignalLength = 9 + NdbNodeBitmask::Size);
   
 public:
   Uint32 senderRef;
@@ -49,29 +49,34 @@ public:
   Int32 readOperationTimeout; // Timeout before read operations are aborted
   Int32 operationTimeout;     // Timeout before all operations are aborted
 
+  Uint32 nodes[NdbNodeBitmask::Size];
+
   static void setSystemStop(Uint32 & requestInfo, bool value);
   static void setPerformRestart(Uint32 & requestInfo, bool value);
   static void setNoStart(Uint32 & requestInfo, bool value);
   static void setInitialStart(Uint32 & requestInfo, bool value);
-  static void setEscalateOnNodeFail(Uint32 & requestInfo, bool value);
   /**
    * Don't perform "graceful" shutdown/restart...
    */
   static void setStopAbort(Uint32 & requestInfo, bool value);
+  static void setStopNodes(Uint32 & requestInfo, bool value);
 
   static bool getSystemStop(const Uint32 & requestInfo);
   static bool getPerformRestart(const Uint32 & requestInfo);
   static bool getNoStart(const Uint32 & requestInfo);
   static bool getInitialStart(const Uint32 & requestInfo);
-  static bool getEscalateOnNodeFail(const Uint32 & requestInfo);
   static bool getStopAbort(const Uint32 & requestInfo);
+  static bool getStopNodes(const Uint32 & requestInfo);
 };
 
 struct StopConf
 {
   STATIC_CONST( SignalLength = 2 );
   Uint32 senderData;
-  Uint32 nodeState;
+  union {
+    Uint32 nodeState;
+    Uint32 nodeId;
+  };
 };
 
 class StopRef 
@@ -94,7 +99,9 @@ public:
     NodeShutdownInProgress = 1,
     SystemShutdownInProgress = 2,
     NodeShutdownWouldCauseSystemCrash = 3,
-    TransactionAbortFailed = 4
+    TransactionAbortFailed = 4,
+    UnsupportedNodeShutdown = 5,
+    MultiNodeShutdownNotMaster = 6
   };
   
 public:
@@ -132,16 +139,16 @@ StopReq::getInitialStart(const Uint32 & requestInfo)
 
 inline
 bool
-StopReq::getEscalateOnNodeFail(const Uint32 & requestInfo)
+StopReq::getStopAbort(const Uint32 & requestInfo)
 {
-  return requestInfo & 16;
+  return requestInfo & 32;
 }
 
 inline
 bool
-StopReq::getStopAbort(const Uint32 & requestInfo)
+StopReq::getStopNodes(const Uint32 & requestInfo)
 {
-  return requestInfo & 32;
+  return requestInfo & 64;
 }
 
 
@@ -185,16 +192,6 @@ StopReq::setInitialStart(Uint32 & requestInfo, bool value)
     requestInfo &= ~8;
 }
 
-inline
-void
-StopReq::setEscalateOnNodeFail(Uint32 & requestInfo, bool value)
-{
-  if(value)
-    requestInfo |= 16;
-  else
-    requestInfo &= ~16;
-}
-
 inline
 void
 StopReq::setStopAbort(Uint32 & requestInfo, bool value)
@@ -205,6 +202,15 @@ StopReq::setStopAbort(Uint32 & requestInfo, bool value)
     requestInfo &= ~32;
 }
 
+inline
+void
+StopReq::setStopNodes(Uint32 & requestInfo, bool value)
+{
+  if(value)
+    requestInfo |= 64;
+  else
+    requestInfo &= ~64;
+}
 
 #endif
 
diff --git a/ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp b/ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp
index 657133bda36..ae40a7c4581 100644
--- a/ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp
+++ b/ndb/src/kernel/blocks/ndbcntr/Ndbcntr.hpp
@@ -202,6 +202,7 @@ private:
   void execWAIT_GCP_CONF(Signal* signal);
 
   void execSTOP_REQ(Signal* signal);
+  void execSTOP_CONF(Signal* signal);
   void execRESUME_REQ(Signal* signal);
 
   void execCHANGE_NODE_STATE_CONF(Signal* signal);
@@ -337,6 +338,16 @@ public:
     void progError(int line, int cause, const char * extra) { 
       cntr.progError(line, cause, extra); 
     }
+
+    enum StopNodesStep {
+      SR_BLOCK_GCP_START_GCP = 0,
+      SR_WAIT_COMPLETE_GCP = 1,
+      SR_UNBLOCK_GCP_START_GCP = 2,
+      SR_QMGR_STOP_REQ = 3,
+      SR_WAIT_NODE_FAILURES = 4,
+      SR_CLUSTER_SHUTDOWN = 12
+    } m_state;
+    SignalCounter m_stop_req_counter;
   };
 private:
   StopRecord c_stopRec;
diff --git a/ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp b/ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp
index 97ca3f44b3a..cb20fb2ca22 100644
--- a/ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp
+++ b/ndb/src/kernel/blocks/ndbcntr/NdbcntrInit.cpp
@@ -86,6 +86,7 @@ Ndbcntr::Ndbcntr(const class Configuration & conf):
   addRecSignal(GSN_STOP_ME_CONF, &Ndbcntr::execSTOP_ME_CONF);
 
   addRecSignal(GSN_STOP_REQ, &Ndbcntr::execSTOP_REQ);
+  addRecSignal(GSN_STOP_CONF, &Ndbcntr::execSTOP_CONF);
   addRecSignal(GSN_RESUME_REQ, &Ndbcntr::execRESUME_REQ);
 
   addRecSignal(GSN_WAIT_GCP_REF, &Ndbcntr::execWAIT_GCP_REF);
diff --git a/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp b/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
index e3ec1f9723e..5a841d6f836 100644
--- a/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
+++ b/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp
@@ -42,6 +42,8 @@
 #include <signaldata/FsRemoveReq.hpp>
 #include <signaldata/ReadConfig.hpp>
 
+#include <signaldata/FailRep.hpp>
+
 #include <AttributeHeader.hpp>
 #include <Configuration.hpp>
 #include <DebuggerNames.hpp>
@@ -1454,13 +1456,74 @@ void Ndbcntr::execNODE_FAILREP(Signal* signal)
   sendSignal(SUMA_REF, GSN_NODE_FAILREP, signal,
 	     NodeFailRep::SignalLength, JBB);
 
+  if (c_stopRec.stopReq.senderRef)
+  {
+    jam();
+    switch(c_stopRec.m_state){
+    case StopRecord::SR_WAIT_NODE_FAILURES:
+    {
+      jam();
+      NdbNodeBitmask tmp;
+      tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      tmp.bitANDC(allFailed);      
+      tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      
+      if (tmp.isclear())
+      {
+	jam();
+	if (c_stopRec.stopReq.senderRef != RNIL)
+	{
+	  jam();
+	  StopConf * const stopConf = (StopConf *)&signal->theData[0];
+	  stopConf->senderData = c_stopRec.stopReq.senderData;
+	  stopConf->nodeState  = (Uint32) NodeState::SL_SINGLEUSER;
+	  sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_CONF, signal, 
+		     StopConf::SignalLength, JBB);
+	}
+
+	c_stopRec.stopReq.senderRef = 0;
+	WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+	req->senderRef = reference();
+	req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
+	req->requestType = WaitGCPReq::UnblockStartGcp;
+	sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+		   WaitGCPReq::SignalLength, JBA);
+      }
+      break;
+    }
+    case StopRecord::SR_QMGR_STOP_REQ:
+    {
+      NdbNodeBitmask tmp;
+      tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      tmp.bitANDC(allFailed);      
+
+      if (tmp.isclear())
+      {
+	Uint32 nodeId = allFailed.find(0);
+	tmp.set(nodeId);
+
+	StopConf* conf = (StopConf*)signal->getDataPtrSend();
+	conf->senderData = c_stopRec.stopReq.senderData;
+	conf->nodeId = nodeId;
+	sendSignal(reference(), 
+		   GSN_STOP_CONF, signal, StopConf::SignalLength, JBB);
+      }
+
+      tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+      
+      break;
+    }
+    }
+  }
+  
+  signal->theData[0] = EventReport::NODE_FAILREP;
+  signal->theData[2] = 0;
+  
   Uint32 nodeId = 0;
   while(!allFailed.isclear()){
     nodeId = allFailed.find(nodeId + 1);
     allFailed.clear(nodeId);
-    signal->theData[0] = EventReport::NODE_FAILREP;
     signal->theData[1] = nodeId;
-    signal->theData[2] = 0;
     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
   }//for
 
@@ -1908,13 +1971,15 @@ void
 Ndbcntr::execDUMP_STATE_ORD(Signal* signal)
 {
   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
-  if(signal->theData[0] == 13){
+  Uint32 arg = dumpState->args[0];
+
+  if(arg == 13){
     infoEvent("Cntr: cstartPhase = %d, cinternalStartphase = %d, block = %d", 
 	      cstartPhase, cinternalStartphase, cndbBlocksCount);
     infoEvent("Cntr: cmasterNodeId = %d", cmasterNodeId);
   }
 
-  if (dumpState->args[0] == DumpStateOrd::NdbcntrTestStopOnError){
+  if (arg == DumpStateOrd::NdbcntrTestStopOnError){
     if (theConfiguration.stopOnError() == true)
       ((Configuration&)theConfiguration).stopOnError(false);
     
@@ -1927,6 +1992,28 @@ Ndbcntr::execDUMP_STATE_ORD(Signal* signal)
 	       SystemError::SignalLength, JBA);
   }
 
+  if (arg == DumpStateOrd::NdbcntrStopNodes)
+  {
+    NdbNodeBitmask mask;
+    for(Uint32 i = 1; i<signal->getLength(); i++)
+      mask.set(signal->theData[i]);
+
+    StopReq* req = (StopReq*)signal->getDataPtrSend();
+    req->senderRef = RNIL;
+    req->senderData = 123;
+    req->requestInfo = 0;
+    req->singleuser = 0;
+    req->singleUserApi = 0;
+    mask.copyto(NdbNodeBitmask::Size, req->nodes);
+    StopReq::setPerformRestart(req->requestInfo, 1);
+    StopReq::setNoStart(req->requestInfo, 1);
+    StopReq::setStopNodes(req->requestInfo, 1);
+    StopReq::setStopAbort(req->requestInfo, 1);
+    
+    sendSignal(reference(), GSN_STOP_REQ, signal,
+	       StopReq::SignalLength, JBB);
+    return;
+  }
 
 }//Ndbcntr::execDUMP_STATE_ORD()
 
@@ -1987,9 +2074,12 @@ Ndbcntr::execSTOP_REQ(Signal* signal){
   Uint32 senderData = req->senderData;
   BlockReference senderRef = req->senderRef;
   bool abort = StopReq::getStopAbort(req->requestInfo);
+  bool stopnodes = StopReq::getStopNodes(req->requestInfo);
 
-  if(getNodeState().startLevel < NodeState::SL_STARTED || 
-     abort && !singleuser){
+  if(!singleuser && 
+     (getNodeState().startLevel < NodeState::SL_STARTED || 
+      (abort && !stopnodes)))
+  {
     /**
      * Node is not started yet
      *
@@ -2028,21 +2118,71 @@ Ndbcntr::execSTOP_REQ(Signal* signal){
     else
       ref->errorCode = StopRef::NodeShutdownInProgress;
     ref->senderData = senderData;
-    sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+    
+    if (senderRef != RNIL)
+      sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+    return;
+  }
+
+  if (stopnodes && !abort)
+  {
+    jam();
+    ref->errorCode = StopRef::UnsupportedNodeShutdown;
+    ref->senderData = senderData;
+    if (senderRef != RNIL)
+      sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+    return;
+  }
+
+  if (stopnodes && cmasterNodeId != getOwnNodeId())
+  {
+    jam();
+    ref->errorCode = StopRef::MultiNodeShutdownNotMaster;
+    ref->senderData = senderData;
+    if (senderRef != RNIL)
+      sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
     return;
   }
   
   c_stopRec.stopReq = * req;
   c_stopRec.stopInitiatedTime = NdbTick_CurrentMillisecond();
   
-  if(!singleuser) {
-    if(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo)) {
+  if (stopnodes)
+  {
+    jam();
+
+    if(!c_stopRec.checkNodeFail(signal))
+    {
       jam();
-      if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
+      return;
+    }
+
+    char buf[100];
+    NdbNodeBitmask mask;
+    mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+    infoEvent("Initiating shutdown abort of %s", mask.getText(buf));
+    ndbout_c("Initiating shutdown abort of %s", mask.getText(buf));    
+
+    WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+    req->senderRef = reference();
+    req->senderData = StopRecord::SR_BLOCK_GCP_START_GCP;
+    req->requestType = WaitGCPReq::BlockStartGcp;
+    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+	       WaitGCPReq::SignalLength, JBB);
+    return;
+  }
+  else if(!singleuser) 
+  {
+    if(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo)) 
+    {
+      jam();
+      if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo))
+      {
 	((Configuration&)theConfiguration).stopOnError(false);
       }
     }
-    if(!c_stopRec.checkNodeFail(signal)){
+    if(!c_stopRec.checkNodeFail(signal))
+    {
       jam();
       return;
     }
@@ -2112,7 +2252,17 @@ Ndbcntr::StopRecord::checkNodeFail(Signal* signal){
    */
   NodeBitmask ndbMask; 
   ndbMask.assign(cntr.c_startedNodes);
-  ndbMask.clear(cntr.getOwnNodeId());
+
+  if (StopReq::getStopNodes(stopReq.requestInfo))
+  {
+    NdbNodeBitmask tmp;
+    tmp.assign(NdbNodeBitmask::Size, stopReq.nodes);
+    ndbMask.bitANDC(tmp);
+  }
+  else
+  {
+    ndbMask.clear(cntr.getOwnNodeId());
+  }
   
   CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
   sd->blockRef = cntr.reference();
@@ -2134,7 +2284,8 @@ Ndbcntr::StopRecord::checkNodeFail(Signal* signal){
   ref->errorCode = StopRef::NodeShutdownWouldCauseSystemCrash;
   
   const BlockReference bref = stopReq.senderRef;
-  cntr.sendSignal(bref, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
+  if (bref != RNIL)
+    cntr.sendSignal(bref, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
   
   stopReq.senderRef = 0;
 
@@ -2184,23 +2335,23 @@ Ndbcntr::StopRecord::checkTcTimeout(Signal* signal){
     if(stopReq.getSystemStop(stopReq.requestInfo)  || stopReq.singleuser){
       jam();
       if(stopReq.singleuser) 
-	{
-	  jam();
-	   AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
-	   req->senderRef = cntr.reference();
-	   req->senderData = 12;
-	   cntr.sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal, 
-		      AbortAllReq::SignalLength, JBB);
-	} 
+      {
+	jam();
+	AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
+	req->senderRef = cntr.reference();
+	req->senderData = 12;
+	cntr.sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal, 
+			AbortAllReq::SignalLength, JBB);
+      } 
       else
-	{
-	  WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
-	  req->senderRef = cntr.reference();
-	  req->senderData = 12;
-	  req->requestType = WaitGCPReq::CompleteForceStart;
-	  cntr.sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
-			  WaitGCPReq::SignalLength, JBB);
-	}
+      {
+	WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+	req->senderRef = cntr.reference();
+	req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
+	req->requestType = WaitGCPReq::CompleteForceStart;
+	cntr.sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+			WaitGCPReq::SignalLength, JBB);
+      }
     } else {
       jam();
       StopPermReq * req = (StopPermReq*)&signal->theData[0];
@@ -2362,7 +2513,7 @@ void Ndbcntr::execWAIT_GCP_REF(Signal* signal){
 
   WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
   req->senderRef = reference();
-  req->senderData = 12;
+  req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
   req->requestType = WaitGCPReq::CompleteForceStart;
   sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
 	     WaitGCPReq::SignalLength, JBB);
@@ -2371,29 +2522,129 @@ void Ndbcntr::execWAIT_GCP_REF(Signal* signal){
 void Ndbcntr::execWAIT_GCP_CONF(Signal* signal){
   jamEntry();
 
-  ndbrequire(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
-  NodeState newState(NodeState::SL_STOPPING_3, true); 
+  WaitGCPConf* conf = (WaitGCPConf*)signal->getDataPtr();
 
-  /**
-   * Inform QMGR so that arbitrator won't kill us
-   */
-  NodeStateRep * rep = (NodeStateRep *)&signal->theData[0];
-  rep->nodeState = newState;
-  rep->nodeState.masterNodeId = cmasterNodeId;
-  rep->nodeState.setNodeGroup(c_nodeGroup);
-  EXECUTE_DIRECT(QMGR, GSN_NODE_STATE_REP, signal, NodeStateRep::SignalLength);
+  switch(conf->senderData){
+  case StopRecord::SR_BLOCK_GCP_START_GCP:
+  {
+    jam();
+    /**
+     * 
+     */
+    if(!c_stopRec.checkNodeFail(signal))
+    {
+      jam();
+      goto unblock;
+    }
+    
+    WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+    req->senderRef = reference();
+    req->senderData = StopRecord::SR_WAIT_COMPLETE_GCP;
+    req->requestType = WaitGCPReq::CompleteIfRunning;
 
-  if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
-    jam();
-    StartOrd * startOrd = (StartOrd *)&signal->theData[0];
-    startOrd->restartInfo = c_stopRec.stopReq.requestInfo;
-    sendSignalWithDelay(CMVMI_REF, GSN_START_ORD, signal, 500, 
-			StartOrd::SignalLength);
-  } else {
-    jam();
-    sendSignalWithDelay(CMVMI_REF, GSN_STOP_ORD, signal, 500, 1);
+    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+	       WaitGCPReq::SignalLength, JBB);
+    return;
+  }
+  case StopRecord::SR_UNBLOCK_GCP_START_GCP:
+  {
+    jam();
+    return;
+  }
+  case StopRecord::SR_WAIT_COMPLETE_GCP:
+  {
+    jam();
+    if(!c_stopRec.checkNodeFail(signal))
+    {
+      jam();
+      goto unblock;
+    }
+
+    NdbNodeBitmask tmp;
+    tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+    c_stopRec.m_stop_req_counter = tmp;
+    NodeReceiverGroup rg(QMGR, tmp);
+    StopReq * stopReq = (StopReq *)&signal->theData[0];
+    * stopReq = c_stopRec.stopReq;
+    stopReq->senderRef = reference();
+    sendSignal(rg, GSN_STOP_REQ, signal, StopReq::SignalLength, JBA);
+    c_stopRec.m_state = StopRecord::SR_QMGR_STOP_REQ; 
+    return;
+  }
+  case StopRecord::SR_CLUSTER_SHUTDOWN:
+  {
+    jam();
+    break;
+  }
+  }
+  
+  {  
+    ndbrequire(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
+    NodeState newState(NodeState::SL_STOPPING_3, true); 
+    
+    /**
+     * Inform QMGR so that arbitrator won't kill us
+     */
+    NodeStateRep * rep = (NodeStateRep *)&signal->theData[0];
+    rep->nodeState = newState;
+    rep->nodeState.masterNodeId = cmasterNodeId;
+    rep->nodeState.setNodeGroup(c_nodeGroup);
+    EXECUTE_DIRECT(QMGR, GSN_NODE_STATE_REP, signal, 
+		   NodeStateRep::SignalLength);
+    
+    if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
+      jam();
+      StartOrd * startOrd = (StartOrd *)&signal->theData[0];
+      startOrd->restartInfo = c_stopRec.stopReq.requestInfo;
+      sendSignalWithDelay(CMVMI_REF, GSN_START_ORD, signal, 500, 
+			  StartOrd::SignalLength);
+    } else {
+      jam();
+      sendSignalWithDelay(CMVMI_REF, GSN_STOP_ORD, signal, 500, 1);
+    }
+    return;
+  }
+  
+unblock:
+  WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
+  req->senderRef = reference();
+  req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
+  req->requestType = WaitGCPReq::UnblockStartGcp;
+  sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 
+	     WaitGCPReq::SignalLength, JBB);
+}
+
+void
+Ndbcntr::execSTOP_CONF(Signal* signal)
+{
+  jamEntry();
+  StopConf *conf = (StopConf*)signal->getDataPtr();
+  ndbrequire(c_stopRec.m_state == StopRecord::SR_QMGR_STOP_REQ);
+  c_stopRec.m_stop_req_counter.clearWaitingFor(conf->nodeId);
+  if (c_stopRec.m_stop_req_counter.done())
+  {
+    char buf[100];
+    NdbNodeBitmask mask;
+    mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
+    infoEvent("Stopping of %s", mask.getText(buf));
+    ndbout_c("Stopping of %s", mask.getText(buf));    
+
+    /**
+     * Kill any node...
+     */
+    FailRep * const failRep = (FailRep *)&signal->theData[0];
+    failRep->failCause = FailRep::ZMULTI_NODE_SHUTDOWN;
+    NodeReceiverGroup rg(QMGR, c_clusterNodes);
+    Uint32 nodeId = 0;
+    while ((nodeId = NdbNodeBitmask::find(c_stopRec.stopReq.nodes, nodeId+1))
+	   != NdbNodeBitmask::NotFound)
+    {
+      failRep->failNodeId = nodeId;
+      sendSignal(rg, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
+    }
+    c_stopRec.m_state = StopRecord::SR_WAIT_NODE_FAILURES;
+    return;
   }
-  return;
 }
 
 void Ndbcntr::execSTTORRY(Signal* signal){
diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
index efcb8a30721..3b623b36206 100644
--- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
+++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
@@ -29,6 +29,7 @@
 #include <signaldata/CmRegSignalData.hpp>
 #include <signaldata/ApiRegSignalData.hpp>
 #include <signaldata/FailRep.hpp>
+#include <signaldata/StopReq.hpp>
 
 #include "timer.hpp"
 
@@ -218,6 +219,7 @@ private:
   void execPRES_TOCONF(Signal* signal);
   void execDISCONNECT_REP(Signal* signal);
   void execSYSTEM_ERROR(Signal* signal);
+  void execSTOP_REQ(Signal* signal);
 
   // Received signals
   void execDUMP_STATE_ORD(Signal* signal);
@@ -402,7 +404,9 @@ private:
   Uint16 cfailedNodes[MAX_NDB_NODES];
   Uint16 cprepFailedNodes[MAX_NDB_NODES];
   Uint16 ccommitFailedNodes[MAX_NDB_NODES];
-
+  
+  StopReq c_stopReq;
+  void check_multi_node_shutdown(Signal* signal);
 };
 
 #endif
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
index 43d8f0971ed..ade880b7e4a 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
@@ -35,6 +35,7 @@ void Qmgr::initData()
 
   Uint32 hbDBAPI = 500;
   setHbApiDelay(hbDBAPI);
+  c_stopReq.senderRef = 0;
 }//Qmgr::initData()
 
 void Qmgr::initRecords() 
@@ -49,6 +50,7 @@ Qmgr::Qmgr(const class Configuration & conf)
 
   // Transit signals
   addRecSignal(GSN_DUMP_STATE_ORD, &Qmgr::execDUMP_STATE_ORD);
+  addRecSignal(GSN_STOP_REQ, &Qmgr::execSTOP_REQ);
   addRecSignal(GSN_DEBUG_SIG, &Qmgr::execDEBUG_SIG);
   addRecSignal(GSN_CONTINUEB, &Qmgr::execCONTINUEB);
   addRecSignal(GSN_CM_HEARTBEAT, &Qmgr::execCM_HEARTBEAT);
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
index 991e60a3efd..03f6fa2ae87 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
@@ -2342,6 +2342,9 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
 
   failedNodePtr.i = aFailedNode;
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+
+  check_multi_node_shutdown(signal);
+  
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
 
@@ -2433,7 +2436,9 @@ void Qmgr::execPREP_FAILREQ(Signal* signal)
 {
   NodeRecPtr myNodePtr;
   jamEntry();
-
+  
+  check_multi_node_shutdown(signal);
+  
   PrepFailReqRef * const prepFail = (PrepFailReqRef *)&signal->theData[0];
 
   BlockReference Tblockref  = prepFail->xxxBlockRef;
@@ -4085,6 +4090,8 @@ Qmgr::stateArbitCrash(Signal* signal)
   if (! (arbitRec.getTimediff() > getArbitTimeout()))
     return;
 #endif
+  CRASH_INSERTION(932);
+
   progError(__LINE__, ERR_ARBIT_SHUTDOWN, "Arbitrator decided to shutdown this node");
 }
 
@@ -4245,3 +4252,40 @@ Qmgr::execAPI_BROADCAST_REP(Signal* signal)
   NodeReceiverGroup rg(API_CLUSTERMGR, mask);
   sendSignal(rg, api.gsn, signal, len, JBB); // forward sections
 }
+
+void
+Qmgr::execSTOP_REQ(Signal* signal)
+{
+  jamEntry();
+  c_stopReq = * (StopReq*)signal->getDataPtr();
+
+  if (c_stopReq.senderRef)
+  {
+    ndbrequire(NdbNodeBitmask::get(c_stopReq.nodes, getOwnNodeId()));
+    
+    StopConf *conf = (StopConf*)signal->getDataPtrSend();
+    conf->senderData = c_stopReq.senderData;
+    conf->nodeState = getOwnNodeId();
+    sendSignal(c_stopReq.senderRef, 
+	       GSN_STOP_CONF, signal, StopConf::SignalLength, JBA);
+  }
+}
+
+void
+Qmgr::check_multi_node_shutdown(Signal* signal)
+{
+  if (c_stopReq.senderRef && 
+      NdbNodeBitmask::get(c_stopReq.nodes, getOwnNodeId()))
+  {
+    jam();
+    if(StopReq::getPerformRestart(c_stopReq.requestInfo))
+    {
+      jam();
+      StartOrd * startOrd = (StartOrd *)&signal->theData[0];
+      startOrd->restartInfo = c_stopReq.requestInfo;
+      EXECUTE_DIRECT(CMVMI, GSN_START_ORD, signal, 2);
+    } else {
+      EXECUTE_DIRECT(CMVMI, GSN_STOP_ORD, signal, 1);
+    }
+  }
+}
diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
index 365d6e3ed6e..5f577b77f34 100644
--- a/ndb/test/ndbapi/testNodeRestart.cpp
+++ b/ndb/test/ndbapi/testNodeRestart.cpp
@@ -22,7 +22,7 @@
 #include <NdbRestarts.hpp>
 #include <Vector.hpp>
 #include <signaldata/DumpStateOrd.hpp>
-
+#include <Bitmask.hpp>
 
 int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){
 
@@ -669,6 +669,110 @@ err:
   return NDBT_FAILED;    
 }
 
+int 
+runBug18612(NDBT_Context* ctx, NDBT_Step* step){
+
+  // Assume two replicas
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+
+  Uint32 cnt = restarter.getNumDbNodes();
+
+  for(int loop = 0; loop < ctx->getNumLoops(); loop++)
+  {
+    int partition0[256];
+    int partition1[256];
+    bzero(partition0, sizeof(partition0));
+    bzero(partition1, sizeof(partition1));
+    Bitmask<4> nodesmask;
+    
+    Uint32 node1 = restarter.getDbNodeId(rand()%cnt);
+    for (Uint32 i = 0; i<cnt/2; i++)
+    {
+      do { 
+	node1 = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+      } while(nodesmask.get(node1));
+      
+      partition0[i] = node1;
+      partition1[i] = restarter.getRandomNodeSameNodeGroup(node1, rand());
+      
+      ndbout_c("nodes %d %d", node1, partition1[i]);
+      
+      assert(!nodesmask.get(node1));
+      assert(!nodesmask.get(partition1[i]));
+      nodesmask.set(node1);
+      nodesmask.set(partition1[i]);
+    } 
+    
+    ndbout_c("done");
+
+    int dump[255];
+    dump[0] = DumpStateOrd::NdbcntrStopNodes;
+    memcpy(dump + 1, partition0, sizeof(int)*cnt/2);
+    
+    Uint32 master = restarter.getMasterNodeId();
+    
+    if (restarter.dumpStateOneNode(master, dump, 1+cnt/2))
+      return NDBT_FAILED;
+    
+    if (restarter.waitNodesNoStart(partition0, cnt/2))
+      return NDBT_FAILED;
+
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+    
+    if (restarter.dumpStateAllNodes(val2, 2))
+      return NDBT_FAILED;
+    
+    if (restarter.insertErrorInAllNodes(932))
+      return NDBT_FAILED;
+
+    dump[0] = 9000;
+    memcpy(dump + 1, partition0, sizeof(int)*cnt/2);    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateOneNode(partition1[i], dump, 1+cnt/2))
+	return NDBT_FAILED;
+
+    dump[0] = 9000;
+    memcpy(dump + 1, partition1, sizeof(int)*cnt/2);    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateOneNode(partition0[i], dump, 1+cnt/2))
+	return NDBT_FAILED;
+    
+    if (restarter.startNodes(partition0, cnt/2))
+      return NDBT_FAILED;
+    
+    if (restarter.waitNodesStartPhase(partition0, cnt/2, 2))
+      return NDBT_FAILED;
+    
+    dump[0] = 9001;
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateAllNodes(dump, 2))
+	return NDBT_FAILED;
+
+    if (restarter.waitClusterNoStart())
+      return NDBT_FAILED;
+    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.restartOneDbNode(partition0[i], true, true, true))
+	return NDBT_FAILED;
+
+    if (restarter.waitNodesNoStart(partition0, cnt/2))
+      return NDBT_FAILED;
+    
+    if (restarter.startAll())
+      return NDBT_FAILED;
+
+    if (restarter.waitClusterStarted())
+      return NDBT_FAILED;
+  }
+  return NDBT_OK;
+}
+
+
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
 	 "Test that one node at a time can be stopped and then restarted "\
@@ -963,6 +1067,12 @@ TESTCASE("Bug18414",
   STEP(runBug18414);
   FINALIZER(runClearTable);
 }
+TESTCASE("Bug18612",
+	 "Test bug with partitioned clusters"){
+  INITIALIZER(runLoadTable);
+  STEP(runBug18612);
+  FINALIZER(runClearTable);
+}
 NDBT_TESTSUITE_END(testNodeRestart);
 
 int main(int argc, const char** argv){

From d367f635e9310365c8d5893f2dc9c8816953672d Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Fri, 31 Mar 2006 18:53:07 +0200
Subject: [PATCH 05/10] ndb - autotest   add new testprogram for bug#18612 to
 autotest

---
 ndb/test/run-test/daily-basic-tests.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt
index ce5462d11c9..1e9bad1b969 100644
--- a/ndb/test/run-test/daily-basic-tests.txt
+++ b/ndb/test/run-test/daily-basic-tests.txt
@@ -458,10 +458,14 @@ args: -n Bug16772 T1
 #cmd: testSystemRestart
 #args: -n Bug18385 T1
 #
-max-time: 500
+max-time: 1000
 cmd: testNodeRestart
 args: -n Bug18414 T1
 
+max-time: 500
+cmd: testNodeRestart
+args: -n Bug18612 T1
+
 # OLD FLEX
 max-time: 500
 cmd: flexBench

From 1aa9a95065cad59795076c17fe35edfd6f86deef Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Mon, 3 Apr 2006 11:26:29 +0200
Subject: [PATCH 06/10] ndb - bug#18612   post weekend fixes :-)   change impl.
 to use READ_NODESREQ to query the state of the other qmgr (partition), as it
 has no (current) side effects, so that it's possible to kill only the
 starting cluster (if one is started and one is starting)

---
 ndb/include/kernel/signaldata/FailRep.hpp |  11 +-
 ndb/src/kernel/blocks/qmgr/Qmgr.hpp       |   7 +-
 ndb/src/kernel/blocks/qmgr/QmgrInit.cpp   |   3 +
 ndb/src/kernel/blocks/qmgr/QmgrMain.cpp   | 230 ++++++++++------------
 ndb/test/ndbapi/testNodeRestart.cpp       |   4 +-
 5 files changed, 126 insertions(+), 129 deletions(-)

diff --git a/ndb/include/kernel/signaldata/FailRep.hpp b/ndb/include/kernel/signaldata/FailRep.hpp
index b1c16294e70..f575d99e865 100644
--- a/ndb/include/kernel/signaldata/FailRep.hpp
+++ b/ndb/include/kernel/signaldata/FailRep.hpp
@@ -36,7 +36,8 @@ class FailRep {
 
 public:
   STATIC_CONST( SignalLength = 2 );
-
+  STATIC_CONST( ExtraLength = 1 + NdbNodeBitmask::Size );
+  
   enum FailCause {
     ZOWN_FAILURE=0,
     ZOTHER_NODE_WHEN_WE_START=1,
@@ -45,13 +46,19 @@ public:
     ZHEARTBEAT_FAILURE=4,
     ZLINK_FAILURE=5,
     ZOTHERNODE_FAILED_DURING_START=6,
-    ZMULTI_NODE_SHUTDOWN = 7
+    ZMULTI_NODE_SHUTDOWN = 7,
+    ZPARTITIONED_CLUSTER = 8
   };
   
 private:
   
   Uint32 failNodeId;
   Uint32 failCause;
+  /**
+   * Used when failCause == ZPARTITIONED_CLUSTER
+   */
+  Uint32 president;
+  Uint32 partition[NdbNodeBitmask::Size];
 };
 
 
diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
index 3b623b36206..07e6a2a10c1 100644
--- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
+++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
@@ -124,7 +124,7 @@ public:
    *
    * i.e. nodes that connect to use, when we already have elected president
    */
-  NdbNodeBitmask c_cmregreq_nodes;
+  NdbNodeBitmask c_readnodes_nodes;
   
   Uint32 c_maxDynamicId;
   
@@ -233,6 +233,8 @@ private:
   void execREAD_NODESREQ(Signal* signal);
   void execSET_VAR_REQ(Signal* signal);
 
+  void execREAD_NODESREF(Signal* signal);
+  void execREAD_NODESCONF(Signal* signal);
 
   void execAPI_VERSION_REQ(Signal* signal);
   void execAPI_BROADCAST_REP(Signal* signal);
@@ -249,6 +251,8 @@ private:
   void execARBIT_STOPREP(Signal* signal);
 
   // Statement blocks
+  void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
+
   void node_failed(Signal* signal, Uint16 aFailedNode);
   void checkStartInterface(Signal* signal);
   void failReport(Signal* signal,
@@ -268,7 +272,6 @@ private:
   void startphase1(Signal* signal);
   void electionWon(Signal* signal);
   void cmInfoconf010Lab(Signal* signal);
-  bool check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
   
   void apiHbHandlingLab(Signal* signal);
   void timerHandlingLab(Signal* signal);
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
index ade880b7e4a..a8fe30d8cfa 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
@@ -94,6 +94,9 @@ Qmgr::Qmgr(const class Configuration & conf)
   addRecSignal(GSN_ARBIT_CHOOSEREF, &Qmgr::execARBIT_CHOOSEREF);
   addRecSignal(GSN_ARBIT_STOPREP, &Qmgr::execARBIT_STOPREP);
 
+  addRecSignal(GSN_READ_NODESREF, &Qmgr::execREAD_NODESREF);
+  addRecSignal(GSN_READ_NODESCONF, &Qmgr::execREAD_NODESCONF);
+  
   initData();
 }//Qmgr::Qmgr()
 
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
index 03f6fa2ae87..c17922dff48 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
@@ -369,13 +369,29 @@ void Qmgr::execCONNECT_REP(Signal* signal)
   }
   
   ndbrequire(!c_start.m_nodes.isWaitingFor(nodeId));
-  ndbrequire(!c_cmregreq_nodes.get(nodeId));
-  c_cmregreq_nodes.set(nodeId);
-  sendCmRegReq(signal, nodeId);  
-  c_regReqReqSent--;
+  ndbrequire(!c_readnodes_nodes.get(nodeId));
+  c_readnodes_nodes.set(nodeId);
+  signal->theData[0] = reference();
+  sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA);
   return;
 }//Qmgr::execCONNECT_REP()
 
+void
+Qmgr::execREAD_NODESCONF(Signal* signal)
+{
+  check_readnodes_reply(signal, 
+			refToNode(signal->getSendersBlockRef()),
+			GSN_READ_NODESCONF);
+}
+
+void
+Qmgr::execREAD_NODESREF(Signal* signal)
+{
+  check_readnodes_reply(signal, 
+			refToNode(signal->getSendersBlockRef()),
+			GSN_READ_NODESREF);
+}
+
 /*******************************/
 /* CM_INFOCONF                */
 /*******************************/
@@ -668,12 +684,6 @@ void Qmgr::execCM_REGCONF(Signal* signal)
   const CmRegConf * const cmRegConf = (CmRegConf *)&signal->theData[0];
   Uint32 presidentNodeId = cmRegConf->presidentNodeId;
 
-  if (check_cmregreq_reply(signal, presidentNodeId, GSN_CM_REGCONF))
-  {
-    jam();
-    return;
-  }
-
   if (!ndbCompatible_ndb_ndb(NDB_VERSION, cmRegConf->presidentVersion)) {
     jam();
     char buf[128];
@@ -731,8 +741,8 @@ void Qmgr::execCM_REGCONF(Signal* signal)
   return;
 }//Qmgr::execCM_REGCONF()
 
-bool
-Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
+void
+Qmgr::check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
 {
   NodeRecPtr myNodePtr;
   myNodePtr.i = getOwnNodeId();
@@ -741,117 +751,65 @@ Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
   NodeRecPtr nodePtr;
   nodePtr.i = nodeId;
   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
-  
-  /**
-   * Try to decide if replying node
-   *   knows who is president
-   */
-  Uint32 president_reply = RNIL;
-  switch(gsn){
-  case GSN_CM_REGREF:{
-    jam();
-    CmRegRef* ref = (CmRegRef*)signal->getDataPtr();
-    switch(ref->errorCode){
-    case CmRegRef::ZBUSY:
-    case CmRegRef::ZBUSY_PRESIDENT:
-    case CmRegRef::ZBUSY_TO_PRES:
-      jam();
-      /**
-       * Only president replies this
-       */
-      ndbrequire(nodeId == ref->presidentCandidate);
-      president_reply = nodeId;
-      break;
-    case CmRegRef::ZNOT_PRESIDENT:
-      jam();
-      president_reply = ref->presidentCandidate;
-      break;
-    case CmRegRef::ZNOT_IN_CFG:
-    case CmRegRef::ZNOT_DEAD:
-    case CmRegRef::ZELECTION:
-      // Neither of these replies give certain president knowledge
-      jam();
-    }
-    break;
-  }
-  case GSN_CM_REGCONF:
-    jam();
-    president_reply = nodeId;
-    break;
-  }
-  
-  char buf[256];
-  switch(c_start.m_gsn){
-  case GSN_CM_REGREQ:
-    jam();
-    ndbrequire(c_start.m_nodes.isWaitingFor(nodeId));
-    ndbrequire(c_cmregreq_nodes.isclear());    
-    ndbrequire(myNodePtr.p->phase == ZSTARTING);
-    return false;
-  case GSN_CM_NODEINFOREQ:
-    jam();
 
-    ndbrequire(myNodePtr.p->phase == ZSTARTING);
-    if (c_start.m_nodes.isWaitingFor(nodeId))
-    {
-      jam();
-      /**
-       * We're waiting for CM_NODEINFO
-       */
-      if (gsn == GSN_CM_REGREF)
-      {
-	jam();
-	return false;
-      }
-      
-      jam();
-      BaseString::snprintf(buf, sizeof(buf), 
-			   "Partitioned cluster! check StartPartialTimeout, "
-			   " received CM_REGCONF from %d"
-			   " while waiting for GSN_CM_NODEINFOCONF."
-			   " president=%d", 
-			   nodeId, cpresident);
-      goto die_direct;
-    }
-    
-    goto check_reply;
-  default:
-  case GSN_CM_NODEINFOCONF:
-    jam();
-    ndbrequire(myNodePtr.p->phase == ZRUNNING);
-    goto check_reply;
-  }
-  
-check_reply:
-  jam();
-  c_cmregreq_nodes.clear(nodeId);
-  
-  if (gsn == GSN_CM_REGCONF)
+  ndbrequire(c_readnodes_nodes.get(nodeId));
+  ReadNodesConf* conf = (ReadNodesConf*)signal->getDataPtr();
+  if (gsn == GSN_READ_NODESREF)
   {
     jam();
-    BaseString::snprintf(buf, sizeof(buf),
-			 "Partitioned cluster! check StartPartialTimeout, "
-			 " received CM_REGCONF"
-			 " from %d I think president: %d",
-			 nodeId, cpresident);
-    goto die_direct;
+retry:
+    signal->theData[0] = reference();
+    sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA);
+    return;
   }
   
-  if (president_reply != RNIL && president_reply != cpresident)
+  if (conf->masterNodeId == ZNIL)
   {
     jam();
-    BaseString::snprintf(buf, sizeof(buf),
-			 "Partitioned cluster! check StartPartialTimeout, "
-			 " received CM_REGREF from %d specifying president as"
-			 " %d, president: %d",
-			 nodeId, president_reply, cpresident);
-    goto die_direct;
+    goto retry;
   }
   
-  return true;
+  Uint32 president = conf->masterNodeId;
+  if (president == cpresident)
+  {
+    jam();
+    c_readnodes_nodes.clear(nodeId);
+    return;
+  }
+
+  char buf[255];
+  BaseString::snprintf(buf, sizeof(buf),
+		       "Partitioned cluster! check StartPartialTimeout, "
+		       " node %d thinks %d is president, "
+		       " I think president is: %d",
+		       nodeId, president, cpresident);
 
-die_direct:
   ndbout_c(buf);
+  CRASH_INSERTION(933);
+
+  if (getNodeState().startLevel == NodeState::SL_STARTED)
+  {
+    jam();
+    NdbNodeBitmask part;
+    part.assign(NdbNodeBitmask::Size, conf->clusterNodes);
+    FailRep* rep = (FailRep*)signal->getDataPtrSend();
+    rep->failCause = FailRep::ZPARTITIONED_CLUSTER;
+    rep->president = cpresident;
+    c_clusterNodes.copyto(NdbNodeBitmask::Size, rep->partition);
+    Uint32 ref = calcQmgrBlockRef(nodeId);
+    Uint32 i = 0;
+    while((i = part.find(i + 1)) != NdbNodeBitmask::NotFound)
+    {
+      if (i == nodeId)
+	continue;
+      rep->failNodeId = i;
+      sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
+    }
+    rep->failNodeId = nodeId;
+    sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBB);
+    return;
+  }
+  
   CRASH_INSERTION(932);
   
   progError(__LINE__, 
@@ -899,12 +857,6 @@ void Qmgr::execCM_REGREF(Signal* signal)
   Uint32 candidate = signal->theData[3];
   DEBUG_START3(signal, TrefuseReason);
 
-  if (check_cmregreq_reply(signal, TaddNodeno, GSN_CM_REGREF))
-  {
-    jam();
-    return;
-  }
-
   c_regReqReqRecv++;
 
   // Ignore block reference in data[0]
@@ -2069,7 +2021,7 @@ void Qmgr::execDISCONNECT_REP(Signal* signal)
   const DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
   const Uint32 nodeId = rep->nodeId;
   c_connectedNodes.clear(nodeId);
-  c_cmregreq_nodes.clear(nodeId);
+  c_readnodes_nodes.clear(nodeId);
   
   NodeRecPtr nodePtr;
   nodePtr.i = getOwnNodeId();
@@ -2342,13 +2294,16 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
 
   failedNodePtr.i = aFailedNode;
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+  FailRep* rep = (FailRep*)signal->getDataPtr();
 
   check_multi_node_shutdown(signal);
   
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
 
+    Uint32 code = 0;
     const char * msg = 0;
+    char extra[100];
     switch(aFailCause){
     case FailRep::ZOWN_FAILURE: 
       msg = "Own failure"; 
@@ -2369,17 +2324,46 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
     case FailRep::ZLINK_FAILURE:
       msg = "Connection failure";
       break;
+    case FailRep::ZPARTITIONED_CLUSTER:
+    {
+      code = ERR_ARBIT_SHUTDOWN;
+      char buf1[100], buf2[100];
+      c_clusterNodes.getText(buf1);
+      if (signal->getLength()== FailRep::SignalLength + FailRep::ExtraLength &&
+	  signal->header.theVerId_signalNumber == GSN_FAIL_REP)
+      {
+	jam();
+	NdbNodeBitmask part;
+	part.assign(NdbNodeBitmask::Size, rep->partition);
+	part.getText(buf2);
+	BaseString::snprintf(extra, sizeof(extra),
+			     "Partitioned cluster!"
+			     " Our cluster: %s other cluster: %s",
+			     buf1, buf2);
+      }
+      else
+      {
+	jam();
+	BaseString::snprintf(extra, sizeof(extra),
+			     "Partitioned cluster!"
+			     " Our cluster: %s ", buf1);
+      }
+      msg = extra;
+      break;
+    }
     }
     
-    char buf[100];
-    BaseString::snprintf(buf, 100, 
+    CRASH_INSERTION(932);
+
+    char buf[255];
+    BaseString::snprintf(buf, sizeof(buf), 
 			 "We(%u) have been declared dead by %u reason: %s(%u)",
 			 getOwnNodeId(),
 			 refToNode(signal->getSendersBlockRef()),
 			 aFailCause,
 			 msg ? msg : "<Unknown>");
-
-    progError(__LINE__, 0, buf);
+    
+    progError(__LINE__, code, buf);
     return;
   }//if
   
diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
index 5f577b77f34..bdf0069aa26 100644
--- a/ndb/test/ndbapi/testNodeRestart.cpp
+++ b/ndb/test/ndbapi/testNodeRestart.cpp
@@ -753,13 +753,13 @@ runBug18612(NDBT_Context* ctx, NDBT_Step* step){
       if (restarter.dumpStateAllNodes(dump, 2))
 	return NDBT_FAILED;
 
-    if (restarter.waitClusterNoStart())
+    if (restarter.waitNodesNoStart(partition0, cnt/2))
       return NDBT_FAILED;
     
     for (Uint32 i = 0; i<cnt/2; i++)
       if (restarter.restartOneDbNode(partition0[i], true, true, true))
 	return NDBT_FAILED;
-
+    
     if (restarter.waitNodesNoStart(partition0, cnt/2))
       return NDBT_FAILED;
     

From 7109c84c1621021ab2ccd2c47c5fb1bd3eb5c010 Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Mon, 3 Apr 2006 12:09:50 +0200
Subject: [PATCH 07/10] ndb - bug#18612 - partitioned startup   add test
 program for SR case as well

---
 ndb/test/ndbapi/testNodeRestart.cpp     | 96 +++++++++++++++++++++++++
 ndb/test/run-test/daily-basic-tests.txt |  6 +-
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
index bdf0069aa26..d297527ac8b 100644
--- a/ndb/test/ndbapi/testNodeRestart.cpp
+++ b/ndb/test/ndbapi/testNodeRestart.cpp
@@ -772,6 +772,96 @@ runBug18612(NDBT_Context* ctx, NDBT_Step* step){
   return NDBT_OK;
 }
 
+int 
+runBug18612SR(NDBT_Context* ctx, NDBT_Step* step){
+
+  // Assume two replicas
+  NdbRestarter restarter;
+  if (restarter.getNumDbNodes() < 2)
+  {
+    ctx->stopTest();
+    return NDBT_OK;
+  }
+
+  Uint32 cnt = restarter.getNumDbNodes();
+
+  for(int loop = 0; loop < ctx->getNumLoops(); loop++)
+  {
+    int partition0[256];
+    int partition1[256];
+    bzero(partition0, sizeof(partition0));
+    bzero(partition1, sizeof(partition1));
+    Bitmask<4> nodesmask;
+    
+    Uint32 node1 = restarter.getDbNodeId(rand()%cnt);
+    for (Uint32 i = 0; i<cnt/2; i++)
+    {
+      do { 
+	node1 = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+      } while(nodesmask.get(node1));
+      
+      partition0[i] = node1;
+      partition1[i] = restarter.getRandomNodeSameNodeGroup(node1, rand());
+      
+      ndbout_c("nodes %d %d", node1, partition1[i]);
+      
+      assert(!nodesmask.get(node1));
+      assert(!nodesmask.get(partition1[i]));
+      nodesmask.set(node1);
+      nodesmask.set(partition1[i]);
+    } 
+    
+    ndbout_c("done");
+
+    if (restarter.restartAll(false, true, false))
+      return NDBT_FAILED;
+
+    int dump[255];
+    dump[0] = 9000;
+    memcpy(dump + 1, partition0, sizeof(int)*cnt/2);    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateOneNode(partition1[i], dump, 1+cnt/2))
+	return NDBT_FAILED;
+
+    dump[0] = 9000;
+    memcpy(dump + 1, partition1, sizeof(int)*cnt/2);    
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateOneNode(partition0[i], dump, 1+cnt/2))
+	return NDBT_FAILED;
+
+    int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
+    
+    if (restarter.dumpStateAllNodes(val2, 2))
+      return NDBT_FAILED;
+    
+    if (restarter.insertErrorInAllNodes(932))
+      return NDBT_FAILED;
+    
+    if (restarter.startAll())
+      return NDBT_FAILED;
+    
+    if (restarter.waitClusterStartPhase(2))
+      return NDBT_FAILED;
+    
+    dump[0] = 9001;
+    for (Uint32 i = 0; i<cnt/2; i++)
+      if (restarter.dumpStateAllNodes(dump, 2))
+	return NDBT_FAILED;
+
+    if (restarter.waitClusterNoStart(30))
+      if (restarter.waitNodesNoStart(partition0, cnt/2, 10))
+	if (restarter.waitNodesNoStart(partition1, cnt/2, 10))
+	  return NDBT_FAILED;
+    
+    if (restarter.startAll())
+      return NDBT_FAILED;
+    
+    if (restarter.waitClusterStarted())
+      return NDBT_FAILED;
+  }
+  return NDBT_OK;
+}
+
 
 NDBT_TESTSUITE(testNodeRestart);
 TESTCASE("NoLoad", 
@@ -1073,6 +1163,12 @@ TESTCASE("Bug18612",
   STEP(runBug18612);
   FINALIZER(runClearTable);
 }
+TESTCASE("Bug18612SR",
+	 "Test bug with partitioned clusters"){
+  INITIALIZER(runLoadTable);
+  STEP(runBug18612SR);
+  FINALIZER(runClearTable);
+}
 NDBT_TESTSUITE_END(testNodeRestart);
 
 int main(int argc, const char** argv){
diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt
index 1e9bad1b969..508bf4c3d1e 100644
--- a/ndb/test/run-test/daily-basic-tests.txt
+++ b/ndb/test/run-test/daily-basic-tests.txt
@@ -462,10 +462,14 @@ max-time: 1000
 cmd: testNodeRestart
 args: -n Bug18414 T1
 
-max-time: 500
+max-time: 1000
 cmd: testNodeRestart
 args: -n Bug18612 T1
 
+max-time: 1000
+cmd: testNodeRestart
+args: -n Bug18612SR T1
+
 # OLD FLEX
 max-time: 500
 cmd: flexBench

From 274e35c4d6192e9f45c05fed1427917e5a19ac1e Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Mon, 3 Apr 2006 13:12:23 +0200
Subject: [PATCH 08/10] ndb -   Fix compile error...when compiling debug

---
 ndb/include/kernel/signaldata/FailRep.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ndb/include/kernel/signaldata/FailRep.hpp b/ndb/include/kernel/signaldata/FailRep.hpp
index f575d99e865..f2250f1af73 100644
--- a/ndb/include/kernel/signaldata/FailRep.hpp
+++ b/ndb/include/kernel/signaldata/FailRep.hpp
@@ -18,6 +18,7 @@
 #define FAIL_REP_HPP
 
 #include "SignalData.hpp"
+#include <NodeBitmask.hpp>
 
 /**
  * 

From 2abc5e2f77ca6ef18826b42aa325431aac320674 Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Mon, 3 Apr 2006 20:43:14 +0200
Subject: [PATCH 09/10] ndb -   fix test program if only 1 node group

---
 ndb/test/ndbapi/testNodeRestart.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
index d297527ac8b..7017aac0ade 100644
--- a/ndb/test/ndbapi/testNodeRestart.cpp
+++ b/ndb/test/ndbapi/testNodeRestart.cpp
@@ -694,7 +694,10 @@ runBug18612(NDBT_Context* ctx, NDBT_Step* step){
     for (Uint32 i = 0; i<cnt/2; i++)
     {
       do { 
-	node1 = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+	int tmp = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+	if (tmp == -1)
+	  break;
+	node1 = tmp;
       } while(nodesmask.get(node1));
       
       partition0[i] = node1;
@@ -797,7 +800,10 @@ runBug18612SR(NDBT_Context* ctx, NDBT_Step* step){
     for (Uint32 i = 0; i<cnt/2; i++)
     {
       do { 
-	node1 = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+	int tmp = restarter.getRandomNodeOtherNodeGroup(node1, rand());
+	if (tmp == -1)
+	  break;
+	node1 = tmp;
       } while(nodesmask.get(node1));
       
       partition0[i] = node1;

From f41db42287b685fcde3525c07347d71477638acb Mon Sep 17 00:00:00 2001
From: "jonas@perch.ndb.mysql.com" <>
Date: Thu, 6 Apr 2006 16:18:42 +0200
Subject: [PATCH 10/10] ndb - bug#18612 - post review fixes   1) make sure that
 check_multi_node_shutdown does not proceed (in stop case)   2) Fix printout

---
 ndb/src/kernel/blocks/qmgr/Qmgr.hpp     |  2 +-
 ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 20 +++++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
index 07e6a2a10c1..02be002cae0 100644
--- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
+++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
@@ -409,7 +409,7 @@ private:
   Uint16 ccommitFailedNodes[MAX_NDB_NODES];
   
   StopReq c_stopReq;
-  void check_multi_node_shutdown(Signal* signal);
+  bool check_multi_node_shutdown(Signal* signal);
 };
 
 #endif
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
index c17922dff48..8b7caadfeb9 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
@@ -2296,7 +2296,11 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
   ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
   FailRep* rep = (FailRep*)signal->getDataPtr();
 
-  check_multi_node_shutdown(signal);
+  if (check_multi_node_shutdown(signal))
+  {
+    jam();
+    return;
+  }
   
   if (failedNodePtr.i == getOwnNodeId()) {
     jam();
@@ -2360,8 +2364,8 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
 			 "We(%u) have been declared dead by %u reason: %s(%u)",
 			 getOwnNodeId(),
 			 refToNode(signal->getSendersBlockRef()),
-			 aFailCause,
-			 msg ? msg : "<Unknown>");
+			 msg ? msg : "<Unknown>",
+			 aFailCause);
     
     progError(__LINE__, code, buf);
     return;
@@ -2421,7 +2425,11 @@ void Qmgr::execPREP_FAILREQ(Signal* signal)
   NodeRecPtr myNodePtr;
   jamEntry();
   
-  check_multi_node_shutdown(signal);
+  if (check_multi_node_shutdown(signal))
+  {
+    jam();
+    return;
+  }
   
   PrepFailReqRef * const prepFail = (PrepFailReqRef *)&signal->theData[0];
 
@@ -4255,7 +4263,7 @@ Qmgr::execSTOP_REQ(Signal* signal)
   }
 }
 
-void
+bool
 Qmgr::check_multi_node_shutdown(Signal* signal)
 {
   if (c_stopReq.senderRef && 
@@ -4271,5 +4279,7 @@ Qmgr::check_multi_node_shutdown(Signal* signal)
     } else {
       EXECUTE_DIRECT(CMVMI, GSN_STOP_ORD, signal, 1);
     }
+    return true;
   }
+  return false;
 }