bug#10358 - ndb

Cluster failure with non-started nodes can result in timed-out transactions. ndb/src/mgmapi/mgmapi.cpp: Increase timeout for restarts. ndb/src/ndbapi/ClusterMgr.cpp: Report NFCOMPLETEREP if no alive node exists (instead of when no connected node exists). ndb/src/ndbapi/ClusterMgr.hpp: Report NFCOMPLETEREP if no alive node exists (instead of when no connected node exists).
2025-01-18 13:02:28 +01:00 · 2005-05-04 18:40:54 +02:00 · 2005-05-04 18:40:54 +02:00 · 80abad58fc
commit 80abad58fc
parent 55c9c4d7e0
3 changed files with 27 additions and 6 deletions
--- a/ndb/src/mgmapi/mgmapi.cpp
+++ b/ndb/src/mgmapi/mgmapi.cpp
@ -857,7 +857,10 @@ ndb_mgm_restart2(NdbMgmHandle handle, int no_of_nodes, const int * node_list,
    args.put("initialstart", initial);
    args.put("nostart", nostart);
    const Properties *reply;
+    const int timeout = handle->read_timeout;
+    handle->read_timeout= 5*60*1000; // 5 minutes
    reply = ndb_mgm_call(handle, restart_reply, "restart all", &args);
+    handle->read_timeout= timeout;
    CHECK_REPLY(reply, -1);

    BaseString result;
@ -890,7 +893,10 @@ ndb_mgm_restart2(NdbMgmHandle handle, int no_of_nodes, const int * node_list,
  args.put("nostart", nostart);

  const Properties *reply;
+  const int timeout = handle->read_timeout;
+  handle->read_timeout= 5*60*1000; // 5 minutes
  reply = ndb_mgm_call(handle, restart_reply, "restart node", &args);
+  handle->read_timeout= timeout;
  if(reply != NULL) {
    BaseString result;
    reply->get("result", result);
--- a/ndb/src/ndbapi/ClusterMgr.cpp
+++ b/ndb/src/ndbapi/ClusterMgr.cpp
@ -66,6 +66,7 @@ ClusterMgr::ClusterMgr(TransporterFacade & _facade):
 {
  ndbSetOwnVersion();
  clusterMgrThreadMutex = NdbMutex_Create();
+  noOfAliveNodes= 0;
  noOfConnectedNodes= 0;
  theClusterMgrThread= 0;
 }
@ -335,9 +336,9 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
  node.m_state = apiRegConf->nodeState;
  if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED  ||
 			  node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
-    node.m_alive = true;
+    set_node_alive(node, true);
  } else {
-    node.m_alive = false;
+    set_node_alive(node, false);
  }//if
  node.hbSent = 0;
  node.hbCounter = 0;
@ -360,7 +361,7 @@ ClusterMgr::execAPI_REGREF(const Uint32 * theData){
  assert(node.defined == true);

  node.compatible = false;
-  node.m_alive = false;
+  set_node_alive(node, false);
  node.m_state = NodeState::SL_NOTHING;
  node.m_info.m_version = ref->version;

@ -437,7 +438,7 @@ ClusterMgr::reportNodeFailed(NodeId nodeId){

  Node & theNode = theNodes[nodeId];
 
-  theNode.m_alive = false;
+  set_node_alive(theNode, false);
  if(theNode.connected)
    theFacade.doDisconnect(nodeId);
  
@ -450,7 +451,7 @@ ClusterMgr::reportNodeFailed(NodeId nodeId){

  theNode.nfCompleteRep = false;
  
-  if(noOfConnectedNodes == 0){
+  if(noOfAliveNodes == 0){
    NFCompleteRep rep;
    for(Uint32 i = 1; i<MAX_NODES; i++){
      if(theNodes[i].defined && theNodes[i].nfCompleteRep == false){
--- a/ndb/src/ndbapi/ClusterMgr.hpp
+++ b/ndb/src/ndbapi/ClusterMgr.hpp
@ -80,6 +80,7 @@ public:
  Uint32        getNoOfConnectedNodes() const;
  
 private:
+  Uint32        noOfAliveNodes;
  Uint32        noOfConnectedNodes;
  Node          theNodes[MAX_NODES];
  NdbThread*    theClusterMgrThread;
@ -100,6 +101,19 @@ private:
  void execAPI_REGREF    (const Uint32 * theData);
  void execNODE_FAILREP  (const Uint32 * theData);
  void execNF_COMPLETEREP(const Uint32 * theData);
+
+  inline void set_node_alive(Node& node, bool alive){ // Set node.m_alive and keep noOfAliveNodes in step with it.
+    if(node.m_alive && !alive) // transition: alive -> not alive
+    {
+      assert(noOfAliveNodes); // counter must be non-zero before it is decremented
+      noOfAliveNodes--;
+    }
+    else if(!node.m_alive && alive) // transition: not alive -> alive
+    {
+      noOfAliveNodes++;
+    }
+    node.m_alive = alive; // stored last so both comparisons above see the previous value; counter is untouched when the flag does not change
+  }
 };

 inline