mirror of
https://github.com/MariaDB/server.git
synced 2025-01-17 04:22:27 +01:00
Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
- added more retries to wait for node failure to complete. Bug #19039 multi node failure causes node failure handling not to complete - patch to avoid this scenario when the management server is used to perform the stop - wait for NF_COMPLETE_REP in management server before returning. ndb: allocate nodeid - only retry on retryable error
This commit is contained in:
parent
51ddd6ff31
commit
2f955d19a6
7 changed files with 148 additions and 51 deletions
|
@ -232,6 +232,12 @@ extern "C" {
|
|||
/** Could not connect to socket */
|
||||
NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET = 1011,
|
||||
|
||||
/* Alloc node id failures */
|
||||
/** Generic error, retry may succeed */
|
||||
NDB_MGM_ALLOCID_ERROR = 1101,
|
||||
/** Non retriable error */
|
||||
NDB_MGM_ALLOCID_CONFIG_MISMATCH = 1102,
|
||||
|
||||
/* Service errors - Start/Stop Node or System */
|
||||
/** Start failed */
|
||||
NDB_MGM_START_FAILED = 2001,
|
||||
|
@ -999,7 +1005,7 @@ extern "C" {
|
|||
void ndb_mgm_destroy_configuration(struct ndb_mgm_configuration *);
|
||||
|
||||
int ndb_mgm_alloc_nodeid(NdbMgmHandle handle,
|
||||
unsigned version, int nodetype);
|
||||
unsigned version, int nodetype, int log_event);
|
||||
|
||||
/**
|
||||
* End Session
|
||||
|
|
|
@ -349,12 +349,14 @@ ConfigRetriever::allocNodeId(int no_retries, int retry_delay_in_seconds)
|
|||
if(!ndb_mgm_connect(m_handle, 0, 0, 0))
|
||||
goto next;
|
||||
|
||||
res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type);
|
||||
res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type,
|
||||
no_retries == 0 /* only log last retry */);
|
||||
if(res >= 0)
|
||||
return _ownNodeId= (Uint32)res;
|
||||
|
||||
next:
|
||||
if (no_retries == 0)
|
||||
int error = ndb_mgm_get_latest_error(m_handle);
|
||||
if (no_retries == 0 || error == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
|
||||
break;
|
||||
no_retries--;
|
||||
NdbSleep_SecSleep(retry_delay_in_seconds);
|
||||
|
|
|
@ -286,7 +286,8 @@ Configuration::fetch_configuration(){
|
|||
if (globalData.ownId)
|
||||
cr.setNodeId(globalData.ownId);
|
||||
|
||||
globalData.ownId = cr.allocNodeId(2 /*retry*/,3 /*delay*/);
|
||||
globalData.ownId = cr.allocNodeId(globalData.ownId ? 10 : 2 /*retry*/,
|
||||
3 /*delay*/);
|
||||
|
||||
if(globalData.ownId == 0){
|
||||
ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG,
|
||||
|
|
|
@ -1868,7 +1868,8 @@ const char *ndb_mgm_get_connectstring(NdbMgmHandle handle, char *buf, int buf_sz
|
|||
|
||||
extern "C"
|
||||
int
|
||||
ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
|
||||
ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype,
|
||||
int log_event)
|
||||
{
|
||||
CHECK_HANDLE(handle, 0);
|
||||
CHECK_CONNECTED(handle, 0);
|
||||
|
@ -1888,9 +1889,11 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
|
|||
args.put("endian", (endian_check.c[sizeof(long)-1])?"big":"little");
|
||||
if (handle->m_name)
|
||||
args.put("name", handle->m_name);
|
||||
args.put("log_event", log_event);
|
||||
|
||||
const ParserRow<ParserDummy> reply[]= {
|
||||
MGM_CMD("get nodeid reply", NULL, ""),
|
||||
MGM_ARG("error_code", Int, Optional, "Error code"),
|
||||
MGM_ARG("nodeid", Int, Optional, "Error message"),
|
||||
MGM_ARG("result", String, Mandatory, "Error message"),
|
||||
MGM_END()
|
||||
|
@ -1903,14 +1906,16 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
|
|||
nodeid= -1;
|
||||
do {
|
||||
const char * buf;
|
||||
if(!prop->get("result", &buf) || strcmp(buf, "Ok") != 0){
|
||||
if (!prop->get("result", &buf) || strcmp(buf, "Ok") != 0)
|
||||
{
|
||||
const char *hostname= ndb_mgm_get_connected_host(handle);
|
||||
unsigned port= ndb_mgm_get_connected_port(handle);
|
||||
BaseString err;
|
||||
Uint32 error_code= NDB_MGM_ALLOCID_ERROR;
|
||||
err.assfmt("Could not alloc node id at %s port %d: %s",
|
||||
hostname, port, buf);
|
||||
setError(handle, NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET, __LINE__,
|
||||
err.c_str());
|
||||
prop->get("error_code", &error_code);
|
||||
setError(handle, error_code, __LINE__, err.c_str());
|
||||
break;
|
||||
}
|
||||
Uint32 _nodeid;
|
||||
|
|
|
@ -507,9 +507,10 @@ MgmtSrvr::MgmtSrvr(SocketServer *socket_server,
|
|||
if (_ownNodeId == 0) // we did not get node id from other server
|
||||
{
|
||||
NodeId tmp= m_config_retriever->get_configuration_nodeid();
|
||||
int error_code;
|
||||
|
||||
if (!alloc_node_id(&tmp, NDB_MGM_NODE_TYPE_MGM,
|
||||
0, 0, error_string)){
|
||||
0, 0, error_code, error_string)){
|
||||
ndbout << "Unable to obtain requested nodeid: "
|
||||
<< error_string.c_str() << endl;
|
||||
require(false);
|
||||
|
@ -1118,31 +1119,16 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
|
|||
const NFCompleteRep * const rep =
|
||||
CAST_CONSTPTR(NFCompleteRep, signal->getDataPtr());
|
||||
#ifdef VM_TRACE
|
||||
ndbout_c("Node %d fail completed", rep->failedNodeId);
|
||||
ndbout_c("sendSTOP_REQ Node %d fail completed", rep->failedNodeId);
|
||||
#endif
|
||||
nodes.clear(rep->failedNodeId); // clear the failed node
|
||||
if (singleUserNodeId == 0)
|
||||
stoppedNodes.set(rep->failedNodeId);
|
||||
break;
|
||||
}
|
||||
case GSN_NODE_FAILREP:{
|
||||
const NodeFailRep * const rep =
|
||||
CAST_CONSTPTR(NodeFailRep, signal->getDataPtr());
|
||||
NodeBitmask failedNodes;
|
||||
failedNodes.assign(NodeBitmask::Size, rep->theNodes);
|
||||
#ifdef VM_TRACE
|
||||
{
|
||||
ndbout << "Failed nodes:";
|
||||
for (unsigned i = 0; i < 32*NodeBitmask::Size; i++)
|
||||
if(failedNodes.get(i))
|
||||
ndbout << " " << i;
|
||||
ndbout << endl;
|
||||
}
|
||||
#endif
|
||||
failedNodes.bitAND(nodes);
|
||||
if (!failedNodes.isclear())
|
||||
{
|
||||
nodes.bitANDC(failedNodes); // clear the failed nodes
|
||||
if (singleUserNodeId == 0)
|
||||
stoppedNodes.bitOR(failedNodes);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -1263,11 +1249,47 @@ int MgmtSrvr::restartNodes(const Vector<NodeId> &node_ids,
|
|||
abort,
|
||||
false,
|
||||
true,
|
||||
nostart,
|
||||
true,
|
||||
initialStart);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (stopCount)
|
||||
*stopCount = nodes.count();
|
||||
return ret;
|
||||
|
||||
// start up the nodes again
|
||||
int waitTime = 12000;
|
||||
NDB_TICKS maxTime = NdbTick_CurrentMillisecond() + waitTime;
|
||||
for (unsigned i = 0; i < node_ids.size(); i++)
|
||||
{
|
||||
NodeId nodeId= node_ids[i];
|
||||
enum ndb_mgm_node_status s;
|
||||
s = NDB_MGM_NODE_STATUS_NO_CONTACT;
|
||||
#ifdef VM_TRACE
|
||||
ndbout_c("Waiting for %d not started", nodeId);
|
||||
#endif
|
||||
while (s != NDB_MGM_NODE_STATUS_NOT_STARTED && waitTime > 0)
|
||||
{
|
||||
Uint32 startPhase = 0, version = 0, dynamicId = 0, nodeGroup = 0;
|
||||
Uint32 connectCount = 0;
|
||||
bool system;
|
||||
const char *address;
|
||||
status(nodeId, &s, &version, &startPhase,
|
||||
&system, &dynamicId, &nodeGroup, &connectCount, &address);
|
||||
NdbSleep_MilliSleep(100);
|
||||
waitTime = (maxTime - NdbTick_CurrentMillisecond());
|
||||
}
|
||||
}
|
||||
|
||||
if (nostart)
|
||||
return 0;
|
||||
|
||||
for (unsigned i = 0; i < node_ids.size(); i++)
|
||||
{
|
||||
int result = start(node_ids[i]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1918,7 +1940,8 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
enum ndb_mgm_node_type type,
|
||||
struct sockaddr *client_addr,
|
||||
SOCKET_SIZE_TYPE *client_addr_len,
|
||||
BaseString &error_string)
|
||||
int &error_code, BaseString &error_string,
|
||||
int log_event)
|
||||
{
|
||||
DBUG_ENTER("MgmtSrvr::alloc_node_id");
|
||||
DBUG_PRINT("enter", ("nodeid=%d, type=%d, client_addr=%d",
|
||||
|
@ -1927,6 +1950,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
if (*nodeId == 0) {
|
||||
error_string.appfmt("no-nodeid-checks set in management server.\n"
|
||||
"node id must be set explicitly in connectstring");
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
DBUG_RETURN(false);
|
||||
}
|
||||
DBUG_RETURN(true);
|
||||
|
@ -1951,8 +1975,10 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
|
||||
if(NdbMutex_Lock(m_configMutex))
|
||||
{
|
||||
// should not happen
|
||||
error_string.appfmt("unable to lock configuration mutex");
|
||||
return false;
|
||||
error_code = NDB_MGM_ALLOCID_ERROR;
|
||||
DBUG_RETURN(false);
|
||||
}
|
||||
ndb_mgm_configuration_iterator
|
||||
iter(* _config->m_configValues, CFG_SECTION_NODE);
|
||||
|
@ -2023,6 +2049,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
"or specifying unique host names in config file.",
|
||||
id_found, tmp);
|
||||
NdbMutex_Unlock(m_configMutex);
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
DBUG_RETURN(false);
|
||||
}
|
||||
if (config_hostname == 0) {
|
||||
|
@ -2031,6 +2058,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
"or specifying unique host names in config file,\n"
|
||||
"or specifying just one mgmt server in config file.",
|
||||
tmp);
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
DBUG_RETURN(false);
|
||||
}
|
||||
id_found= tmp; // mgmt server matched, check for more matches
|
||||
|
@ -2072,8 +2100,9 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
|
||||
char tmp_str[128];
|
||||
m_reserved_nodes.getText(tmp_str);
|
||||
g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, m_reserved_nodes %s.",
|
||||
id_found, get_connect_address(id_found), tmp_str);
|
||||
g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, "
|
||||
"m_reserved_nodes %s.",
|
||||
id_found, get_connect_address(id_found), tmp_str);
|
||||
DBUG_RETURN(true);
|
||||
}
|
||||
|
||||
|
@ -2093,26 +2122,48 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
type_c_string.assfmt("%s(%s)", alias, str);
|
||||
}
|
||||
|
||||
if (*nodeId == 0) {
|
||||
if (*nodeId == 0)
|
||||
{
|
||||
if (found_matching_id)
|
||||
{
|
||||
if (found_matching_type)
|
||||
{
|
||||
if (found_free_node)
|
||||
{
|
||||
error_string.appfmt("Connection done from wrong host ip %s.",
|
||||
(client_addr)?
|
||||
inet_ntoa(((struct sockaddr_in *)
|
||||
inet_ntoa(((struct sockaddr_in *)
|
||||
(client_addr))->sin_addr):"");
|
||||
error_code = NDB_MGM_ALLOCID_ERROR;
|
||||
}
|
||||
else
|
||||
{
|
||||
error_string.appfmt("No free node id found for %s.",
|
||||
type_string.c_str());
|
||||
error_code = NDB_MGM_ALLOCID_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
error_string.appfmt("No %s node defined in config file.",
|
||||
type_string.c_str());
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
error_string.append("No nodes defined in config file.");
|
||||
} else {
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (found_matching_id)
|
||||
{
|
||||
if (found_matching_type)
|
||||
if (found_free_node) {
|
||||
{
|
||||
if (found_free_node)
|
||||
{
|
||||
// have to split these into two since inet_ntoa overwrites itself
|
||||
error_string.appfmt("Connection with id %d done from wrong host ip %s,",
|
||||
*nodeId, inet_ntoa(((struct sockaddr_in *)
|
||||
|
@ -2120,27 +2171,44 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
|
|||
error_string.appfmt(" expected %s(%s).", config_hostname,
|
||||
r_config_addr ?
|
||||
"lookup failed" : inet_ntoa(config_addr));
|
||||
} else
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
}
|
||||
else
|
||||
{
|
||||
error_string.appfmt("Id %d already allocated by another node.",
|
||||
*nodeId);
|
||||
error_code = NDB_MGM_ALLOCID_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
error_string.appfmt("Id %d configured as %s, connect attempted as %s.",
|
||||
*nodeId, type_c_string.c_str(),
|
||||
type_string.c_str());
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
error_string.appfmt("No node defined with id=%d in config file.",
|
||||
*nodeId);
|
||||
error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
|
||||
}
|
||||
}
|
||||
|
||||
g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s. "
|
||||
"Returned error string \"%s\"",
|
||||
*nodeId,
|
||||
client_addr != 0 ? inet_ntoa(((struct sockaddr_in *)(client_addr))->sin_addr) : "<none>",
|
||||
error_string.c_str());
|
||||
|
||||
NodeBitmask connected_nodes2;
|
||||
get_connected_nodes(connected_nodes2);
|
||||
if (log_event || error_code == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
|
||||
{
|
||||
g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s."
|
||||
" Returned error string \"%s\"",
|
||||
*nodeId,
|
||||
client_addr != 0
|
||||
? inet_ntoa(((struct sockaddr_in *)
|
||||
(client_addr))->sin_addr)
|
||||
: "<none>",
|
||||
error_string.c_str());
|
||||
|
||||
NodeBitmask connected_nodes2;
|
||||
get_connected_nodes(connected_nodes2);
|
||||
BaseString tmp_connected, tmp_not_connected;
|
||||
for(Uint32 i = 0; i < MAX_NODES; i++)
|
||||
{
|
||||
|
|
|
@ -434,8 +434,10 @@ public:
|
|||
*/
|
||||
bool getNextNodeId(NodeId * _nodeId, enum ndb_mgm_node_type type) const ;
|
||||
bool alloc_node_id(NodeId * _nodeId, enum ndb_mgm_node_type type,
|
||||
struct sockaddr *client_addr, SOCKET_SIZE_TYPE *client_addr_len,
|
||||
BaseString &error_string);
|
||||
struct sockaddr *client_addr,
|
||||
SOCKET_SIZE_TYPE *client_addr_len,
|
||||
int &error_code, BaseString &error_string,
|
||||
int log_event = 1);
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
|
@ -138,6 +138,7 @@ ParserRow<MgmApiSession> commands[] = {
|
|||
MGM_ARG("endian", String, Optional, "Endianness"),
|
||||
MGM_ARG("name", String, Optional, "Name of connection"),
|
||||
MGM_ARG("timeout", Int, Optional, "Timeout in seconds"),
|
||||
MGM_ARG("log_event", Int, Optional, "Log failure in cluster log"),
|
||||
|
||||
MGM_CMD("get version", &MgmApiSession::getVersion, ""),
|
||||
|
||||
|
@ -425,6 +426,8 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
|
|||
const char * public_key;
|
||||
const char * endian= NULL;
|
||||
const char * name= NULL;
|
||||
Uint32 log_event= 1;
|
||||
bool log_event_version;
|
||||
union { long l; char c[sizeof(long)]; } endian_check;
|
||||
|
||||
args.get("version", &version);
|
||||
|
@ -437,6 +440,8 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
|
|||
args.get("endian", &endian);
|
||||
args.get("name", &name);
|
||||
args.get("timeout", &timeout);
|
||||
/* for backwards compatibility keep track if client uses new protocol */
|
||||
log_event_version= args.get("log_event", &log_event);
|
||||
|
||||
endian_check.l = 1;
|
||||
if(endian
|
||||
|
@ -476,11 +481,15 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
|
|||
NodeId tmp= nodeid;
|
||||
if(tmp == 0 || !m_allocated_resources->is_reserved(tmp)){
|
||||
BaseString error_string;
|
||||
int error_code;
|
||||
NDB_TICKS tick= 0;
|
||||
/* only report error on second attempt as not to clog the cluster log */
|
||||
while (!m_mgmsrv.alloc_node_id(&tmp, (enum ndb_mgm_node_type)nodetype,
|
||||
&addr, &addrlen, error_string))
|
||||
&addr, &addrlen, error_code, error_string,
|
||||
tick == 0 ? 0 : log_event))
|
||||
{
|
||||
if (tick == 0)
|
||||
/* NDB_MGM_ALLOCID_CONFIG_MISMATCH is a non retriable error */
|
||||
if (tick == 0 && error_code != NDB_MGM_ALLOCID_CONFIG_MISMATCH)
|
||||
{
|
||||
// attempt to free any timed out reservations
|
||||
tick= NdbTick_CurrentMillisecond();
|
||||
|
@ -492,6 +501,7 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
|
|||
ps.tick= tick;
|
||||
m_mgmsrv.get_socket_server()->
|
||||
foreachSession(stop_session_if_timed_out,&ps);
|
||||
error_string = "";
|
||||
continue;
|
||||
}
|
||||
const char *alias;
|
||||
|
@ -500,6 +510,9 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
|
|||
nodetype, &str);
|
||||
m_output->println(cmd);
|
||||
m_output->println("result: %s", error_string.c_str());
|
||||
/* only use error_code protocol if client knows about it */
|
||||
if (log_event_version)
|
||||
m_output->println("error_code: %d", error_code);
|
||||
m_output->println("");
|
||||
return;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue