mirror of
https://github.com/MariaDB/server.git
synced 2025-01-31 19:11:46 +01:00
Fix for bug #19328: Slave timeout with COM_REGISTER_SLAVE error causing stop
Problem: "Under high load, the slave registering to the master can timeout during the COM_REGISTER_SLAVE execution. This causes an error, which prevents the slave from connecting at all." Fix: Do not abort immediately, but retry registering on master.
This commit is contained in:
parent
cbdbf5b75a
commit
99630cb901
1 changed files with 208 additions and 100 deletions
308
sql/slave.cc
308
sql/slave.cc
|
@ -59,6 +59,58 @@ ulonglong relay_log_space_limit = 0;
|
|||
int disconnect_slave_event_count = 0, abort_slave_event_count = 0;
|
||||
int events_till_abort = -1;
|
||||
|
||||
enum enum_slave_reconnect_actions
|
||||
{
|
||||
SLAVE_RECON_ACT_REG= 0,
|
||||
SLAVE_RECON_ACT_DUMP= 1,
|
||||
SLAVE_RECON_ACT_EVENT= 2,
|
||||
SLAVE_RECON_ACT_MAX
|
||||
};
|
||||
|
||||
enum enum_slave_reconnect_messages
|
||||
{
|
||||
SLAVE_RECON_MSG_WAIT= 0,
|
||||
SLAVE_RECON_MSG_KILLED_WAITING= 1,
|
||||
SLAVE_RECON_MSG_AFTER= 2,
|
||||
SLAVE_RECON_MSG_FAILED= 3,
|
||||
SLAVE_RECON_MSG_COMMAND= 4,
|
||||
SLAVE_RECON_MSG_KILLED_AFTER= 5,
|
||||
SLAVE_RECON_MSG_MAX
|
||||
};
|
||||
|
||||
static const char *reconnect_messages[SLAVE_RECON_ACT_MAX][SLAVE_RECON_MSG_MAX]=
|
||||
{
|
||||
{
|
||||
"Waiting to reconnect after a failed registration on master",
|
||||
"Slave I/O thread killed while waitnig to reconnect after a failed \
|
||||
registration on master",
|
||||
"Reconnecting after a failed registration on master",
|
||||
"failed registering on master, reconnecting to try again, \
|
||||
log '%s' at postion %s",
|
||||
"COM_REGISTER_SLAVE",
|
||||
"Slave I/O thread killed during or after reconnect"
|
||||
},
|
||||
{
|
||||
"Waiting to reconnect after a failed binlog dump request",
|
||||
"Slave I/O thread killed while retrying master dump",
|
||||
"Reconnecting after a failed binlog dump request",
|
||||
"failed dump request, reconnecting to try again, log '%s' at postion %s",
|
||||
"COM_BINLOG_DUMP",
|
||||
"Slave I/O thread killed during or after reconnect"
|
||||
},
|
||||
{
|
||||
"Waiting to reconnect after a failed master event read",
|
||||
"Slave I/O thread killed while waiting to reconnect after a failed read",
|
||||
"Reconnecting after a failed master event read",
|
||||
"Slave I/O thread: Failed reading log event, reconnecting to retry, \
|
||||
log '%s' at postion %s",
|
||||
"",
|
||||
"Slave I/O thread killed during or after a reconnect done to recover from \
|
||||
failed read"
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
typedef enum { SLAVE_THD_IO, SLAVE_THD_SQL} SLAVE_THD_TYPE;
|
||||
|
||||
static int process_io_rotate(MASTER_INFO* mi, Rotate_log_event* rev);
|
||||
|
@ -1098,12 +1150,14 @@ static void write_ignored_events_info_to_relay_log(THD *thd, MASTER_INFO *mi)
|
|||
}
|
||||
|
||||
|
||||
int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi)
|
||||
int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi,
|
||||
bool *suppress_warnings)
|
||||
{
|
||||
uchar buf[1024], *pos= buf;
|
||||
uint report_host_len, report_user_len=0, report_password_len=0;
|
||||
DBUG_ENTER("register_slave_on_master");
|
||||
|
||||
*suppress_warnings= FALSE;
|
||||
if (!report_host)
|
||||
DBUG_RETURN(0);
|
||||
report_host_len= strlen(report_host);
|
||||
|
@ -1127,11 +1181,18 @@ int register_slave_on_master(MYSQL* mysql, MASTER_INFO *mi)
|
|||
|
||||
if (simple_command(mysql, COM_REGISTER_SLAVE, buf, (size_t) (pos- buf), 0))
|
||||
{
|
||||
char buf[256];
|
||||
my_snprintf(buf, sizeof(buf),
|
||||
"%s (Errno: %d)", mysql_error(mysql), mysql_errno(mysql));
|
||||
mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
|
||||
ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
|
||||
if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
|
||||
{
|
||||
*suppress_warnings= TRUE; // Suppress reconnect warning
|
||||
}
|
||||
else
|
||||
{
|
||||
char buf[256];
|
||||
my_snprintf(buf, sizeof(buf), "%s (Errno: %d)", mysql_error(mysql),
|
||||
mysql_errno(mysql));
|
||||
mi->report(ERROR_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
|
||||
ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_REGISTER_SLAVE", buf);
|
||||
}
|
||||
DBUG_RETURN(1);
|
||||
}
|
||||
DBUG_RETURN(0);
|
||||
|
@ -1460,6 +1521,8 @@ static int request_dump(MYSQL* mysql, MASTER_INFO* mi,
|
|||
int binlog_flags = 0; // for now
|
||||
char* logname = mi->master_log_name;
|
||||
DBUG_ENTER("request_dump");
|
||||
|
||||
*suppress_warnings= FALSE;
|
||||
|
||||
// TODO if big log files: Change next to int8store()
|
||||
int4store(buf, (ulong) mi->master_log_pos);
|
||||
|
@ -1475,7 +1538,7 @@ static int request_dump(MYSQL* mysql, MASTER_INFO* mi,
|
|||
now we just fill up the error log :-)
|
||||
*/
|
||||
if (mysql_errno(mysql) == ER_NET_READ_INTERRUPTED)
|
||||
*suppress_warnings= 1; // Suppress reconnect warning
|
||||
*suppress_warnings= TRUE; // Suppress reconnect warning
|
||||
else
|
||||
sql_print_error("Error on COM_BINLOG_DUMP: %d %s, will retry in %d secs",
|
||||
mysql_errno(mysql), mysql_error(mysql),
|
||||
|
@ -1539,7 +1602,7 @@ static ulong read_event(MYSQL* mysql, MASTER_INFO *mi, bool* suppress_warnings)
|
|||
ulong len;
|
||||
DBUG_ENTER("read_event");
|
||||
|
||||
*suppress_warnings= 0;
|
||||
*suppress_warnings= FALSE;
|
||||
/*
|
||||
my_real_read() will time us out
|
||||
We check if we were told to die, and if not, try reading again
|
||||
|
@ -1879,6 +1942,94 @@ on this slave.\
|
|||
}
|
||||
|
||||
|
||||
static bool check_io_slave_killed(THD *thd, MASTER_INFO *mi, const char *info)
|
||||
{
|
||||
if (io_slave_killed(thd, mi))
|
||||
{
|
||||
if (global_system_variables.log_warnings)
|
||||
sql_print_information(info);
|
||||
return TRUE;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
@brief Try to reconnect slave IO thread.
|
||||
|
||||
@details Terminates current connection to master, sleeps for
|
||||
@c mi->connect_retry msecs and initiates new connection with
|
||||
@c safe_reconnect(). Variable pointed by @c retry_count is increased -
|
||||
if it exceeds @c master_retry_count then connection is not re-established
|
||||
and function signals error.
|
||||
Unless @c suppres_warnings is TRUE, a warning is put in the server error log
|
||||
when reconnecting. The warning message and messages used to report errors
|
||||
are taken from @c messages array. In case @c master_retry_count is exceeded,
|
||||
no messages are added to the log.
|
||||
|
||||
@param[in] thd Thread context.
|
||||
@param[in] mysql MySQL connection.
|
||||
@param[in] mi Master connection information.
|
||||
@param[in,out] retry_count Number of attempts to reconnect.
|
||||
@param[in] suppress_warnings TRUE when a normal net read timeout
|
||||
has caused to reconnecting.
|
||||
@param[in] messages Messages to print/log, see
|
||||
reconnect_messages[] array.
|
||||
|
||||
@retval 0 OK.
|
||||
@retval 1 There was an error.
|
||||
*/
|
||||
|
||||
static int try_to_reconnect(THD *thd, MYSQL *mysql, MASTER_INFO *mi,
|
||||
uint *retry_count, bool suppress_warnings,
|
||||
const char *messages[SLAVE_RECON_MSG_MAX])
|
||||
{
|
||||
mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
|
||||
thd->proc_info= messages[SLAVE_RECON_MSG_WAIT];
|
||||
#ifdef SIGNAL_WITH_VIO_CLOSE
|
||||
thd->clear_active_vio();
|
||||
#endif
|
||||
end_server(mysql);
|
||||
if ((*retry_count)++)
|
||||
{
|
||||
if (*retry_count > master_retry_count)
|
||||
return 1; // Don't retry forever
|
||||
safe_sleep(thd, mi->connect_retry, (CHECK_KILLED_FUNC) io_slave_killed,
|
||||
(void *) mi);
|
||||
}
|
||||
if (check_io_slave_killed(thd, mi, messages[SLAVE_RECON_MSG_KILLED_WAITING]))
|
||||
return 1;
|
||||
thd->proc_info = messages[SLAVE_RECON_MSG_AFTER];
|
||||
if (!suppress_warnings)
|
||||
{
|
||||
char buf[256], llbuff[22];
|
||||
my_snprintf(buf, sizeof(buf), messages[SLAVE_RECON_MSG_FAILED],
|
||||
IO_RPL_LOG_NAME, llstr(mi->master_log_pos, llbuff));
|
||||
/*
|
||||
Raise a warining during registering on master/requesting dump.
|
||||
Log a message reading event.
|
||||
*/
|
||||
if (messages[SLAVE_RECON_MSG_COMMAND][0])
|
||||
{
|
||||
mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
|
||||
ER(ER_SLAVE_MASTER_COM_FAILURE),
|
||||
messages[SLAVE_RECON_MSG_COMMAND], buf);
|
||||
}
|
||||
else
|
||||
{
|
||||
sql_print_information(buf);
|
||||
}
|
||||
}
|
||||
if (safe_reconnect(thd, mysql, mi, 1) || io_slave_killed(thd, mi))
|
||||
{
|
||||
if (global_system_variables.log_warnings)
|
||||
sql_print_information(messages[SLAVE_RECON_MSG_KILLED_AFTER]);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Slave I/O Thread entry point */
|
||||
|
||||
pthread_handler_t handle_slave_io(void *arg)
|
||||
|
@ -1889,7 +2040,10 @@ pthread_handler_t handle_slave_io(void *arg)
|
|||
RELAY_LOG_INFO *rli= &mi->rli;
|
||||
char llbuff[22];
|
||||
uint retry_count;
|
||||
|
||||
bool suppress_warnings;
|
||||
#ifndef DBUG_OFF
|
||||
uint retry_count_reg= 0, retry_count_dump= 0, retry_count_event= 0;
|
||||
#endif
|
||||
// needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
|
||||
my_thread_init();
|
||||
DBUG_ENTER("handle_slave_io");
|
||||
|
@ -1975,79 +2129,56 @@ connected:
|
|||
Register ourselves with the master.
|
||||
*/
|
||||
thd->proc_info = "Registering slave on master";
|
||||
if (register_slave_on_master(mysql, mi))
|
||||
if (register_slave_on_master(mysql, mi, &suppress_warnings))
|
||||
{
|
||||
sql_print_error("Slave I/O thread couldn't register on master");
|
||||
goto err;
|
||||
if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
|
||||
registering slave on master") ||
|
||||
try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
|
||||
reconnect_messages[SLAVE_RECON_ACT_REG]))
|
||||
goto err;
|
||||
goto connected;
|
||||
}
|
||||
DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_REG",
|
||||
if (!retry_count_reg)
|
||||
{
|
||||
retry_count_reg++;
|
||||
sql_print_information("Forcing to reconnect slave I/O thread");
|
||||
if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
|
||||
reconnect_messages[SLAVE_RECON_ACT_REG]))
|
||||
goto err;
|
||||
goto connected;
|
||||
});
|
||||
}
|
||||
|
||||
DBUG_PRINT("info",("Starting reading binary log from master"));
|
||||
while (!io_slave_killed(thd,mi))
|
||||
{
|
||||
bool suppress_warnings= 0;
|
||||
thd->proc_info = "Requesting binlog dump";
|
||||
if (request_dump(mysql, mi, &suppress_warnings))
|
||||
{
|
||||
sql_print_error("Failed on request_dump()");
|
||||
if (io_slave_killed(thd,mi))
|
||||
{
|
||||
sql_print_information("Slave I/O thread killed while requesting master \
|
||||
dump");
|
||||
if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
|
||||
requesting master dump") ||
|
||||
try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
|
||||
reconnect_messages[SLAVE_RECON_ACT_DUMP]))
|
||||
goto err;
|
||||
}
|
||||
|
||||
mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
|
||||
thd->proc_info= "Waiting to reconnect after a failed binlog dump request";
|
||||
#ifdef SIGNAL_WITH_VIO_CLOSE
|
||||
thd->clear_active_vio();
|
||||
#endif
|
||||
end_server(mysql);
|
||||
/*
|
||||
First time retry immediately, assuming that we can recover
|
||||
right away - if first time fails, sleep between re-tries
|
||||
hopefuly the admin can fix the problem sometime
|
||||
*/
|
||||
if (retry_count++)
|
||||
{
|
||||
if (retry_count > master_retry_count)
|
||||
goto err; // Don't retry forever
|
||||
safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
|
||||
(void*)mi);
|
||||
}
|
||||
if (io_slave_killed(thd,mi))
|
||||
{
|
||||
sql_print_information("Slave I/O thread killed while retrying master \
|
||||
dump");
|
||||
goto err;
|
||||
}
|
||||
|
||||
thd->proc_info = "Reconnecting after a failed binlog dump request";
|
||||
if (!suppress_warnings) {
|
||||
char buf[256];
|
||||
my_snprintf(buf, sizeof(buf),
|
||||
"failed dump request, reconnecting to try again,"
|
||||
" log '%s' at postion %s",
|
||||
IO_RPL_LOG_NAME,
|
||||
llstr(mi->master_log_pos,llbuff));
|
||||
mi->report(WARNING_LEVEL, ER_SLAVE_MASTER_COM_FAILURE,
|
||||
ER(ER_SLAVE_MASTER_COM_FAILURE), "COM_BINLOG_DUMP", buf);
|
||||
}
|
||||
if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
|
||||
io_slave_killed(thd,mi))
|
||||
{
|
||||
sql_print_information("Slave I/O thread killed during or \
|
||||
after reconnect");
|
||||
goto err;
|
||||
}
|
||||
|
||||
goto connected;
|
||||
}
|
||||
DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_DUMP",
|
||||
if (!retry_count_dump)
|
||||
{
|
||||
retry_count_dump++;
|
||||
sql_print_information("Forcing to reconnect slave I/O thread");
|
||||
if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
|
||||
reconnect_messages[SLAVE_RECON_ACT_DUMP]))
|
||||
goto err;
|
||||
goto connected;
|
||||
});
|
||||
|
||||
while (!io_slave_killed(thd,mi))
|
||||
{
|
||||
ulong event_len;
|
||||
suppress_warnings= 0;
|
||||
/*
|
||||
We say "waiting" because read_event() will wait if there's nothing to
|
||||
read. But if there's something to read, it will not wait. The
|
||||
|
@ -2056,12 +2187,19 @@ after reconnect");
|
|||
*/
|
||||
thd->proc_info= "Waiting for master to send event";
|
||||
event_len= read_event(mysql, mi, &suppress_warnings);
|
||||
if (io_slave_killed(thd,mi))
|
||||
{
|
||||
if (global_system_variables.log_warnings)
|
||||
sql_print_information("Slave I/O thread killed while reading event");
|
||||
if (check_io_slave_killed(thd, mi, "Slave I/O thread killed while \
|
||||
reading event"))
|
||||
goto err;
|
||||
}
|
||||
DBUG_EXECUTE_IF("FORCE_SLAVE_TO_RECONNECT_EVENT",
|
||||
if (!retry_count_event)
|
||||
{
|
||||
retry_count_event++;
|
||||
sql_print_information("Forcing to reconnect slave I/O thread");
|
||||
if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
|
||||
reconnect_messages[SLAVE_RECON_ACT_EVENT]))
|
||||
goto err;
|
||||
goto connected;
|
||||
});
|
||||
|
||||
if (event_len == packet_error)
|
||||
{
|
||||
|
@ -2081,39 +2219,9 @@ max_allowed_packet",
|
|||
mysql_error(mysql));
|
||||
goto err;
|
||||
}
|
||||
mi->slave_running= MYSQL_SLAVE_RUN_NOT_CONNECT;
|
||||
thd->proc_info = "Waiting to reconnect after a failed master event read";
|
||||
#ifdef SIGNAL_WITH_VIO_CLOSE
|
||||
thd->clear_active_vio();
|
||||
#endif
|
||||
end_server(mysql);
|
||||
if (retry_count++)
|
||||
{
|
||||
if (retry_count > master_retry_count)
|
||||
goto err; // Don't retry forever
|
||||
safe_sleep(thd,mi->connect_retry,(CHECK_KILLED_FUNC)io_slave_killed,
|
||||
(void*) mi);
|
||||
}
|
||||
if (io_slave_killed(thd,mi))
|
||||
{
|
||||
if (global_system_variables.log_warnings)
|
||||
sql_print_information("Slave I/O thread killed while waiting to \
|
||||
reconnect after a failed read");
|
||||
if (try_to_reconnect(thd, mysql, mi, &retry_count, suppress_warnings,
|
||||
reconnect_messages[SLAVE_RECON_ACT_EVENT]))
|
||||
goto err;
|
||||
}
|
||||
thd->proc_info = "Reconnecting after a failed master event read";
|
||||
if (!suppress_warnings)
|
||||
sql_print_information("Slave I/O thread: Failed reading log event, \
|
||||
reconnecting to retry, log '%s' position %s", IO_RPL_LOG_NAME,
|
||||
llstr(mi->master_log_pos, llbuff));
|
||||
if (safe_reconnect(thd, mysql, mi, suppress_warnings) ||
|
||||
io_slave_killed(thd,mi))
|
||||
{
|
||||
if (global_system_variables.log_warnings)
|
||||
sql_print_information("Slave I/O thread killed during or after a \
|
||||
reconnect done to recover from failed read");
|
||||
goto err;
|
||||
}
|
||||
goto connected;
|
||||
} // if (event_len == packet_error)
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue