MDEV-35806: Error in read_log_event() corrupts relay log writer, crashes server

In Log_event::read_log_event(), don't use IO_CACHE::error of the relay log's
IO_CACHE to signal an error back to the caller. When reading the active
relay log, this flag is also being used by the IO thread, and setting it can
randomly cause the IO thread to wrongly detect IO error on writing and
permanently disable the relay log.

This was seen sporadically in test case rpl.rpl_from_mysql80. The read
error set by the SQL thread in the IO_CACHE would be interpreted as a
write error by the IO thread, which would cause it to throw a fatal
error and close the relay log. And this would later cause CHANGE
MASTER to try to purge a closed relay log, resulting in nullptr crash.

Also fix another sporadic failure in rpl.rpl_from_mysql80 where the outout
of MASTER_POS_WAIT() depended on timing of SQL and IO thread.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
This commit is contained in:
Kristian Nielsen 2025-01-10 00:06:25 +01:00
parent 43233fe469
commit 777c351d94
10 changed files with 50 additions and 28 deletions

View file

@ -2698,6 +2698,7 @@ static Exit_status check_header(IO_CACHE* file,
uchar buf[PROBE_HEADER_LEN];
my_off_t tmp_pos, pos;
MY_STAT my_file_stat;
int read_error;
delete glob_description_event;
if (!(glob_description_event= new Format_description_log_event(3)))
@ -2798,7 +2799,8 @@ static Exit_status check_header(IO_CACHE* file,
Format_description_log_event *new_description_event;
my_b_seek(file, tmp_pos); /* seek back to event's start */
if (!(new_description_event= (Format_description_log_event*)
Log_event::read_log_event(file, glob_description_event,
Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum)))
/* EOF can't be hit here normally, so it's a real error */
{
@ -2831,7 +2833,8 @@ static Exit_status check_header(IO_CACHE* file,
{
Log_event *ev;
my_b_seek(file, tmp_pos); /* seek back to event's start */
if (!(ev= Log_event::read_log_event(file, glob_description_event,
if (!(ev= Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum)))
{
/* EOF can't be hit here normally, so it's a real error */
@ -2945,8 +2948,10 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info,
{
char llbuff[21];
my_off_t old_off = my_b_tell(file);
int read_error;
Log_event* ev = Log_event::read_log_event(file, glob_description_event,
Log_event* ev = Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum);
if (!ev)
{
@ -2955,15 +2960,15 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info,
about a corruption, but treat it as EOF and move to the next binlog.
*/
if (glob_description_event->flags & LOG_EVENT_BINLOG_IN_USE_F)
file->error= 0;
else if (file->error)
read_error= 0;
else if (read_error)
{
error("Could not read entry at offset %s: "
"Error in log format or read error.",
llstr(old_off,llbuff));
goto err;
}
// else file->error == 0 means EOF, that's OK, we break in this case
// else read_error == 0 means EOF, that's OK, we break in this case
/*
Emit a warning in the event that we finished processing input

View file

@ -11,8 +11,6 @@ START SLAVE IO_THREAD;
include/wait_for_slave_io_to_start.inc
START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178;
SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60);
MASTER_POS_WAIT('master-bin.000001', 1178, 60)
NULL
SELECT * FROM t1 ORDER BY a;
a b c
1 0

View file

@ -81,7 +81,9 @@ START SLAVE IO_THREAD;
# The position 1178 is the start of: INSERT INTO t1 VALUES (4, 0, 'skip');
# After that comes unknown MySQL 8.0 events, which we test error for below.
START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178;
--disable_result_log
SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60);
--enable_result_log
SELECT * FROM t1 ORDER BY a;
--source include/wait_for_slave_sql_to_stop.inc

View file

@ -5390,6 +5390,13 @@ int MYSQL_BIN_LOG::new_file_impl()
DBUG_RETURN(error);
}
if (log_file.error)
{
my_error(ER_SLAVE_RELAY_LOG_WRITE_FAILURE, MYF(ME_ERROR_LOG),
"Writing to the relay log prevented by an earlier error");
DBUG_RETURN(1);
}
mysql_mutex_lock(&LOCK_index);
/* Reuse old name if not binlog and not update log */
@ -11385,7 +11392,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
binlog_checkpoint_found= false;
for (round= 1;;)
{
while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log,
int error;
while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log, &error,
fdle, opt_master_verify_checksum))
&& ev->is_valid())
{
@ -11671,7 +11679,8 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
return 1;
}
if ((ev= Log_event::read_log_event(&log, &fdle,
int read_error;
if ((ev= Log_event::read_log_event(&log, &read_error, &fdle,
opt_master_verify_checksum)) &&
ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
{
@ -11895,10 +11904,11 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
Format_description_log_event *fdle;
Log_event *ev;
const char *errormsg = NULL;
int read_error;
*out_gtid_list= NULL;
if (!(ev= Log_event::read_log_event(cache, &init_fdle,
if (!(ev= Log_event::read_log_event(cache, &read_error, &init_fdle,
opt_master_verify_checksum)) ||
ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
{
@ -11914,7 +11924,8 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
{
Log_event_type typ;
ev= Log_event::read_log_event(cache, fdle, opt_master_verify_checksum);
ev= Log_event::read_log_event(cache, &read_error, fdle,
opt_master_verify_checksum);
if (!ev)
{
errormsg= "Could not read GTID list event while looking for GTID "

View file

@ -916,7 +916,7 @@ int Log_event::read_log_event(IO_CACHE* file, String* packet,
DBUG_RETURN(0);
}
Log_event* Log_event::read_log_event(IO_CACHE* file,
Log_event* Log_event::read_log_event(IO_CACHE* file, int *out_error,
const Format_description_log_event *fdle,
my_bool crc_check,
my_bool print_errors)
@ -927,6 +927,7 @@ Log_event* Log_event::read_log_event(IO_CACHE* file,
const char *error= 0;
Log_event *res= 0;
*out_error= 0;
switch (read_log_event(file, &event, fdle, BINLOG_CHECKSUM_ALG_OFF))
{
case 0:
@ -976,14 +977,14 @@ err:
#endif
/*
The SQL slave thread will check if file->error<0 to know
The SQL slave thread will check *out_error to know
if there was an I/O error. Even if there is no "low-level" I/O errors
with 'file', any of the high-level above errors is worrying
enough to stop the SQL thread now ; as we are skipping the current event,
going on with reading and successfully executing other events can
only corrupt the slave's databases. So stop.
*/
file->error= -1;
*out_error= 1;
#ifndef MYSQL_CLIENT
if (!print_errors)

View file

@ -1344,7 +1344,7 @@ public:
we detect the event's type, then call the specific event's
constructor and pass description_event as an argument.
*/
static Log_event* read_log_event(IO_CACHE* file,
static Log_event* read_log_event(IO_CACHE* file, int *out_error,
const Format_description_log_event
*description_event,
my_bool crc_check,

View file

@ -1037,14 +1037,15 @@ do_retry:
/* The loop is here so we can try again the next relay log file on EOF. */
for (;;)
{
int error;
old_offset= cur_offset;
ev= Log_event::read_log_event(&rlog, description_event,
ev= Log_event::read_log_event(&rlog, &error, description_event,
opt_slave_sql_verify_checksum);
cur_offset= my_b_tell(&rlog);
if (ev)
break;
if (unlikely(rlog.error < 0))
if (unlikely(error))
{
errmsg= "slave SQL thread aborted because of I/O error";
err= 1;

View file

@ -544,12 +544,13 @@ read_relay_log_description_event(IO_CACHE *cur_log, ulonglong start_pos,
if (my_b_tell(cur_log) >= start_pos)
break;
if (!(ev= Log_event::read_log_event(cur_log, fdev,
int read_error;
if (!(ev= Log_event::read_log_event(cur_log, &read_error, fdev,
opt_slave_sql_verify_checksum)))
{
DBUG_PRINT("info",("could not read event, cur_log->error=%d",
cur_log->error));
if (cur_log->error) /* not EOF */
DBUG_PRINT("info",("could not read event, read_error=%d",
read_error));
if (read_error) /* not EOF */
{
*errmsg= "I/O error reading event at position 4";
delete fdev;

View file

@ -7845,7 +7845,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
MYSQL_BIN_LOG::open() will write the buffered description event.
*/
old_pos= rli->event_relay_log_pos;
if ((ev= Log_event::read_log_event(cur_log,
int error;
if ((ev= Log_event::read_log_event(cur_log, &error,
rli->relay_log.description_event_for_exec,
opt_slave_sql_verify_checksum)))
@ -7862,8 +7863,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
DBUG_RETURN(ev);
}
if (opt_reckless_slave) // For mysql-test
cur_log->error = 0;
if (unlikely(cur_log->error < 0))
error = 0;
if (unlikely(error))
{
errmsg = "slave SQL thread aborted because of I/O error";
if (hot_log)

View file

@ -4188,7 +4188,8 @@ bool mysql_show_binlog_events(THD* thd)
my_off_t scan_pos = BIN_LOG_HEADER_SIZE;
while (scan_pos < pos)
{
ev= Log_event::read_log_event(&log, description_event,
int error;
ev= Log_event::read_log_event(&log, &error, description_event,
opt_master_verify_checksum);
scan_pos = my_b_tell(&log);
if (ev == NULL || !ev->is_valid())
@ -4263,8 +4264,9 @@ bool mysql_show_binlog_events(THD* thd)
writing about this in the server log would be confusing as it isn't
related to server operational status.
*/
int error;
for (event_count = 0;
(ev = Log_event::read_log_event(&log,
(ev = Log_event::read_log_event(&log, &error,
description_event,
(opt_master_verify_checksum ||
verify_checksum_once), false)); )
@ -4308,7 +4310,7 @@ bool mysql_show_binlog_events(THD* thd)
break;
}
if (unlikely(event_count < unit->lim.get_select_limit() && log.error))
if (unlikely(event_count < unit->lim.get_select_limit() && error))
{
errmsg = "Wrong offset or I/O error";
mysql_mutex_unlock(log_lock);