backporting fixes of bug@45940 to 5.1-rpl+2 to cover failures rpl_heartbeat_* as well.

This commit is contained in:
Andrei Elkin 2009-10-09 16:26:37 +03:00
parent 75a4dd3e6a
commit 124ad21d01
11 changed files with 342 additions and 170 deletions

View file

@ -1,17 +0,0 @@
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
create table t1 (words varchar(20)) engine=myisam;
load data infile '../../std_data/words.dat' into table t1 (words);
select count(*) from t1;
count(*)
70
select count(*) from t1;
count(*)
70
drop table t1;
include/stop_slave.inc
drop table t1;

View file

@ -0,0 +1,61 @@
stop slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
create table tm (a int auto_increment primary key) engine=myisam;
create table ti (a int auto_increment primary key) engine=innodb;
set @@global.debug="+d,stop_slave_middle_group";
begin;
insert into ti set a=null;
insert into tm set a=null;
commit;
SELECT "NO" AS Last_SQL_Error, @check as `true`;
Last_SQL_Error true
NO 1
select count(*) as one from tm;
one
1
select count(*) as one from ti;
one
1
set @@global.debug="-d";
include/start_slave.inc
truncate table tm;
truncate table ti;
set @@global.debug="+d,stop_slave_middle_group";
set @@global.debug="+d,incomplete_group_in_relay_log";
begin;
insert into ti set a=null;
insert into tm set a=null;
commit;
SELECT "Fatal error: ... The slave SQL is stopped, leaving the current group of events unfinished with a non-transaction table changed. If the group consists solely of Row-based events, you can try restarting the slave with --slave-exec-mode=IDEMPOTENT, which ignores duplicate key, key not found, and similar errors (see documentation for details)." AS Last_SQL_Error, @check as `true`;
Last_SQL_Error true
Fatal error: ... The slave SQL is stopped, leaving the current group of events unfinished with a non-transaction table changed. If the group consists solely of Row-based events, you can try restarting the slave with --slave-exec-mode=IDEMPOTENT, which ignores duplicate key, key not found, and similar errors (see documentation for details). 1
select count(*) as one from tm;
one
1
select count(*) as zero from ti;
zero
0
set @@global.debug="-d";
stop slave;
truncate table tm;
include/start_slave.inc
set @@global.debug="+d,stop_slave_middle_group";
set @@global.debug="+d,incomplete_group_in_relay_log";
update tm as t1, ti as t2 set t1.a=t1.a * 2, t2.a=t2.a * 2;
SELECT "Fatal error: ... The slave SQL is stopped, leaving the current group of events unfinished with a non-transaction table changed. If the group consists solely of Row-based events, you can try restarting the slave with --slave-exec-mode=IDEMPOTENT, which ignores duplicate key, key not found, and similar errors (see documentation for details)." AS Last_SQL_Error, @check as `true`;
Last_SQL_Error true
Fatal error: ... The slave SQL is stopped, leaving the current group of events unfinished with a non-transaction table changed. If the group consists solely of Row-based events, you can try restarting the slave with --slave-exec-mode=IDEMPOTENT, which ignores duplicate key, key not found, and similar errors (see documentation for details). 1
select max(a) as two from tm;
two
2
select max(a) as one from ti;
one
1
set @@global.debug="-d";
drop table tm, ti;
include/stop_slave.inc
drop table tm, ti;

View file

@ -1 +0,0 @@
--loose-binlog-row-event-max-size=256

View file

@ -1 +0,0 @@
--loose-debug=d,STOP_SLAVE_after_first_Rows_event

View file

@ -1,32 +0,0 @@
-- source include/have_binlog_format_row.inc
-- source include/have_debug.inc
-- source include/master-slave.inc
# master is asked to create small Rows events: if only one event is
# created, stopping slave at the end of that one will show no bug, we
# need at least two (and stop after first); in this test we use three.
connection master;
create table t1 (words varchar(20)) engine=myisam;
load data infile '../../std_data/words.dat' into table t1 (words);
select count(*) from t1;
save_master_pos;
connection slave;
# slave will automatically stop the sql thread thanks to the .opt
# file; it will initiate the stop request after the first
# Rows_log_event (out of 3) but should wait until the last one is
# executed before stopping.
source include/wait_for_slave_sql_to_stop.inc;
# check that we inserted all rows (waited until the last Rows event)
select count(*) from t1;
connection master;
drop table t1;
connection slave; # slave SQL thread is stopped
source include/stop_slave.inc;
drop table t1;

View file

@ -0,0 +1,149 @@
-- source include/have_debug.inc
-- source include/master-slave.inc
-- source include/have_innodb.inc
# Proving that stopping in the middle of applying a group of events
# does not have immediate effect if a non-transaction table has been changed.
# The slave sql thread has to try to finish applying first.
# The tests rely on simulation of the killed status.
# The matter of testing correlates to some of `rpl_start_stop_slave' that does
# not require `have_debug'.
connection master;
create table tm (a int auto_increment primary key) engine=myisam;
create table ti (a int auto_increment primary key) engine=innodb;
sync_slave_with_master;
set @@global.debug="+d,stop_slave_middle_group";
connection master;
begin;
insert into ti set a=null;
insert into tm set a=null; # to simulate killed status on the slave
commit;
connection slave;
# slave will catch the killed status but won't shut down immediately
# only after the whole group has done (commit)
source include/wait_for_slave_sql_to_stop.inc;
# checking: no error and the group is finished
let $error= query_get_value("SHOW SLAVE STATUS", Last_SQL_Error, 1);
let $read = query_get_value("SHOW SLAVE STATUS", Read_Master_Log_Pos, 1);
let $exec = query_get_value("SHOW SLAVE STATUS", Exec_Master_Log_Pos, 1);
--disable_query_log
eval SELECT $read = $exec into @check;
--enable_query_log
eval SELECT "NO$error" AS Last_SQL_Error, @check as `true`;
select count(*) as one from tm;
select count(*) as one from ti;
set @@global.debug="-d";
#
# bug#45940 issues around rli->last_event_start_time
# Testing of slave stopped after it had waited (in vain) for
# the group be finished.
# It could not be finished because of simulation of failure to
# receive the terminal part
# The test relay on simulation of the incomplete group in the relay log
# Two cases are verified: a mixed transacton and a mixed multi-table update.
#
# The mixed transacton.
#
source include/start_slave.inc;
connection master;
truncate table tm; # cleanup of former tests
truncate table ti;
#connection slave;
sync_slave_with_master;
set @@global.debug="+d,stop_slave_middle_group";
set @@global.debug="+d,incomplete_group_in_relay_log";
connection master;
begin;
insert into ti set a=null;
insert into tm set a=null;
commit;
connection slave;
# slave will catch the killed status, won't shut down immediately
# but does it eventually having the whole group unfinished (not committed)
source include/wait_for_slave_sql_to_stop.inc;
# checking: the error and group unfinished
let $error= query_get_value("SHOW SLAVE STATUS", Last_SQL_Error, 1);
let $read = query_get_value("SHOW SLAVE STATUS", Read_Master_Log_Pos, 1);
let $exec = query_get_value("SHOW SLAVE STATUS", Exec_Master_Log_Pos, 1);
--disable_query_log
eval SELECT $read - $exec > 0 into @check;
--enable_query_log
eval SELECT "$error" AS Last_SQL_Error, @check as `true`;
select count(*) as one from tm;
select count(*) as zero from ti;
set @@global.debug="-d";
#
# The mixed multi-table update
#
stop slave;
truncate table tm;
source include/start_slave.inc;
connection master;
#connection slave;
sync_slave_with_master;
set @@global.debug="+d,stop_slave_middle_group";
set @@global.debug="+d,incomplete_group_in_relay_log";
connection master;
update tm as t1, ti as t2 set t1.a=t1.a * 2, t2.a=t2.a * 2;
connection slave;
# slave will catch the killed status, won't shut down immediately
# but does it eventually having the whole group unfinished (not committed)
#
source include/wait_for_slave_sql_to_stop.inc;
# checking: the error and group unfinished
let $error= query_get_value("SHOW SLAVE STATUS", Last_SQL_Error, 1);
let $read = query_get_value("SHOW SLAVE STATUS", Read_Master_Log_Pos, 1);
let $exec = query_get_value("SHOW SLAVE STATUS", Exec_Master_Log_Pos, 1);
--disable_query_log
eval SELECT $read - $exec > 0 into @check;
--enable_query_log
eval SELECT "$error" AS Last_SQL_Error, @check as `true`;
select max(a) as two from tm;
select max(a) as one from ti;
set @@global.debug="-d";
#
# clean-up
#
connection master;
drop table tm, ti;
connection slave; # slave SQL thread is stopped
source include/stop_slave.inc;
drop table tm, ti;

View file

@ -3272,6 +3272,21 @@ Default database: '%s'. Query: '%s'",
*/
} /* End of if (db_ok(... */
{/**
The following failure injecion works in cooperation with tests
setting @@global.debug= 'd,stop_slave_middle_group'.
The sql thread receives the killed status and will proceed
to shutdown trying to finish incomplete events group.
*/
DBUG_EXECUTE_IF("stop_slave_middle_group",
if (strcmp("COMMIT", query) != 0 &&
strcmp("BEGIN", query) != 0)
{
if (thd->transaction.all.modified_non_trans_table)
const_cast<Relay_log_info*>(rli)->abort_slave= 1;
};);
}
end:
/*
Probably we have set thd->query, thd->db, thd->catalog to point to places
@ -7475,8 +7490,16 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
thd->transaction.stmt.modified_non_trans_table= TRUE;
} // row processing loop
DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
{/**
The following failure injecion works in cooperation with tests
setting @@global.debug= 'd,stop_slave_middle_group'.
The sql thread receives the killed status and will proceed
to shutdown trying to finish incomplete events group.
*/
DBUG_EXECUTE_IF("stop_slave_middle_group",
if (thd->transaction.all.modified_non_trans_table)
const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
}
if ((error= do_after_row_operations(rli, error)) &&
ignored_error_code(convert_handler_error(error, thd, table)))
@ -7513,32 +7536,6 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
const_cast<Relay_log_info*>(rli)->cleanup_context(thd, error);
thd->is_slave_error= 1;
}
/*
This code would ideally be placed in do_update_pos() instead, but
since we have no access to table there, we do the setting of
last_event_start_time here instead.
*/
else if (table && (table->s->primary_key == MAX_KEY) &&
!cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS)
{
/*
------------ Temporary fix until WL#2975 is implemented ---------
This event is not the last one (no STMT_END_F). If we stop now
(in case of terminate_slave_thread()), how will we restart? We
have to restart from Table_map_log_event, but as this table is
not transactional, the rows already inserted will still be
present, and idempotency is not guaranteed (no PK) so we risk
that repeating leads to double insert. So we desperately try to
continue, hope we'll eventually leave this buggy situation (by
executing the final Rows_log_event). If we are in a hopeless
wait (reached end of last relay log and nothing gets appended
there), we timeout after one minute, and notify DBA about the
problem. When WL#2975 is implemented, just remove the member
Relay_log_info::last_event_start_time and all its occurrences.
*/
const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
}
DBUG_RETURN(error);
}

View file

@ -226,7 +226,7 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
row_start= row_end;
}
DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
DBUG_EXECUTE_IF("stop_slave_middle_group",
const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
error= do_after_row_operations(table, error);
if (!ev->cache_stmt)
@ -269,34 +269,6 @@ Old_rows_log_event::do_apply_event(Old_rows_log_event *ev, const Relay_log_info
DBUG_RETURN(error);
}
/*
This code would ideally be placed in do_update_pos() instead, but
since we have no access to table there, we do the setting of
last_event_start_time here instead.
*/
if (table && (table->s->primary_key == MAX_KEY) &&
!ev->cache_stmt &&
ev->get_flags(Old_rows_log_event::STMT_END_F) == Old_rows_log_event::RLE_NO_FLAGS)
{
/*
------------ Temporary fix until WL#2975 is implemented ---------
This event is not the last one (no STMT_END_F). If we stop now
(in case of terminate_slave_thread()), how will we restart? We
have to restart from Table_map_log_event, but as this table is
not transactional, the rows already inserted will still be
present, and idempotency is not guaranteed (no PK) so we risk
that repeating leads to double insert. So we desperately try to
continue, hope we'll eventually leave this buggy situation (by
executing the final Old_rows_log_event). If we are in a hopeless
wait (reached end of last relay log and nothing gets appended
there), we timeout after one minute, and notify DBA about the
problem. When WL#2975 is implemented, just remove the member
st_relay_log_info::last_event_start_time and all its occurences.
*/
const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
}
DBUG_RETURN(0);
}
#endif
@ -1744,7 +1716,7 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
} // row processing loop
DBUG_EXECUTE_IF("STOP_SLAVE_after_first_Rows_event",
DBUG_EXECUTE_IF("stop_slave_middle_group",
const_cast<Relay_log_info*>(rli)->abort_slave= 1;);
error= do_after_row_operations(rli, error);
if (!cache_stmt)
@ -1787,33 +1759,6 @@ int Old_rows_log_event::do_apply_event(Relay_log_info const *rli)
DBUG_RETURN(error);
}
/*
This code would ideally be placed in do_update_pos() instead, but
since we have no access to table there, we do the setting of
last_event_start_time here instead.
*/
if (table && (table->s->primary_key == MAX_KEY) &&
!cache_stmt && get_flags(STMT_END_F) == RLE_NO_FLAGS)
{
/*
------------ Temporary fix until WL#2975 is implemented ---------
This event is not the last one (no STMT_END_F). If we stop now
(in case of terminate_slave_thread()), how will we restart? We
have to restart from Table_map_log_event, but as this table is
not transactional, the rows already inserted will still be
present, and idempotency is not guaranteed (no PK) so we risk
that repeating leads to double insert. So we desperately try to
continue, hope we'll eventually leave this buggy situation (by
executing the final Old_rows_log_event). If we are in a hopeless
wait (reached end of last relay log and nothing gets appended
there), we timeout after one minute, and notify DBA about the
problem. When WL#2975 is implemented, just remove the member
Relay_log_info::last_event_start_time and all its occurrences.
*/
const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
}
DBUG_RETURN(0);
}

View file

@ -1195,7 +1195,6 @@ void Relay_log_info::cleanup_context(THD *thd, bool error)
*/
thd->options&= ~OPTION_NO_FOREIGN_KEY_CHECKS;
thd->options&= ~OPTION_RELAXED_UNIQUE_CHECKS;
last_event_start_time= 0;
DBUG_VOID_RETURN;
}

View file

@ -349,12 +349,10 @@ public:
void clear_tables_to_lock();
/*
Used by row-based replication to detect that it should not stop at
this event, but give it a chance to send more events. The time
where the last event inside a group started is stored here. If the
variable is zero, we are not in a group (but may be in a
transaction).
*/
Used to defer stopping the SQL thread to give it a chance
to finish up the current group of events.
The timestamp is set and reset in @c sql_slave_killed().
*/
time_t last_event_start_time;
/**

View file

@ -49,6 +49,10 @@
#define FLAGSTR(V,F) ((V)&(F)?#F" ":"")
#define MAX_SLAVE_RETRY_PAUSE 5
/*
a parameter of sql_slave_killed() to defer the killed status
*/
#define SLAVE_WAIT_GROUP_DONE 60
bool use_slave_mask = 0;
MY_BITMAP slave_error_mask;
char slave_skip_error_names[SHOW_VAR_FUNC_BUFF_SIZE];
@ -790,46 +794,95 @@ static bool io_slave_killed(THD* thd, Master_info* mi)
DBUG_RETURN(mi->abort_slave || abort_loop || thd->killed);
}
/**
The function analyzes a possible killed status and makes
a decision whether to accept it or not.
Normally upon accepting the sql thread goes to shutdown.
In the event of deffering decision @rli->last_event_start_time waiting
timer is set to force the killed status be accepted upon its expiration.
@param thd pointer to a THD instance
@param rli pointer to Relay_log_info instance
@return TRUE the killed status is recognized, FALSE a possible killed
status is deferred.
*/
static bool sql_slave_killed(THD* thd, Relay_log_info* rli)
{
bool ret= FALSE;
DBUG_ENTER("sql_slave_killed");
DBUG_ASSERT(rli->sql_thd == thd);
DBUG_ASSERT(rli->slave_running == 1);// tracking buffer overrun
if (abort_loop || thd->killed || rli->abort_slave)
{
if (rli->abort_slave && rli->is_in_group() &&
thd->transaction.all.modified_non_trans_table)
DBUG_RETURN(0);
/*
If we are in an unsafe situation (stopping could corrupt replication),
we give one minute to the slave SQL thread of grace before really
terminating, in the hope that it will be able to read more events and
the unsafe situation will soon be left. Note that this one minute starts
from the last time anything happened in the slave SQL thread. So it's
really one minute of idleness, we don't timeout if the slave SQL thread
is actively working.
*/
if (rli->last_event_start_time == 0)
DBUG_RETURN(1);
DBUG_PRINT("info", ("Slave SQL thread is in an unsafe situation, giving "
"it some grace period"));
if (difftime(time(0), rli->last_event_start_time) > 60)
if (thd->transaction.all.modified_non_trans_table && rli->is_in_group())
{
rli->report(ERROR_LEVEL, 0,
"SQL thread had to stop in an unsafe situation, in "
"the middle of applying updates to a "
"non-transactional table without any primary key. "
"There is a risk of duplicate updates when the slave "
"SQL thread is restarted. Please check your tables' "
"contents after restart.");
DBUG_RETURN(1);
char msg_stopped[]=
"... The slave SQL is stopped, leaving the current group "
"of events unfinished with a non-transaction table changed. "
"If the group consists solely of Row-based events, you can try "
"restarting the slave with --slave-exec-mode=IDEMPOTENT, which "
"ignores duplicate key, key not found, and similar errors (see "
"documentation for details).";
if (rli->abort_slave)
{
DBUG_PRINT("info", ("Slave SQL thread is being stopped in the middle of"
" a group having updated a non-trans table, giving"
" it some grace period"));
/*
Slave sql thread shutdown in face of unfinished group modified
Non-trans table is handled via a timer. The slave may eventually
give out to complete the current group and in that case there
might be issues at consequent slave restart, see the error message.
WL#2975 offers a robust solution requiring to store the last exectuted
event's coordinates along with the group's coordianates
instead of waiting with @c last_event_start_time the timer.
*/
if (rli->last_event_start_time == 0)
rli->last_event_start_time= my_time(0);
ret= difftime(my_time(0), rli->last_event_start_time) <=
SLAVE_WAIT_GROUP_DONE ? FALSE : TRUE;
DBUG_EXECUTE_IF("stop_slave_middle_group",
DBUG_EXECUTE_IF("incomplete_group_in_relay_log",
ret= TRUE;);); // time is over
if (ret == 0)
{
rli->report(WARNING_LEVEL, 0,
"slave SQL thread is being stopped in the middle "
"of applying of a group having updated a non-transaction "
"table; waiting for the group completion ... ");
}
else
{
rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR,
ER(ER_SLAVE_FATAL_ERROR), msg_stopped);
}
}
else
{
ret= TRUE;
rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, ER(ER_SLAVE_FATAL_ERROR),
msg_stopped);
}
}
else
{
ret= TRUE;
}
}
DBUG_RETURN(0);
if (ret)
rli->last_event_start_time= 0;
DBUG_RETURN(ret);
}
/*
skip_load_data_infile()
@ -2525,6 +2578,27 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
delete ev;
DBUG_RETURN(1);
}
{ /**
The following failure injecion works in cooperation with tests
setting @@global.debug= 'd,incomplete_group_in_relay_log'.
Xid or Commit events are not executed to force the slave sql
read hanging if the realy log does not have any more events.
*/
DBUG_EXECUTE_IF("incomplete_group_in_relay_log",
if ((ev->get_type_code() == XID_EVENT) ||
((ev->get_type_code() == QUERY_EVENT) &&
strcmp("COMMIT", ((Query_log_event *) ev)->query) == 0))
{
DBUG_ASSERT(thd->transaction.all.modified_non_trans_table);
rli->abort_slave= 1;
pthread_mutex_unlock(&rli->data_lock);
delete ev;
rli->inc_event_relay_log_pos();
DBUG_RETURN(0);
};);
}
exec_res= apply_event_and_update_pos(ev, thd, rli, TRUE);
/*
@ -3953,7 +4027,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
goto skip_relay_logging;
}
break;
default:
inc_pos= event_len;
break;