Bug#55375 Transaction bigger than max_binlog_cache_size crashes slave

When slave executes a transaction bigger than slave's max_binlog_cache_size,
slave will crash. It is caused by the assert that server should only roll back
the statement but not the whole transaction if the error ER_TRANS_CACHE_FULL 
happens. But slave sql thread always rollbacks the whole transaction when
an error happens.
            
Ather this patch, we always clear any error set in sql thread(it is different
from the error in 'SHOW SLAVE STATUS') and it is cleared before rolling back
the transaction.
This commit is contained in:
2010-10-09 15:05:43 +08:00
parent 9c82ecec37
commit b66825912a
5 changed files with 144 additions and 52 deletions

View file

@ -4,6 +4,8 @@ reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
start slave;
SET GLOBAL max_binlog_cache_size = 4096;
SET GLOBAL binlog_cache_size = 4096;
CREATE TABLE t1(a INT PRIMARY KEY, data VARCHAR(30000)) ENGINE=Innodb;
CREATE TABLE t2(a INT PRIMARY KEY, data VARCHAR(30000)) ENGINE=MyIsam;
CREATE TABLE t3(a INT PRIMARY KEY, data VARCHAR(30000)) ENGINE=Innodb;
@ -15,11 +17,14 @@ Got one of the listed errors
*** Single statement on non-transactional table ***
*** After WL#2687 the difference between STATEMENT/MIXED and ROW will not exist. ***
Got one of the listed errors
--source include/wait_for_slave_sql_error_and_skip.inc
include/start_slave.inc
*** Single statement on both transactional and non-transactional tables. ***
*** After WL#2687 we will be able to change the order of the tables. ***
Got one of the listed errors
SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
START SLAVE SQL_THREAD;
--source include/wait_for_slave_sql_error_and_skip.inc
SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1;
include/start_slave.inc
TRUNCATE TABLE t1;
TRUNCATE TABLE t2;
TRUNCATE TABLE t3;
@ -116,6 +121,37 @@ COMMIT;
BEGIN;
Got one of the listed errors
COMMIT;
--source include/wait_for_slave_sql_error_and_skip.inc
SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1;
include/start_slave.inc
########################################################################
# 8 - Bug#55375(Regression Bug) Transaction bigger than
# max_binlog_cache_size crashes slave
########################################################################
SET GLOBAL max_binlog_cache_size = 4096;
SET GLOBAL binlog_cache_size = 4096;
include/stop_slave.inc
include/start_slave.inc
CALL mtr.add_suppression("Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage.*");
CALL mtr.add_suppression("Writing one row to the row-based binary log failed.*");
TRUNCATE t1;
SET GLOBAL max_binlog_cache_size= ORIGINAL_VALUE;
SET GLOBAL binlog_cache_size= ORIGINAL_VALUE;
BEGIN;
Repeat statement 'INSERT INTO t1 VALUES($n, repeat("a", 32))' 128 times
COMMIT;
SELECT count(*) FROM t1;
count(*)
0
show binlog events in 'slave-bin.000001' from <binlog_start>;
Log_name Pos Event_type Server_id End_log_pos Info
SET GLOBAL max_binlog_cache_size= ORIGINAL_VALUE;
SET GLOBAL binlog_cache_size= ORIGINAL_VALUE;
include/stop_slave.inc
include/start_slave.inc
SELECT count(*) FROM t1;
count(*)
128
########################################################################################
# CLEAN
########################################################################################
@ -126,10 +162,3 @@ DROP TABLE IF EXISTS t4;
DROP TABLE IF EXISTS t5;
DROP TABLE IF EXISTS t6;
DROP PROCEDURE p1;
DROP TABLE t1;
DROP TABLE t2;
DROP TABLE t3;
DROP TABLE IF EXISTS t4;
DROP TABLE IF EXISTS t5;
DROP TABLE IF EXISTS t6;
DROP PROCEDURE p1;

View file

@ -1 +0,0 @@
--binlog_cache_size=4096 --max_binlog_cache_size=7680

View file

@ -30,6 +30,16 @@
--source include/not_embedded.inc
--source include/not_windows.inc
let $old_max_binlog_cache_size= query_get_value(SHOW VARIABLES LIKE "max_binlog_cache_size", Value, 1);
let $old_binlog_cache_size= query_get_value(SHOW VARIABLES LIKE "binlog_cache_size", Value, 1);
SET GLOBAL max_binlog_cache_size = 4096;
# Becuase of bug#55377, we have to set binlog_cache_size until the bug is
# fixed.
SET GLOBAL binlog_cache_size = 4096;
disconnect master;
connect (master,127.0.0.1,root,,test,$MASTER_MYPORT,);
CREATE TABLE t1(a INT PRIMARY KEY, data VARCHAR(30000)) ENGINE=Innodb;
CREATE TABLE t2(a INT PRIMARY KEY, data VARCHAR(30000)) ENGINE=MyIsam;
CREATE TABLE t3(a INT PRIMARY KEY, data VARCHAR(30000)) ENGINE=Innodb;
@ -57,25 +67,23 @@ if (`SELECT @@binlog_format = 'STATEMENT' || @@binlog_format = 'MIXED'`)
{
eval INSERT INTO t2 (a, data) VALUES (2,
CONCAT($data, $data, $data, $data, $data, $data));
# Below code fakes the result of 'ROW' mode.
--echo Got one of the listed errors
--echo --source include/wait_for_slave_sql_error_and_skip.inc
--echo include/start_slave.inc
}
if (`SELECT @@binlog_format = 'ROW'`)
{
--error ER_TRANS_CACHE_FULL, ER_ERROR_ON_WRITE
eval INSERT INTO t2 (a, data) VALUES (2,
CONCAT($data, $data, $data, $data, $data, $data));
connection slave;
--source include/wait_for_slave_sql_to_stop.inc
SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
START SLAVE SQL_THREAD;
--source include/wait_for_slave_sql_to_start.inc
# Incident event
--let $slave_sql_errno=1590
--source include/wait_for_slave_sql_error_and_skip.inc
}
--enable_warnings
--enable_query_log
connection master;
--disable_query_log
eval INSERT INTO t1 (a, data) VALUES (3, $data);
eval INSERT INTO t1 (a, data) VALUES (4, $data);
@ -93,11 +101,9 @@ eval UPDATE t2, t1 SET t2.data = CONCAT($data, $data, $data, $data),
t1.data = CONCAT($data, $data, $data, $data);
--enable_query_log
connection slave;
--source include/wait_for_slave_sql_to_stop.inc
SET GLOBAL SQL_SLAVE_SKIP_COUNTER = 1;
START SLAVE SQL_THREAD;
--source include/wait_for_slave_sql_to_start.inc
# Incident event
--let $slave_sql_errno=1590
--source include/wait_for_slave_sql_error_and_skip.inc
#--echo ########################################################################################
#--echo # 2 - BEGIN - IMPLICIT COMMIT by DDL
@ -368,8 +374,70 @@ BEGIN;
--enable_query_log
COMMIT;
# Incident event
--let $slave_sql_errno=1590
--source include/wait_for_slave_sql_error_and_skip.inc
sync_slave_with_master;
--echo ########################################################################
--echo # 8 - Bug#55375(Regression Bug) Transaction bigger than
--echo # max_binlog_cache_size crashes slave
--echo ########################################################################
SET GLOBAL max_binlog_cache_size = 4096;
SET GLOBAL binlog_cache_size = 4096;
source include/stop_slave.inc;
source include/start_slave.inc;
CALL mtr.add_suppression("Multi-statement transaction required more than 'max_binlog_cache_size' bytes of storage.*");
CALL mtr.add_suppression("Writing one row to the row-based binary log failed.*");
connection master;
TRUNCATE t1;
sync_slave_with_master;
--let binlog_start= query_get_value(SHOW MASTER STATUS, Position, 1)
--let binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
connection master;
--replace_result $old_max_binlog_cache_size ORIGINAL_VALUE
--eval SET GLOBAL max_binlog_cache_size= $old_max_binlog_cache_size
--replace_result $old_binlog_cache_size ORIGINAL_VALUE
--eval SET GLOBAL binlog_cache_size= $old_binlog_cache_size
disconnect master;
connect (master,127.0.0.1,root,,test,$MASTER_MYPORT,);
--let $n=128
BEGIN;
--disable_query_log
--echo Repeat statement 'INSERT INTO t1 VALUES(\$n, repeat("a", 32))' $n times
while ($n)
{
--eval INSERT INTO t1 VALUES ($n, repeat("a", 32))
--dec $n
}
--enable_query_log
COMMIT;
connection slave;
--source include/wait_for_slave_sql_to_stop.inc
--let $slave_sql_errno= 1197
source include/wait_for_slave_sql_error.inc;
SELECT count(*) FROM t1;
source include/show_binlog_events.inc;
--replace_result $old_max_binlog_cache_size ORIGINAL_VALUE
--eval SET GLOBAL max_binlog_cache_size= $old_max_binlog_cache_size
--replace_result $old_binlog_cache_size ORIGINAL_VALUE
--eval SET GLOBAL binlog_cache_size= $old_binlog_cache_size
source include/stop_slave.inc;
source include/start_slave.inc;
connection master;
sync_slave_with_master;
SELECT count(*) FROM t1;
--echo ########################################################################################
--echo # CLEAN
@ -384,12 +452,5 @@ DROP TABLE IF EXISTS t4;
DROP TABLE IF EXISTS t5;
DROP TABLE IF EXISTS t6;
DROP PROCEDURE p1;
connection slave;
DROP TABLE t1;
DROP TABLE t2;
DROP TABLE t3;
DROP TABLE IF EXISTS t4;
DROP TABLE IF EXISTS t5;
DROP TABLE IF EXISTS t6;
DROP PROCEDURE p1;
--enable_warnings
source include/master-slave-end.inc;

View file

@ -144,16 +144,21 @@ static void inline slave_rows_error_report(enum loglevel level, int ha_error,
len= my_snprintf(slider, buff_end - slider,
" %s, Error_code: %d;", err->msg, err->code);
}
rli->report(level, thd->is_error()? thd->main_da.sql_errno() : 0,
"Could not execute %s event on table %s.%s;"
"%s handler error %s; "
"the event's master log %s, end_log_pos %lu",
type, table->s->db.str,
table->s->table_name.str,
buff,
handler_error == NULL? "<unknown>" : handler_error,
log_name, pos);
if (ha_error != 0)
rli->report(level, thd->is_error() ? thd->main_da.sql_errno() : 0,
"Could not execute %s event on table %s.%s;"
"%s handler error %s; "
"the event's master log %s, end_log_pos %lu",
type, table->s->db.str, table->s->table_name.str,
buff, handler_error == NULL ? "<unknown>" : handler_error,
log_name, pos);
else
rli->report(level, thd->is_error() ? thd->main_da.sql_errno() : 0,
"Could not execute %s event on table %s.%s;"
"%s the event's master log %s, end_log_pos %lu",
type, table->s->db.str, table->s->table_name.str,
buff, log_name, pos);
}
#endif
@ -7649,7 +7654,6 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
get_type_str(),
RPL_LOG_NAME, (ulong) log_pos);
thd->reset_current_stmt_binlog_row_based();
const_cast<Relay_log_info*>(rli)->cleanup_context(thd, error);
thd->is_slave_error= 1;
DBUG_RETURN(error);
}
@ -7680,14 +7684,12 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
const_cast<Relay_log_info*>(rli)->last_event_start_time= my_time(0);
}
if (get_flags(STMT_END_F))
if ((error= rows_event_stmt_cleanup(rli, thd)))
rli->report(ERROR_LEVEL, error,
"Error in %s event: commit of row events failed, "
"table `%s`.`%s`",
get_type_str(), m_table->s->db.str,
m_table->s->table_name.str);
if (get_flags(STMT_END_F) && (error= rows_event_stmt_cleanup(rli, thd)))
slave_rows_error_report(ERROR_LEVEL,
thd->is_error() ? 0 : error,
rli, thd, table,
get_type_str(),
RPL_LOG_NAME, (ulong) log_pos);
DBUG_RETURN(error);
}

View file

@ -2343,7 +2343,7 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli)
else
{
exec_res= 0;
end_trans(thd, ROLLBACK);
rli->cleanup_context(thd, 1);
/* chance for concurrent connection to get more locks */
safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
(CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
@ -3158,6 +3158,7 @@ the slave SQL thread with \"SLAVE START\". We stopped at log \
request is detected only by the present function, not by events), so we
must "proactively" clear playgrounds:
*/
thd->clear_error();
rli->cleanup_context(thd, 1);
/*
Some extra safety, which should not been needed (normally, event deletion