mariadb/mysql-test/suite/galera/t/galera_bf_kill.test
Marko Mäkelä ddd7d5d8e3 MDEV-24035 Failing assertion: UT_LIST_GET_LEN(lock.trx_locks) == 0 causing disruption and replication failure
Under unknown circumstances, the SQL layer may wrongly disregard an
invocation of thd_mark_transaction_to_rollback() when an InnoDB
transaction had been aborted (rolled back) due to one of the following errors:
* HA_ERR_LOCK_DEADLOCK
* HA_ERR_RECORD_CHANGED (if innodb_snapshot_isolation=ON)
* HA_ERR_LOCK_WAIT_TIMEOUT (if innodb_rollback_on_timeout=ON)

Such an error used to cause a crash of InnoDB during transaction commit.
These changes aim to catch and report the error earlier, so that not only
can this crash be avoided, but the original root cause can also be found
and fixed more easily later.

The idea of this fix is from Michael 'Monty' Widenius.

HA_ERR_ROLLBACK: A new error code that will be translated into
ER_ROLLBACK_ONLY, signalling that the current transaction
has been aborted and the only allowed action is ROLLBACK.

trx_t::state: Add TRX_STATE_ABORTED that is like
TRX_STATE_NOT_STARTED, but noting that the transaction had been
rolled back and aborted.

trx_t::is_started(): Replaces trx_is_started().

ha_innobase: Check the transaction state in various places.
Simplify the logic around SAVEPOINT.

ha_innobase::is_valid_trx(): Replaces ha_innobase::is_read_only().

The InnoDB logic around transaction savepoints, commit, and rollback
was unnecessarily complex and might have contributed to this
inconsistency. So, we are simplifying that logic as well.

trx_savept_t: Replace with const undo_no_t*. When we rollback to
a savepoint, all we need to know is the number of undo log records
that must survive.

trx_named_savept_t, DB_NO_SAVEPOINT: Remove. We can store undo_no_t
directly in the space allocated at innobase_hton->savepoint_offset.

fts_trx_create(): Do not copy previous savepoints.

fts_savepoint_rollback(): If a savepoint was not found, roll back
everything after the default savepoint of fts_trx_create().
The test innodb_fts.savepoint is extended to cover this code.

Reviewed by: Vladislav Lesin
Tested by: Matthias Leich
2024-12-12 18:02:00 +02:00

232 lines
6 KiB
Text

--source include/galera_cluster.inc
--source include/have_innodb.inc
#
# Test case 1: Start a transaction on node_2a and kill it
# from other connection on same node
#
# Setup on node_2: register an error-log suppression for the InnoDB
# "Transaction was aborted due to" message and create t1 with a single row.
--connection node_2
call mtr.add_suppression("InnoDB: Transaction was aborted due to ");
CREATE TABLE t1(a int not null primary key auto_increment,b int) engine=InnoDB;
insert into t1 values (NULL,1);
# Open a second connection (node_2a) to the same server port as node_2
# and leave a transaction open there after updating t1.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
call mtr.add_suppression("InnoDB: Transaction was aborted due to ");
begin;
update t1 set a = 5;
# Back on node_2: wait until exactly one root session is reported in the
# 'Sleep' state (presumably the idle node_2a connection - confirm), then
# capture its processlist ID for the KILL below.
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1
--source include/wait_condition.inc
--let $k_thread = `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1`
# The query log is disabled around the KILL; presumably because the
# thread ID differs between runs and must not end up in the result file.
--disable_query_log
--eval KILL $k_thread
--enable_query_log
# Show the table contents as seen from node_2 after the KILL.
select * from t1;
--disconnect node_2a
#
# Test case 2: Start a transaction on node_2a and use
# kill query from other connection on same node
#
# Open a connection named node_2a to node_2's port and leave a
# transaction open there after updating t1.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
begin;
update t1 set a =5;
# From node_2: wait until exactly one root session is in the 'Sleep'
# state (presumably node_2a - confirm) and capture its processlist ID.
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1
--source include/wait_condition.inc
--let $k_thread = `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1`
# This case issues KILL QUERY rather than a plain KILL. The query log is
# disabled around it; presumably to keep the varying thread ID out of
# the result file.
--disable_query_log
--eval KILL QUERY $k_thread
--enable_query_log
# Show the table contents as seen from node_2 after the KILL QUERY.
select * from t1;
--disconnect node_2a
#
# Test case 3: Start a transaction on node_2a and start a DDL on other transaction
# that will then abort node_2a transaction
#
# node_2a: open a transaction and update t1 without committing.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
begin;
update t1 set a =5, b=2;
# node_2: run DDL on the same table while node_2a's transaction is still
# open. The index is added and then dropped again, so the table
# definition ends up unchanged.
--connection node_2
ALTER TABLE t1 ADD UNIQUE KEY b1(b);
ALTER TABLE t1 DROP KEY b1;
# Show the table contents as seen from node_2 after the DDL.
select * from t1;
--disconnect node_2a
#
# Test case 4: Start a transaction on node_2a and conflicting transaction on node_2b
# and start a DDL on other transaction that will then abort node_2a and node_2b
# transactions
#
# node_2a: open a transaction and update t1 without committing.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
begin;
update t1 set a =5, b=2;
# node_2b: open a second transaction and send an UPDATE on the same table
# asynchronously, so the test script does not wait for its result.
# NOTE(review): no reap is issued for this send before node_2b is
# disconnected below - confirm that this is intentional.
--connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2b
begin;
send update t1 set a =6, b=7;
# node_2: run DDL on t1 while both transactions are outstanding. The
# index is added and then dropped again.
--connection node_2
ALTER TABLE t1 ADD UNIQUE KEY b2(b);
ALTER TABLE t1 DROP KEY b2;
# Show the table contents as seen from node_2 after the DDL.
select * from t1;
--disconnect node_2a
--disconnect node_2b
#
# Test case 5: Start a transaction on node_2a with wsrep disabled.
# A conflicting DDL on other transaction can't BF abort
# transaction from node_2a (wsrep disabled).
#
# Two extra connections to node_2's port: node_2a runs the transaction,
# node_2b is used only to observe the processlist.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2
# node_2a: turn wsrep off for this session, then open a transaction and
# update t1 without committing.
--connection node_2a
SET SESSION wsrep_on=OFF;
begin;
update t1 set a =5, b=2;
# node_2: send the conflicting ALTER asynchronously; its result is
# collected by the reap near the end of this test case.
--connection node_2
--send ALTER TABLE t1 ADD UNIQUE KEY b3(b)
# node_2b: wait until exactly one session is reported in the state
# 'Waiting for table metadata lock' (presumably the ALTER sent above).
# wsrep_sync_wait is set to 0 first; presumably so that the processlist
# query itself does not wait on replication - confirm.
--connection node_2b
SET SESSION wsrep_sync_wait=0;
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE = 'Waiting for table metadata lock';
--source include/wait_condition.inc
--connection node_2a
select * from t1;
# We expect that ALTER should not be able to BF abort
# this transaction, it must wait for it to finish.
# Expect commit to succeed.
commit;
# node_2: collect the result of the ALTER that was sent earlier.
--connection node_2
--reap
--disconnect node_2a
--disconnect node_2b
#
# Test case 6: Start a transaction on node_2a with wsrep disabled
# and kill it from other connection on same node.
#
# node_2a: turn wsrep off for this session, then open a transaction and
# update t1 without committing.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
SET SESSION wsrep_on=OFF;
begin;
update t1 set a =5, b=2;
# node_2: wait until exactly one root session is in the 'Sleep' state
# (presumably node_2a - confirm) and capture its processlist ID.
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1
--source include/wait_condition.inc
--let $k_thread = `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1`
# The query log is disabled around the KILL; presumably to keep the
# varying thread ID out of the result file.
--disable_query_log
--eval KILL $k_thread
--enable_query_log
# Show the table contents as seen from node_2 after the KILL.
select * from t1;
--disconnect node_2a
# Test case 7 runs only when $have_debug is set; the variable is tested
# right after sourcing maybe_debug.inc (presumably that include sets it).
--connection node_1
source include/maybe_debug.inc;
if ($have_debug) {
#
# Test case 7: Start a transaction on node_2 and use KILL to abort
# a query in connection node_2a
# During the KILL execution replicate conflicting transaction from node_1
# to BF abort the transaction executing the KILL
#
# node_2a: reset t1 to the single row (7,0); no transaction is left open
# on this connection.
--connect node_2a, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2a
truncate t1;
insert into t1 values (7,0);
--connection node_2
set wsrep_sync_wait=0;
# get the ID of connection to be later killed
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1
--source include/wait_condition.inc
--let $k_thread = `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND COMMAND = 'Sleep' LIMIT 1`
# start a transaction
begin;
update t1 set b=2 where a=7;
# set sync point for incoming applying
--connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2
set wsrep_sync_wait=0;
SET GLOBAL debug_dbug = "d,sync.wsrep_apply_cb";
# replicate conflicting transaction, should stop in the sync point
--connection node_1
update t1 set b=1 where a=7;
# wait for the applier to reach the sync point
--connection node_2b
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
# issue KILL inside the transaction, implicit commit is expected
# (sent asynchronously; the result is collected by the reap below)
--connection node_2
--disable_query_log
--send_eval KILL QUERY $k_thread
--enable_query_log
# wait for the KILL processing to be seen in processlist
--connection node_2b
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE USER = 'root' AND INFO LIKE 'KILL QUERY%'
--source include/wait_condition.inc
# resume applying, BF abort should follow
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
# collect the result of the KILL QUERY sent from node_2; it is expected
# to fail with ER_LOCK_DEADLOCK
--connection node_2
--error ER_LOCK_DEADLOCK
--reap
commit;
select * from t1;
# reset the debug sync state and clear the debug_dbug setting made above
--connection node_2a
SET DEBUG_SYNC= 'RESET';
SET GLOBAL debug_dbug = "";
--disconnect node_2a
--disconnect node_2b
--connection node_1
}
# Final cleanup: drop the table used by all test cases.
drop table t1;