MDEV-32974 : Member fails to join due to old seqno in GTID

Before MDEV-15158, wsrep xid information was stored in only one place:
in the TRX_SYS page. Starting with 10.3, it is not stored there but
in the rollback segment header pages, and the latest one is what
matters. MDEV-19229 allows the undo tablespaces to be rebuilt when
innodb_undo_tablespaces is changed on startup. Previously it was not
possible to change that parameter.

As a result of these changes, the rollback segment header pages could
contain several stored wsrep xids. When the undo tablespaces were
rebuilt, there was an attempt to restore the wsrep xid to the rollback
segment header page, but because there were several of them, the latest
wsrep xid was overwritten with an older one.

trx_rseg_read_wsrep_checkpoint
trx_rseg_init_wsrep_xid
	Return true if the xid that was read is a wsrep xid, false otherwise

trx_rseg_mem_restore
	Try to read the wsrep xid from the rollback segment header and,
	if one is found, copy it to trx_sys.recovered_wsrep_xid only when
	its seqno is larger than that of the currently recovered xid.
This commit is contained in:
Jan Lindström 2024-03-12 11:47:47 +02:00
parent 5faf2fdc3b
commit cac0fc97cc
5 changed files with 71 additions and 10 deletions

View file

@ -1,6 +1,6 @@
--- r/galera_sst_mariabackup.result
+++ r/galera_sst_mariabackup.reject
@@ -516,5 +516,189 @@
--- galera/r/galera_sst_mariabackup.result 2024-04-11 09:53:12.950512316 +0300
+++ galera/r/galera_sst_mariabackup,debug.reject 2024-04-11 10:00:36.771144955 +0300
@@ -524,6 +524,190 @@
1
DROP TABLE t1;
COMMIT;
@ -188,5 +188,6 @@
+DROP TABLE t1;
+COMMIT;
+SET GLOBAL debug_dbug = $debug_orig;
disconnect node_2;
disconnect node_1;
connection node_2;
Shutting down server ...
connection node_1;

View file

@ -2,6 +2,14 @@ connection node_2;
connection node_1;
connection node_1;
connection node_2;
connection node_1;
select @@innodb_undo_tablespaces;
@@innodb_undo_tablespaces
0
connection node_2;
select @@innodb_undo_tablespaces;
@@innodb_undo_tablespaces
3
Performing State Transfer on a server that has been shut down cleanly and restarted
connection node_1;
CREATE TABLE t1 (id int not null primary key,f1 CHAR(255)) ENGINE=InnoDB;
@ -516,5 +524,17 @@ COUNT(*) = 0
1
DROP TABLE t1;
COMMIT;
connection node_2;
Shutting down server ...
connection node_1;
connection node_2;
Starting server ...
Using --wsrep-start-position when starting mysqld ...
connection node_1;
connection node_2;
select @@innodb_undo_tablespaces;
@@innodb_undo_tablespaces
3
call mtr.add_suppression("InnoDB: Cannot change innodb_undo_tablespaces=3 because previous shutdown was not with innodb_fast_shutdown=0");
disconnect node_2;
disconnect node_1;

View file

@ -7,9 +7,14 @@ wsrep_debug=1
[mysqld.1]
wsrep_provider_options='base_port=@mysqld.1.#galera_port;gcache.size=1;pc.ignore_sb=true'
innodb_fast_shutdown=0
innodb_undo_tablespaces=0
[mysqld.2]
wsrep_provider_options='base_port=@mysqld.2.#galera_port;gcache.size=1;pc.ignore_sb=true'
innodb_fast_shutdown=0
innodb_undo_tablespaces=3
innodb_log_file_buffering
[sst]
transferfmt=@ENV.MTR_GALERA_TFMT

View file

@ -8,12 +8,39 @@
--let $node_2=node_2
--source include/auto_increment_offset_save.inc
--connection node_1
select @@innodb_undo_tablespaces;
--connection node_2
select @@innodb_undo_tablespaces;
--source suite/galera/include/galera_st_shutdown_slave.inc
--source suite/galera/include/galera_st_clean_slave.inc
--source suite/galera/include/galera_st_kill_slave.inc
--source suite/galera/include/galera_st_kill_slave_ddl.inc
--connection node_2
--echo Shutting down server ...
--source include/shutdown_mysqld.inc
--connection node_1
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc
--connection node_2
--echo Starting server ...
--source include/start_mysqld.inc
--connection node_1
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size'
--source include/wait_condition.inc
--connection node_2
select @@innodb_undo_tablespaces;
call mtr.add_suppression("InnoDB: Cannot change innodb_undo_tablespaces=3 because previous shutdown was not with innodb_fast_shutdown=0");
# Restore original auto_increment_offset values.
--source include/auto_increment_offset_restore.inc

View file

@ -201,7 +201,7 @@ bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
+ rseg_header->page.frame, XIDDATASIZE);
return true;
return wsrep_is_wsrep_xid(&xid);
}
/** Read the WSREP XID from the TRX_SYS page (in case of upgrade).
@ -237,7 +237,8 @@ static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
memcpy(xid.data,
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
return true;
return wsrep_is_wsrep_xid(&xid);
}
/** Recover the latest WSREP checkpoint XID.
@ -498,10 +499,17 @@ static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
trx_sys.recovered_binlog_offset= binlog_offset;
trx_sys.recovered_binlog_is_legacy_pos= false;
}
#ifdef WITH_WSREP
trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid);
#endif
}
#ifdef WITH_WSREP
XID tmp_xid;
tmp_xid.null();
/* Update recovered wsrep xid only if we found wsrep xid from
rseg header page and read xid seqno is larger than currently
recovered xid seqno. */
if (trx_rseg_read_wsrep_checkpoint(rseg_hdr, tmp_xid) &&
wsrep_xid_seqno(&tmp_xid) > wsrep_xid_seqno(&trx_sys.recovered_wsrep_xid))
trx_sys.recovered_wsrep_xid.set(&tmp_xid);
#endif
}
if (srv_operation == SRV_OPERATION_RESTORE)