MDEV-15443 Properly read wsrep XID and binlog position from rollback segment headers

The problem is a regression caused by MDEV-15158.
If some transactions were committed with wsrep_on=0, a
rollback segment header having the highest trx_id assigned might
store undefined wsrep XID. When reading the wsrep checkpoint
from InnoDB, the undefined wsrep XID might be returned instead
of the highest valid one.

Similarly, if the binary log is intermittently disabled or enabled
while InnoDB transactions are being committed, the latest updated
rollback segment header page might not contain the latest binlog metadata.

Therefore, the MDEV-15158 logic to rely on TRX_RSEG_MAX_TRX_ID for
determining the most recent WSREP XID or binlog position is invalid.
We must choose the maximum entries among the rollback segment header
pages.

This fix is based on code submitted by Teemu Ollakka from Codership
and by Thirunarayanan Balathandayuthapani from MariaDB Corporation.

trx_purge_add_undo_to_history(): Only write TRX_RSEG_MAX_TRX_ID
in the cases where it was written before MDEV-15158.

wsrep_seqno: Renamed from trx_sys_cur_xid_seqno.

wsrep_uuid: Renamed from trx_sys_cur_xid_uuid, and enabled in non-debug
builds.

read_wsrep_xid_uuid(): Make non-debug, and remove the memcpy().

trx_rseg_update_wsrep_checkpoint(): Correctly compare and copy
the entire UUID in the debug check. In case of UUID mismatch,
write the WSREP XID to all 128 rollback segment headers in
a single mini-transaction.

trx_rseg_read_wsrep_checkpoint(rseg_header, xid): Make static.
In case the information is absent, do not overwrite xid.

trx_rseg_read_wsrep_checkpoint(xid): Determine the maximum
WSREP XID.

trx_rseg_mem_restore(): Remove the parameter max_rseg_trx_id.
Determine the latest binlog file and position by comparing
file names and offsets. Declare trx_sys.recovered_binlog_offset
as an unsigned type.
This commit is contained in:
Marko Mäkelä 2018-03-06 23:29:38 +02:00
parent d70573564c
commit 67f6d40bd9
7 changed files with 195 additions and 84 deletions

View file

@ -0,0 +1,15 @@
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1);
connection node_2;
SELECT * FROM t1;
f1
1
SET GLOBAL wsrep_cluster_address='';
SET SESSION wsrep_on=0;
INSERT INTO t1 VALUES (2);
DELETE FROM t1 WHERE f1 = 2;
connection node_1;
INSERT INTO t1 VALUES (2);
connection node_2;
connection node_1;
DROP TABLE t1;

View file

@ -0,0 +1,5 @@
!include ../galera_2nodes.cnf
[mysqld.1]
wsrep_auto_increment_control=OFF
[mysqld.2]
wsrep_auto_increment_control=OFF

View file

@ -0,0 +1,53 @@
#
# MDEV-15443
#
# If transactions are executed into InnoDB without wsrep_on,
# rseg header trx_id gets incremented and the rseg header
# corresponding to maximum trx_id may store undefined wsrep XID.
# When the wsrep XID is read from the storage engine,
# an undefined XID may be returned instead of the valid one.
#
# This test demonstrates the problem by taking a node_2 out
# of the cluster and writing and deleting a row with
# wsrep_on=0. When the bug is present, node_2 will fail to
# rejoin the cluster because an invalid XID is read from the
# storage engine after startup/recovery.
#
--source include/have_innodb.inc
--source include/galera_cluster.inc
# Initialize table on node_1
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1);
# Go to node_2, verify that the previous INSERT completed.
# Take node_2 out of the cluster, insert and delete a record
# on a table with wsrep_on=0.
--connection node_2
SELECT * FROM t1;
SET GLOBAL wsrep_cluster_address='';
SET SESSION wsrep_on=0;
INSERT INTO t1 VALUES (2);
DELETE FROM t1 WHERE f1 = 2;
# Shutdown node_2
--source include/shutdown_mysqld.inc
# On node_1, verify that the node has left the cluster.
--connection node_1
--let $wait_condition = SELECT VARIABLE_VALUE = 1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size';
--source include/wait_condition.inc
# Insert into t1 to enforce IST on node_2 when it is restarted.
INSERT INTO t1 VALUES (2);
# Restart node_2
--connection node_2
--source include/start_mysqld.inc
--let $wait_condition = SELECT VARIABLE_VALUE = 2 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cluster_size';
--source include/wait_condition.inc
--connection node_1
--source include/wait_condition.inc
DROP TABLE t1;

View file

@ -281,16 +281,15 @@ trx_rseg_update_wsrep_checkpoint(
const XID* xid,
mtr_t* mtr);
/** Update WSREP checkpoint XID in first rollback segment header.
/** Update WSREP checkpoint XID in first rollback segment header
as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
are no wsrep transactions committing.
If the UUID part of the WSREP XID does not match the UUIDs of XIDs already
stored into rollback segments, the WSREP XID in all the remaining rollback
segments will be reset.
@param[in] xid WSREP XID */
void trx_rseg_update_wsrep_checkpoint(const XID* xid);
/** Read the WSREP XID information in rollback segment header.
@param[in] rseg_header Rollback segment header
@param[out] xid Transaction XID
@return whether the WSREP XID was present */
bool trx_rseg_read_wsrep_checkpoint(const trx_rsegf_t* rseg_header, XID& xid);
/** Recover the latest WSREP checkpoint XID.
@param[out] xid WSREP XID
@return whether the WSREP XID was found */

View file

@ -849,7 +849,7 @@ public:
XID recovered_wsrep_xid;
#endif
/** Latest recovered binlog offset */
int64_t recovered_binlog_offset;
uint64_t recovered_binlog_offset;
/** Latest recovered binlog file name */
char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];

View file

@ -254,12 +254,10 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
mlog_write_ulint(
rseg_header + TRX_RSEG_HISTORY_SIZE,
hist_size + undo->size, MLOG_4BYTES, mtr);
}
/* This field now also serves as an identifier for the latest
binlog and WSREP XID information. */
mlog_write_ull(rseg_header + TRX_RSEG_MAX_TRX_ID,
trx_sys.get_max_trx_id(), mtr);
mlog_write_ull(rseg_header + TRX_RSEG_MAX_TRX_ID,
trx_sys.get_max_trx_id(), mtr);
}
/* Before any transaction-generating background threads or the
purge have been started, recv_recovery_rollback_active() can

View file

@ -37,8 +37,21 @@ Created 3/26/1996 Heikki Tuuri
#ifdef WITH_WSREP
#ifdef UNIV_DEBUG
static long long trx_sys_cur_xid_seqno = -1;
static unsigned char trx_sys_cur_xid_uuid[16];
/** The latest known WSREP XID sequence number */
static long long wsrep_seqno = -1;
#endif /* UNIV_DEBUG */
/** The latest known WSREP XID UUID */
static unsigned char wsrep_uuid[16];
/** Read WSREP XID UUID.
@param[in] xid WSREP XID
@return Pointer to the first byte of the UUID.
*/
static inline const byte* read_wsrep_xid_uuid(const XID* xid)
{
return reinterpret_cast<const byte*>(xid->data + 8);
}
/** Read WSREP XID seqno */
static inline long long read_wsrep_xid_seqno(const XID* xid)
@ -48,14 +61,6 @@ static inline long long read_wsrep_xid_seqno(const XID* xid)
return seqno;
}
/** Read WSREP XID UUID */
static inline void read_wsrep_xid_uuid(const XID* xid, unsigned char* buf)
{
memcpy(buf, xid->data + 8, 16);
}
#endif /* UNIV_DEBUG */
/** Update the WSREP XID information in rollback segment header.
@param[in,out] rseg_header rollback segment header
@param[in] xid WSREP XID
@ -70,17 +75,15 @@ trx_rseg_update_wsrep_checkpoint(
#ifdef UNIV_DEBUG
/* Check that seqno is monotonically increasing */
unsigned char xid_uuid[16];
long long xid_seqno = read_wsrep_xid_seqno(xid);
read_wsrep_xid_uuid(xid, xid_uuid);
const byte* xid_uuid = read_wsrep_xid_uuid(xid);
if (!memcmp(xid_uuid, trx_sys_cur_xid_uuid, 8)) {
ut_ad(xid_seqno > trx_sys_cur_xid_seqno);
trx_sys_cur_xid_seqno = xid_seqno;
if (!memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) {
ut_ad(xid_seqno > wsrep_seqno);
} else {
memcpy(trx_sys_cur_xid_uuid, xid_uuid, 16);
memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
}
trx_sys_cur_xid_seqno = xid_seqno;
wsrep_seqno = xid_seqno;
#endif /* UNIV_DEBUG */
mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header,
@ -100,7 +103,12 @@ trx_rseg_update_wsrep_checkpoint(
XIDDATASIZE, mtr);
}
/** Update WSREP checkpoint XID in first rollback segment header.
/** Update WSREP checkpoint XID in first rollback segment header
as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
are no wsrep transactions committing.
If the UUID part of the WSREP XID does not match the UUIDs of XIDs already
stored into rollback segments, the WSREP XID in all the remaining rollback
segments will be reset.
@param[in] xid WSREP XID */
void trx_rseg_update_wsrep_checkpoint(const XID* xid)
{
@ -115,9 +123,27 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid)
trx_rseg_format_upgrade(rseg_header, &mtr);
}
mlog_write_ull(rseg_header + TRX_RSEG_MAX_TRX_ID,
trx_sys.get_max_trx_id(), &mtr);
trx_rseg_update_wsrep_checkpoint(rseg_header, xid, &mtr);
const byte* xid_uuid = read_wsrep_xid_uuid(xid);
if (memcmp(wsrep_uuid, xid_uuid, sizeof wsrep_uuid)) {
memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
/* Because the UUID part of the WSREP XID differed
from wsrep_uuid, the WSREP group UUID was
changed, and we must reset the XID in all rollback
segment headers. */
for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
if (const trx_rseg_t* rseg =
trx_sys.rseg_array[rseg_id]) {
trx_rseg_update_wsrep_checkpoint(
trx_rsegf_get(rseg->space,
rseg->page_no, &mtr),
xid, &mtr);
}
}
}
mtr.commit();
}
@ -125,24 +151,24 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid)
@param[in] rseg_header Rollback segment header
@param[out] xid Transaction XID
@return whether the WSREP XID was present */
static
bool trx_rseg_read_wsrep_checkpoint(const trx_rsegf_t* rseg_header, XID& xid)
{
xid.formatID = (int)mach_read_from_4(
TRX_RSEG_WSREP_XID_FORMAT + rseg_header);
if (xid.formatID == 0) {
memset(&xid, 0, sizeof(xid));
long long seqno= -1;
memcpy(xid.data + 24, &seqno, sizeof(long long));
xid.formatID = -1;
int formatID = static_cast<int>(
mach_read_from_4(
TRX_RSEG_WSREP_XID_FORMAT + rseg_header));
if (formatID == 0) {
return false;
}
xid.gtrid_length = (int)mach_read_from_4(
TRX_RSEG_WSREP_XID_GTRID_LEN + rseg_header);
xid.formatID = formatID;
xid.gtrid_length = static_cast<int>(
mach_read_from_4(
TRX_RSEG_WSREP_XID_GTRID_LEN + rseg_header));
xid.bqual_length = (int)mach_read_from_4(
TRX_RSEG_WSREP_XID_BQUAL_LEN + rseg_header);
xid.bqual_length = static_cast<int>(
mach_read_from_4(
TRX_RSEG_WSREP_XID_BQUAL_LEN + rseg_header));
memcpy(xid.data, TRX_RSEG_WSREP_XID_DATA + rseg_header, XIDDATASIZE);
@ -162,15 +188,18 @@ static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
return false;
}
xid.formatID = (int)mach_read_from_4(
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_FORMAT + page);
xid.gtrid_length = (int)mach_read_from_4(
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_GTRID_LEN + page);
xid.bqual_length = (int)mach_read_from_4(
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_BQUAL_LEN + page);
xid.formatID = static_cast<int>(
mach_read_from_4(
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_FORMAT + page));
xid.gtrid_length = static_cast<int>(
mach_read_from_4(
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_GTRID_LEN + page));
xid.bqual_length = static_cast<int>(
mach_read_from_4(
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_BQUAL_LEN + page));
memcpy(xid.data,
TRX_SYS + TRX_SYS_WSREP_XID_INFO
+ TRX_SYS_WSREP_XID_DATA + page, XIDDATASIZE);
@ -183,7 +212,7 @@ static bool trx_rseg_init_wsrep_xid(const page_t* page, XID& xid)
bool trx_rseg_read_wsrep_checkpoint(XID& xid)
{
mtr_t mtr;
trx_id_t max_id = 0;
long long max_xid_seqno = -1;
bool found = false;
for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS;
@ -192,6 +221,7 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
const buf_block_t* sys = trx_sysf_get(&mtr, false);
if (rseg_id == 0) {
found = trx_rseg_init_wsrep_xid(sys->frame, xid);
ut_ad(!found || xid.formatID == 1);
}
const uint32_t page_no = trx_sysf_rseg_get_page_no(
@ -208,16 +238,17 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
continue;
}
trx_id_t id = mach_read_from_8(rseg_header
+ TRX_RSEG_MAX_TRX_ID);
if (id < max_id) {
continue;
XID tmp_xid;
long long tmp_seqno = 0;
if (trx_rseg_read_wsrep_checkpoint(rseg_header, tmp_xid)
&& (tmp_seqno = read_wsrep_xid_seqno(&tmp_xid))
> max_xid_seqno) {
found = true;
max_xid_seqno = tmp_seqno;
xid = tmp_xid;
memcpy(wsrep_uuid, read_wsrep_xid_uuid(&tmp_xid),
sizeof wsrep_uuid);
}
max_id = id;
found = trx_rseg_read_wsrep_checkpoint(rseg_header, xid)
|| found;
}
return found;
@ -400,15 +431,10 @@ trx_undo_lists_init(trx_rseg_t* rseg, trx_id_t& max_trx_id,
/** Restore the state of a persistent rollback segment.
@param[in,out] rseg persistent rollback segment
@param[in,out] max_trx_id maximum observed transaction identifier
@param[in,out] max_rseg_trx_id maximum observed TRX_RSEG_MAX_TRX_ID
@param[in,out] mtr mini-transaction */
static
void
trx_rseg_mem_restore(
trx_rseg_t* rseg,
trx_id_t& max_trx_id,
trx_id_t& max_rseg_trx_id,
mtr_t* mtr)
trx_rseg_mem_restore(trx_rseg_t* rseg, trx_id_t& max_trx_id, mtr_t* mtr)
{
trx_rsegf_t* rseg_header = trx_rsegf_get_new(
rseg->space, rseg->page_no, mtr);
@ -421,16 +447,33 @@ trx_rseg_mem_restore(
max_trx_id = id;
}
if (id > max_rseg_trx_id) {
max_rseg_trx_id = id;
if (rseg_header[TRX_RSEG_BINLOG_NAME]) {
const char* binlog_name = reinterpret_cast<const char*>
(rseg_header) + TRX_RSEG_BINLOG_NAME;
compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof
trx_sys.recovered_binlog_filename);
if (rseg_header[TRX_RSEG_BINLOG_NAME]) {
memcpy(trx_sys.recovered_binlog_filename,
rseg_header + TRX_RSEG_BINLOG_NAME,
TRX_RSEG_BINLOG_NAME_LEN);
trx_sys.recovered_binlog_offset = mach_read_from_8(
rseg_header
+ TRX_RSEG_BINLOG_OFFSET);
int cmp = *trx_sys.recovered_binlog_filename
? strncmp(binlog_name,
trx_sys.recovered_binlog_filename,
TRX_RSEG_BINLOG_NAME_LEN)
: 1;
if (cmp >= 0) {
uint64_t binlog_offset = mach_read_from_8(
rseg_header + TRX_RSEG_BINLOG_OFFSET);
if (cmp) {
memcpy(trx_sys.
recovered_binlog_filename,
binlog_name,
TRX_RSEG_BINLOG_NAME_LEN);
trx_sys.recovered_binlog_offset
= binlog_offset;
} else if (binlog_offset >
trx_sys.recovered_binlog_offset) {
trx_sys.recovered_binlog_offset
= binlog_offset;
}
}
#ifdef WITH_WSREP
@ -513,10 +556,10 @@ static void trx_rseg_init_binlog_info(const page_t* page)
void
trx_rseg_array_init()
{
trx_id_t max_trx_id = 0, max_rseg_trx_id = 0;
trx_id_t max_trx_id = 0;
*trx_sys.recovered_binlog_filename = '\0';
trx_sys.recovered_binlog_offset = -1;
trx_sys.recovered_binlog_offset = 0;
#ifdef WITH_WSREP
memset(&trx_sys.recovered_wsrep_xid, 0,
sizeof trx_sys.recovered_wsrep_xid);
@ -548,9 +591,7 @@ trx_rseg_array_init()
ut_ad(rseg->id == rseg_id);
ut_ad(!trx_sys.rseg_array[rseg_id]);
trx_sys.rseg_array[rseg_id] = rseg;
trx_rseg_mem_restore(
rseg, max_trx_id, max_rseg_trx_id,
&mtr);
trx_rseg_mem_restore(rseg, max_trx_id, &mtr);
}
}