MDEV-17458 Unable to start galera node

Bootstrapping a new cluster from a backup created from a MariaDB
version prior to 10.3.5 may result in error "SST position can't be
set in past" when attempting to join additional nodes.
The problem stems from the fact that, when reading the wsrep position
from InnoDB, the position is looked up in two places:
the TRX_SYS page, where versions prior to 10.3.5 used to store
the WSREP position; and the rollback segments, where newer versions
store it.
When starting a new cluster, the starting seqno is 0 and a new cluster
UUID is generated. This is persisted in rollback segments, but the old
UUID and seqno are not cleared from TRX_SYS page.
Subsequently, when reading back the position,
trx_rseg_read_wsrep_checkpoint() is going to return the maximum seqno
found in both TRX_SYS page and rollback segments. So in the case of a
newly bootstrapped cluster, it's always going to return the old
cluster information.
The fix consists of changing trx_rseg_read_wsrep_checkpoint() so that
only rollback segments are looked up. On startup, position is read
from the TRX_SYS page, and if present, it is copied to rollback
segments (unless a newer position is already present in the rollback
segments).
Finally the position stored in TRX_SYS page is cleared.
This commit is contained in:
Daniele Sciascia 2019-05-09 09:36:43 +02:00 committed by Marko Mäkelä
parent d0ef948d70
commit 592dc59d7a

View file

@ -43,31 +43,16 @@ static long long wsrep_seqno = -1;
/** The latest known WSREP XID UUID */
static unsigned char wsrep_uuid[16];
/** Update the WSREP XID information in rollback segment header.
/** Write the WSREP XID information into rollback segment header.
@param[in,out] rseg_header rollback segment header
@param[in] xid WSREP XID
@param[in,out] mtr mini-transaction */
void
trx_rseg_update_wsrep_checkpoint(
@param[in,out] mtr mini transaction */
static void
trx_rseg_write_wsrep_checkpoint(
trx_rsegf_t* rseg_header,
const XID* xid,
mtr_t* mtr)
{
ut_ad(wsrep_is_wsrep_xid(xid));
#ifdef UNIV_DEBUG
/* Check that seqno is monotonically increasing */
long long xid_seqno = wsrep_xid_seqno(xid);
const byte* xid_uuid = wsrep_xid_uuid(xid);
if (!memcmp(xid_uuid, wsrep_uuid, sizeof wsrep_uuid)) {
ut_ad(xid_seqno > wsrep_seqno);
} else {
memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
}
wsrep_seqno = xid_seqno;
#endif /* UNIV_DEBUG */
mlog_write_ulint(TRX_RSEG_WSREP_XID_FORMAT + rseg_header,
uint32_t(xid->formatID),
MLOG_4BYTES, mtr);
@ -85,6 +70,83 @@ trx_rseg_update_wsrep_checkpoint(
XIDDATASIZE, mtr);
}
/** Store a WSREP XID in a rollback segment header.
In debug builds this additionally tracks the latest seen UUID and seqno
in wsrep_uuid / wsrep_seqno and asserts that, for an unchanged UUID,
the seqno only ever grows.
@param[in,out]	rseg_header	rollback segment header
@param[in]	xid		WSREP XID
@param[in,out]	mtr		mini-transaction */
void
trx_rseg_update_wsrep_checkpoint(
	trx_rsegf_t*	rseg_header,
	const XID*	xid,
	mtr_t*		mtr)
{
	ut_ad(wsrep_is_wsrep_xid(xid));

#ifdef UNIV_DEBUG
	const long long	new_seqno = wsrep_xid_seqno(xid);
	const byte*	new_uuid = wsrep_xid_uuid(xid);

	/* The monotonicity check only applies when the seqno is defined
	(not -1) and the UUID matches the one seen last. */
	const bool	same_uuid_with_seqno = new_seqno != -1
		&& memcmp(new_uuid, wsrep_uuid, sizeof wsrep_uuid) == 0;

	if (same_uuid_with_seqno) {
		ut_ad(new_seqno > wsrep_seqno);
	} else {
		/* Different UUID, or undefined seqno: remember the
		UUID carried by this XID. */
		memcpy(wsrep_uuid, new_uuid, sizeof wsrep_uuid);
	}

	wsrep_seqno = new_seqno;
#endif /* UNIV_DEBUG */

	trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
}
/** Clear the WSREP XID information from a rollback segment header
by writing zero to the 4-byte TRX_RSEG_WSREP_XID_FORMAT field.
@param[in,out]	rseg_header	rollback segment header
@param[in,out]	mtr		mini-transaction */
static void
trx_rseg_clear_wsrep_checkpoint(trx_rsegf_t* rseg_header, mtr_t* mtr)
{
	trx_rsegf_t* const	xid_format_field
		= rseg_header + TRX_RSEG_WSREP_XID_FORMAT;

	mlog_write_ulint(xid_format_field, 0, MLOG_4BYTES, mtr);
}
/** Update the WSREP XID in the first rollback segment header and, if
the UUID part of the XID differs from the last known wsrep_uuid, clear
the WSREP XID from all other rollback segment headers.
@param[in]	xid	WSREP XID
@param[in,out]	mtr	mini-transaction */
static void
trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
{
	const byte* xid_uuid = wsrep_xid_uuid(xid);
	/* The comparison against wsrep_uuid must be made here, before
	calling the header-level trx_rseg_update_wsrep_checkpoint():
	in debug builds that call overwrites wsrep_uuid with the UUID
	of xid, after which memcmp() could never report a difference. */
	const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid,
					     sizeof wsrep_uuid);
	const trx_rseg_t* rseg = trx_sys.rseg_array[0];

	trx_rsegf_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
						 mtr);
	/* A nonzero TRX_RSEG_FORMAT field triggers a format upgrade of
	the header before the XID is written. */
	if (UNIV_UNLIKELY(mach_read_from_4(rseg_header + TRX_RSEG_FORMAT))) {
		trx_rseg_format_upgrade(rseg_header, mtr);
	}

	trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);

	if (must_clear_rsegs) {
		/* Remember the new UUID in every build type. The
		header-level update refreshes wsrep_uuid only under
		UNIV_DEBUG; without this copy a non-debug build would
		keep detecting a mismatch and repeat the clearing below
		on every call. The copy is made after the header-level
		update so that its debug-only seqno check still compares
		against the previously known UUID. */
		memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);

		/* Because the UUID part of the WSREP XID differed
		from wsrep_uuid, the WSREP group UUID was
		changed, and we must reset the XID in all other
		rollback segment headers. */
		for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
			if (const trx_rseg_t* rseg =
			    trx_sys.rseg_array[rseg_id]) {
				trx_rseg_clear_wsrep_checkpoint(
					trx_rsegf_get(rseg->space,
						      rseg->page_no, mtr),
					mtr);
			}
		}
	}
}
/** Update WSREP checkpoint XID in first rollback segment header
as part of wsrep_set_SE_checkpoint() when it is guaranteed that there
are no wsrep transactions committing.
@ -96,36 +158,7 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid)
{
mtr_t mtr;
mtr.start();
const trx_rseg_t* rseg = trx_sys.rseg_array[0];
trx_rsegf_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
&mtr);
if (UNIV_UNLIKELY(mach_read_from_4(rseg_header + TRX_RSEG_FORMAT))) {
trx_rseg_format_upgrade(rseg_header, &mtr);
}
trx_rseg_update_wsrep_checkpoint(rseg_header, xid, &mtr);
const byte* xid_uuid = wsrep_xid_uuid(xid);
if (memcmp(wsrep_uuid, xid_uuid, sizeof wsrep_uuid)) {
memcpy(wsrep_uuid, xid_uuid, sizeof wsrep_uuid);
/* Because the UUID part of the WSREP XID differed
from current_xid_uuid, the WSREP group UUID was
changed, and we must reset the XID in all rollback
segment headers. */
for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
if (const trx_rseg_t* rseg =
trx_sys.rseg_array[rseg_id]) {
trx_rseg_update_wsrep_checkpoint(
trx_rsegf_get(rseg->space,
rseg->page_no, &mtr),
xid, &mtr);
}
}
}
trx_rseg_update_wsrep_checkpoint(xid, &mtr);
mtr.commit();
}
@ -201,16 +234,6 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
rseg_id++, mtr.commit()) {
mtr.start();
const buf_block_t* sys = trx_sysf_get(&mtr, false);
if (rseg_id == 0) {
found = trx_rseg_init_wsrep_xid(sys->frame, xid);
ut_ad(!found || xid.formatID == 1);
if (found) {
max_xid_seqno = wsrep_xid_seqno(&xid);
memcpy(wsrep_uuid, wsrep_xid_uuid(&xid),
sizeof wsrep_uuid);
}
}
const uint32_t page_no = trx_sysf_rseg_get_page_no(
sys, rseg_id);
@ -542,6 +565,9 @@ trx_rseg_array_init()
trx_sys.recovered_binlog_offset = 0;
#ifdef WITH_WSREP
trx_sys.recovered_wsrep_xid.null();
XID wsrep_sys_xid;
wsrep_sys_xid.null();
bool wsrep_xid_in_rseg_found = false;
#endif
for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
@ -556,6 +582,9 @@ trx_rseg_array_init()
TRX_SYS + TRX_SYS_TRX_ID_STORE
+ sys->frame);
trx_rseg_init_binlog_info(sys->frame);
#ifdef WITH_WSREP
wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
#endif
}
const uint32_t page_no = trx_sysf_rseg_get_page_no(
@ -571,12 +600,49 @@ trx_rseg_array_init()
ut_ad(!trx_sys.rseg_array[rseg_id]);
trx_sys.rseg_array[rseg_id] = rseg;
trx_rseg_mem_restore(rseg, max_trx_id, &mtr);
#ifdef WITH_WSREP
if (!wsrep_sys_xid.is_null() &&
!wsrep_sys_xid.eq(&trx_sys.recovered_wsrep_xid)) {
wsrep_xid_in_rseg_found = true;
ut_ad(memcmp(wsrep_xid_uuid(&wsrep_sys_xid),
wsrep_xid_uuid(&trx_sys.recovered_wsrep_xid),
sizeof wsrep_uuid)
|| wsrep_xid_seqno(
&wsrep_sys_xid)
<= wsrep_xid_seqno(
&trx_sys.recovered_wsrep_xid));
}
#endif
}
}
mtr.commit();
}
#ifdef WITH_WSREP
if (!wsrep_sys_xid.is_null()) {
/* Upgrade from a version prior to 10.3.5,
where WSREP XID was stored in TRX_SYS page.
If no rollback segment has a WSREP XID set,
we must copy the XID found in TRX_SYS page
to rollback segments. */
mtr_t mtr;
mtr.start();
if (!wsrep_xid_in_rseg_found) {
trx_rseg_update_wsrep_checkpoint(&wsrep_sys_xid, &mtr);
}
/* Finally, clear WSREP XID in TRX_SYS page. */
const buf_block_t* sys = trx_sysf_get(&mtr);
mlog_write_ulint(TRX_SYS + TRX_SYS_WSREP_XID_INFO +
+ TRX_SYS_WSREP_XID_MAGIC_N_FLD + sys->frame,
0, MLOG_4BYTES, &mtr);
mtr.commit();
}
#endif
trx_sys.init_max_trx_id(max_trx_id + 1);
}