MDEV-11799 Doublewrite recovery can corrupt data pages

The purpose of the InnoDB doublewrite buffer is to make InnoDB
tolerant against cases where the server was killed in the middle
of a page write. (In Linux, killing a process may interrupt a
write system call, typically on a 4096-byte boundary.)

There may exist multiple copies of a page number in the doublewrite
buffer. Recovery should choose the latest valid copy of the page.
By design, the FIL_PAGE_LSN must not precede the latest checkpoint LSN
nor be later than the end of the recovered log.

For page_compressed and encrypted pages, we were missing proper
consistency checks. In the 10.4 data set generated for in MDEV-23231,
the data file contained a valid page_compressed page, and an
identical copy of that page was also present in the doublewrite
buffer. But, recovery would incorrectly consider the page invalid
and restore an uncompressed copy of the same page that had been
written before the log checkpoint. (In fact, no redo log was to
be applied to that page.)

buf_dblwr_process(): Validate the FIL_PAGE_LSN in the doublewrite
buffer pages, and always skip page 0, because those pages should
have been recovered by Datafile::restore_from_doublewrite() if
necessary.

Datafile::restore_from_doublewrite(): Choose the latest applicable
page from the doublewrite buffer.

recv_dblwr_t::find_page(): Also validate encrypted or
page_compressed pages.

recv_dblwr_t::validate_page(): New function to validate a page,
either a copy in a data file or in the doublewrite buffer.
Also validate encrypted or page_compressed pages.

This is joint work with Thirunarayanan Balathandayuthapani.
This commit is contained in:
Marko Mäkelä 2020-07-31 11:51:44 +03:00
commit 879ba1979b
4 changed files with 164 additions and 114 deletions

View file

@ -57,6 +57,7 @@ Created 9/20/1997 Heikki Tuuri
#include "srv0start.h"
#include "trx0roll.h"
#include "row0merge.h"
#include "fil0pagecompress.h"
/** Log records are stored in the hash table in chunks at most of this size;
this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */
@ -3910,6 +3911,8 @@ skip_apply:
rescan = true;
}
recv_sys->parse_start_lsn = checkpoint_lsn;
if (srv_operation == SRV_OPERATION_NORMAL) {
buf_dblwr_process();
}
@ -4084,26 +4087,91 @@ recv_recovery_rollback_active(void)
}
}
/** Find a doublewrite copy of a page.
@param[in] space_id tablespace identifier
@param[in] page_no page number
@return page frame
@retval NULL if no page was found */
const byte*
recv_dblwr_t::find_page(ulint space_id, ulint page_no)
bool recv_dblwr_t::validate_page(const page_id_t page_id,
const byte *page,
const fil_space_t *space,
byte *tmp_buf)
{
const byte *result= NULL;
if (page_id.page_no() == 0)
{
ulint flags= fsp_header_get_flags(page);
if (!fsp_flags_is_valid(flags, page_id.space()))
{
ulint cflags= fsp_flags_convert_from_101(flags);
if (cflags == ULINT_UNDEFINED)
{
ib::warn() << "Ignoring a doublewrite copy of page " << page_id
<< "due to invalid flags " << ib::hex(flags);
return false;
}
flags= cflags;
}
/* Page 0 is never page_compressed or encrypted. */
return !buf_page_is_corrupted(true, page, page_size_t(flags));
}
ut_ad(tmp_buf);
byte *tmp_frame= tmp_buf;
byte *tmp_page= tmp_buf + srv_page_size;
const uint16_t page_type= mach_read_from_2(page + FIL_PAGE_TYPE);
const page_size_t page_size(space->flags);
const bool expect_encrypted= space->crypt_data &&
space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
if (expect_encrypted &&
mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION))
{
if (!fil_space_verify_crypt_checksum(page, page_size))
return false;
if (page_type != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
return true;
if (page_size.is_compressed())
return false;
memcpy(tmp_page, page, page_size.physical());
if (!fil_space_decrypt(space, tmp_frame, tmp_page))
return false;
}
switch (page_type) {
case FIL_PAGE_PAGE_COMPRESSED:
memcpy(tmp_page, page, page_size.physical());
/* fall through */
case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED:
if (page_size.is_compressed())
return false; /* ROW_FORMAT=COMPRESSED cannot be page_compressed */
ulint decomp= fil_page_decompress(tmp_frame, tmp_page);
if (!decomp)
return false; /* decompression failed */
if (decomp == srv_page_size)
return false; /* the page was not compressed (invalid page type) */
return !buf_page_is_corrupted(true, tmp_page, page_size, space);
}
return !buf_page_is_corrupted(true, page, page_size, space);
}
byte *recv_dblwr_t::find_page(const page_id_t page_id,
const fil_space_t *space, byte *tmp_buf)
{
byte *result= NULL;
lsn_t max_lsn= 0;
for (list::const_iterator i = pages.begin(); i != pages.end(); ++i)
{
const byte *page= *i;
if (page_get_page_no(page) != page_no ||
page_get_space_id(page) != space_id)
byte *page= *i;
if (page_get_page_no(page) != page_id.page_no() ||
page_get_space_id(page) != page_id.space())
continue;
const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
if (lsn <= max_lsn)
if (lsn <= max_lsn ||
!validate_page(page_id, page, space, tmp_buf))
{
/* Mark processed for subsequent iterations in buf_dblwr_process() */
memset(page + FIL_PAGE_LSN, 0, 8);
continue;
}
max_lsn= lsn;
result= page;
}