mariadb/storage/innobase/log/log0recv.cc
Marko Mäkelä b3d02a1fcf MDEV-12353: Replace DELETE_MARK redo log records with MLOG_WRITE_STRING
btr_cur_upd_rec_sys(): Replaces row_upd_rec_sys_fields() and
implements redo logging.

row_upd_rec_sys_fields_in_recovery(): Remove, and merge to the
only remaining caller btr_cur_parse_update_in_place().

btr_cur_del_mark_set_clust_rec_log(),
btr_cur_del_mark_set_sec_rec_log(),
btr_cur_set_deleted_flag_for_ibuf():
Remove, and replace with btr_rec_set_deleted<bool>().

page_zip_rec_set_deleted(): Add the parameter mtr, and write a
MLOG_ZIP_WRITE_STRING record to the log.
2020-02-13 18:19:14 +02:00

4086 lines
110 KiB
C++

/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2013, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
*****************************************************************************/
/**************************************************//**
@file log/log0recv.cc
Recovery
Created 9/20/1997 Heikki Tuuri
*******************************************************/
#include "univ.i"
#include <map>
#include <string>
#include <my_service_manager.h>
#include "log0recv.h"
#ifdef HAVE_MY_AES_H
#include <my_aes.h>
#endif
#include "log0crypt.h"
#include "mem0mem.h"
#include "buf0buf.h"
#include "buf0flu.h"
#include "mtr0mtr.h"
#include "mtr0log.h"
#include "page0cur.h"
#include "page0zip.h"
#include "btr0btr.h"
#include "btr0cur.h"
#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "trx0rec.h"
#include "fil0fil.h"
#include "buf0rea.h"
#include "srv0srv.h"
#include "srv0start.h"
/** Read-ahead area in applying log records to file pages.
NOTE(review): presumably counted in pages; confirm at the point of use. */
#define RECV_READ_AHEAD_AREA 32U
/** The recovery system (the single instance of recv_sys_t) */
recv_sys_t recv_sys;
/** true when applying redo log records during crash recovery; false
otherwise. Note that this is false while a background thread is
rolling back incomplete transactions.
Reset to false by recv_sys_var_init(). */
volatile bool recv_recovery_on;
/** true when recv_init_crash_recovery() has been called.
Reset to false by recv_sys_var_init(). */
bool recv_needed_recovery;
#ifdef UNIV_DEBUG
/** true if writing to the redo log (mtr_commit) is forbidden.
Protected by log_sys.mutex. Only exists in debug builds. */
bool recv_no_log_write = false;
#endif /* UNIV_DEBUG */
/** true if buf_page_is_corrupted() should check if the log sequence
number (FIL_PAGE_LSN) is in the future. Initially false, and set by
recv_recovery_from_checkpoint_start().
Reset to false by recv_sys_var_init(). */
bool recv_lsn_checks_on;
/** If the following is true, the buffer pool file pages must be invalidated
after recovery and no ibuf operations are allowed; this becomes true if
the log record hash table becomes too full, and log records must be merged
to file pages already before the recovery is finished: in this case no
ibuf operations are allowed, as they could modify the pages read in the
buffer pool before the pages have been recovered to the up-to-date state.
true means that recovery is running and no operations on the log files
are allowed yet: the variable name is misleading.
Reset to false by recv_sys_var_init(). */
bool recv_no_ibuf_operations;
/** The type of the previous parsed redo log record.
Reset to MLOG_SINGLE_REC_FLAG by recv_sys_var_init(). */
static mlog_id_t recv_previous_parsed_rec_type;
/** The offset of the previous parsed redo log record.
Reset to 0 by recv_sys_var_init(). */
static ulint recv_previous_parsed_rec_offset;
/** The 'multi' flag of the previous parsed redo log record
(stored as an integer; reset to 0 by recv_sys_var_init()). */
static ulint recv_previous_parsed_rec_is_multi;
/** The maximum lsn we see for a page during the recovery process. If this
is bigger than the lsn we are able to scan up to, that is an indication that
the recovery failed and the database may be corrupt.
Reset to 0 by both recv_sys_var_init() and recv_sys_t::create(). */
static lsn_t recv_max_page_lsn;
#ifdef UNIV_PFS_THREAD
/** Performance schema key under which recv_writer_thread() registers itself */
mysql_pfs_key_t recv_writer_thread_key;
#endif /* UNIV_PFS_THREAD */
/** Is recv_writer_thread active?
Cleared by recv_writer_thread() itself just before the thread exits. */
bool recv_writer_thread_active;
/** A buffered physiological redo log record, tagged with the byte-oriented
start and end LSN of the mini-transaction that it belongs to */
struct recv_t : public log_rec_t
{
  /**
  Create a record descriptor that has no payload attached yet.
  @param len       total length of the redo log record body, in bytes
  @param type      redo log record type
  @param start_lsn start LSN of the mini-transaction
  @param end_lsn   end LSN of the mini-transaction
  */
  recv_t(uint32_t len, mlog_id_t type, lsn_t start_lsn, lsn_t end_lsn) :
    log_rec_t(end_lsn),
    start_lsn(start_lsn),
    len(len),
    type(type),
    data(NULL)
  {
  }

  /** start LSN of the mini-transaction (not necessarily of this record) */
  const lsn_t start_lsn;
  /** length of the log record body, in bytes */
  const uint32_t len;
  /** type of the log record */
  const mlog_id_t type;

  /** One chunk of the log record body */
  struct data_t
  {
    /** the following chunk, or NULL if this is the last one. The
    payload bytes of this chunk are located right after this field. */
    data_t *next= NULL;

    data_t() {}

    /**
    Create a chunk and copy its payload right behind the header.
    @param chunk redo log record chunk
    @param len   length of the chunk, in bytes
    */
    data_t(const void* chunk, size_t len)
    {
      void *payload= this + 1;
      memcpy(payload, chunk, len);
    }

    /**
    Link a further log snippet after this one.
    Neither this chunk nor the appended one may already have a successor.
    @param d log snippet
    */
    void append(data_t *d)
    {
      ut_ad(!next);
      ut_ad(!d->next);
      next= d;
    }
  };

  /** the first chunk of the log record body */
  data_t *data;

  /** Release all chunks of the record body, and then the record itself.
  At least one chunk must have been attached. */
  void free() const
  {
    for (data_t *chunk= data;;)
    {
      data_t *const following= chunk->next;
      recv_sys.free(chunk);
      if (!following)
        break;
      chunk= following;
    }

    recv_sys.free(this);
  }
};
#ifndef DBUG_OFF
/** Return string name of the redo log record type.
@param[in] type record log record enum
@return string name of record log record */
static const char* get_mlog_string(mlog_id_t type);
#endif /* !DBUG_OFF */
/** What recovery knows about one tablespace file */
struct file_name_t {
	/** File name of the tablespace (from MLOG_FILE_NAME) */
	std::string	name;
	/** The tablespace object, or NULL if it is not valid or not found */
	fil_space_t*	space;

	/** Possible states of the tablespace */
	enum fil_status {
		/** The tablespace is a normal one */
		NORMAL,
		/** The tablespace was deleted */
		DELETED,
		/** The tablespace is missing */
		MISSING
	};

	/** Current state of the tablespace */
	fil_status	status;

	/** FSP_SIZE of the tablespace */
	ulint		size;

	/** LSN of the most recent MLOG_INDEX_LOAD record that was
	seen for the tablespace */
	lsn_t		enable_lsn;

	/** Constructor
	@param name_	file name
	@param deleted	whether the tablespace starts out as DELETED
			(otherwise it is NORMAL) */
	file_name_t(std::string name_, bool deleted) :
		name(name_), space(NULL), status(NORMAL),
		size(0), enable_lsn(0)
	{
		if (deleted) {
			status = DELETED;
		}
	}

	/** Note a MLOG_INDEX_LOAD operation: mlog_init for any earlier
	LSN must be skipped. Only the largest LSN is remembered.
	@param lsn	log sequence number of the MLOG_INDEX_LOAD */
	void mlog_index_load(lsn_t lsn)
	{
		if (lsn > enable_lsn) {
			enable_lsn = lsn;
		}
	}
};
/** Map of dirty tablespaces during recovery, keyed by tablespace
identifier. Entries are added by fil_name_process() and the map is
emptied by recv_sys_t::close(). */
typedef std::map<
ulint,
file_name_t,
std::less<ulint>,
ut_allocator<std::pair<const ulint, file_name_t> > > recv_spaces_t;
/** The tablespaces for which a MLOG_FILE_* record has been processed */
static recv_spaces_t recv_spaces;
/** Report optimized DDL operation (without redo log),
corresponding to MLOG_INDEX_LOAD.
NOTE(review): this is a function pointer; presumably it is left NULL
unless a backup tool installs a handler -- confirm against the callers.
@param[in] space_id tablespace identifier
*/
void (*log_optimized_ddl_op)(ulint space_id);
/** Report an operation to create, delete, or rename a file during backup.
This is a function pointer; fil_name_parse() invokes it only when it
is not NULL.
@param[in] space_id tablespace identifier
@param[in] flags tablespace flags (NULL if not create)
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
void (*log_file_op)(ulint space_id, const byte* flags,
const byte* name, ulint len,
const byte* new_name, ulint new_len);
/** Information about initializing page contents during redo log processing */
class mlog_init_t
{
public:
/** A page initialization operation that was parsed from
the redo log */
struct init {
/** log sequence number of the page initialization */
lsn_t lsn;
/** Whether btr_page_create() avoided a read of the page.
At the end of the last recovery batch, mark_ibuf_exist()
will mark pages for which this flag is set. */
bool created;
};
private:
typedef std::map<const page_id_t, init,
std::less<const page_id_t>,
ut_allocator<std::pair<const page_id_t, init> > >
map;
/** Map of page initialization operations.
FIXME: Merge this to recv_sys.pages! */
map inits;
public:
/** Record that a page will be initialized by the redo log.
@param[in] page_id page identifier
@param[in] lsn log sequence number */
void add(const page_id_t page_id, lsn_t lsn)
{
ut_ad(mutex_own(&recv_sys.mutex));
const init init = { lsn, false };
std::pair<map::iterator, bool> p = inits.insert(
map::value_type(page_id, init));
ut_ad(!p.first->second.created);
if (!p.second && p.first->second.lsn < init.lsn) {
p.first->second = init;
}
}
/** Get the last stored lsn of the page id and its respective
init/load operation.
@param[in] page_id page id
@param[in,out] init initialize log or load log
@return the latest page initialization;
not valid after releasing recv_sys.mutex. */
init& last(page_id_t page_id)
{
ut_ad(mutex_own(&recv_sys.mutex));
return inits.find(page_id)->second;
}
/** At the end of each recovery batch, reset the 'created' flags. */
void reset()
{
ut_ad(mutex_own(&recv_sys.mutex));
ut_ad(recv_no_ibuf_operations);
for (map::value_type& i : inits) {
i.second.created = false;
}
}
/** On the last recovery batch, mark whether there exist
buffered changes for the pages that were initialized
by buf_page_create() and still reside in the buffer pool.
@param[in,out] mtr dummy mini-transaction */
void mark_ibuf_exist(mtr_t& mtr)
{
ut_ad(mutex_own(&recv_sys.mutex));
ut_ad(!recv_no_ibuf_operations);
mtr.start();
for (const map::value_type& i : inits) {
if (!i.second.created) {
continue;
}
if (buf_block_t* block = buf_page_get_gen(
i.first, 0, RW_X_LATCH, NULL,
BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
&mtr)) {
mutex_exit(&recv_sys.mutex);
block->page.ibuf_exist = ibuf_page_exists(
block->page);
mtr.commit();
mtr.start();
mutex_enter(&recv_sys.mutex);
}
}
mtr.commit();
}
/** Clear the data structure */
void clear() { inits.clear(); }
};
static mlog_init_t mlog_init;
/** Process a record that indicates that a tablespace is
being shrunk in size.
@param page_id first page identifier that is not in the file
@param lsn log sequence number of the shrink operation */
inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
{
DBUG_ENTER("recv_sys_t::trim");
DBUG_LOG("ib_log",
"discarding log beyond end of tablespace "
<< page_id << " before LSN " << lsn);
ut_ad(mutex_own(&mutex));
for (recv_sys_t::map::iterator p = pages.lower_bound(page_id);
p != pages.end() && p->first.space() == page_id.space();) {
recv_sys_t::map::iterator r = p++;
if (r->second.log.trim(lsn)) {
pages.erase(r);
}
}
if (fil_space_t* space = fil_space_get(page_id.space())) {
ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
fil_node_t* file = UT_LIST_GET_FIRST(space->chain);
ut_ad(file->is_open());
os_file_truncate(file->name, file->handle,
os_offset_t{page_id.page_no()}
<< srv_page_size_shift, true);
}
DBUG_VOID_RETURN;
}
/** Register the file name of a MLOG_FILE_* record in recv_spaces and,
unless the file was deleted, try to open the tablespace file.
Nothing is done when running as a backup (SRV_OPERATION_BACKUP).
@param[in,out]	name		file name
@param[in]	len		length of the file name
@param[in]	space_id	the tablespace ID
@param[in]	deleted		whether this is a MLOG_FILE_DELETE record */
static
void
fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
{
	if (srv_operation == SRV_OPERATION_BACKUP) {
		return;
	}

	ut_ad(srv_operation == SRV_OPERATION_NORMAL
	      || srv_operation == SRV_OPERATION_RESTORE
	      || srv_operation == SRV_OPERATION_RESTORE_EXPORT);

	/* An entry with space=NULL is inserted too. That way, later
	checks can verify that a MLOG_FILE_NAME record was scanned
	before any page records for the space_id are applied. */

	os_normalize_path(name);
	file_name_t	fname(std::string(name, len - 1), deleted);
	std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.insert(
		std::make_pair(space_id, fname));
	ut_ad(p.first->first == space_id);

	file_name_t&	f = p.first->second;
	const bool	is_new_entry = p.second;

	if (deleted) {
		/* This is a MLOG_FILE_DELETE record. */
		if (!is_new_entry && f.status != file_name_t::DELETED) {
			f.status = file_name_t::DELETED;
			if (f.space != NULL) {
				fil_space_free(space_id, false);
				f.space = NULL;
			}
		}

		ut_ad(f.space == NULL);
		return;
	}

	if (!is_new_entry && f.name == fname.name) {
		/* Neither the first MLOG_FILE_NAME or MLOG_FILE_RENAME2
		for the tablespace, nor a change of the file name. */
		return;
	}

	fil_space_t*	space;

	/* Find out whether the tablespace file exists and carries
	the space_id. If it does not, a note is displayed and the file
	is ignored. Multiple files with the same space_id are an error. */
	switch (fil_ibd_load(space_id, name, space)) {
	case FIL_LOAD_OK:
		ut_ad(space != NULL);

		if (f.space != NULL && f.space != space) {
			ib::error() << "Tablespace " << space_id
				<< " has been found in two places: '"
				<< f.name << "' and '" << name << "'."
				" You must delete one of them.";
			recv_sys.found_corrupt_fs = true;
			break;
		}

		if (f.size && f.space == NULL) {
			fil_space_set_recv_size(space->id, f.size);
		}

		f.name = fname.name;
		f.space = space;
		f.status = file_name_t::NORMAL;
		break;

	case FIL_LOAD_ID_CHANGED:
		ut_ad(space == NULL);
		break;

	case FIL_LOAD_NOT_FOUND:
		/* There is no matching tablespace. It may have been
		renamed, in which case a later MLOG_FILE_* record
		will be found. */
		ut_ad(space == NULL);

		if (srv_force_recovery) {
			/* When innodb_force_recovery is not set,
			missing tablespaces are only reported in
			recv_init_crash_recovery_spaces(). Be more
			verbose when recovery is being forced. */
			ib::info()
				<< "At LSN: " << recv_sys.recovered_lsn
				<< ": unable to open file " << name
				<< " for tablespace " << space_id;
		}
		break;

	case FIL_LOAD_INVALID:
		ut_ad(space == NULL);

		if (srv_force_recovery != 0) {
			ib::info() << "innodb_force_recovery was set to "
				<< srv_force_recovery << ". Continuing crash"
				" recovery even though we cannot access the"
				" files for tablespace " << space_id << ".";
		} else {
			ib::warn() << "We do not continue the crash"
				" recovery, because the table may"
				" become corrupt if we cannot apply"
				" the log records in the InnoDB log to"
				" it. To fix the problem and start"
				" mysqld:";
			ib::info() << "1) If there is a permission"
				" problem in the file and mysqld"
				" cannot open the file, you should"
				" modify the permissions.";
			ib::info() << "2) If the tablespace is not"
				" needed, or you can restore an older"
				" version from a backup, then you can"
				" remove the .ibd file, and use"
				" --innodb_force_recovery=1 to force"
				" startup without this file.";
			ib::info() << "3) If the file system or the"
				" disk is broken, and you cannot"
				" remove the .ibd file, you can set"
				" --innodb_force_recovery.";
			recv_sys.found_corrupt_fs = true;
		}
		break;
	}
}
/** Parse or process a MLOG_FILE_* record.
@param[in,out] ptr redo log record
@param[in] end end of the redo log buffer
@param[in] page_id first page number in the file
@param[in] type MLOG_FILE_NAME or MLOG_FILE_DELETE
or MLOG_FILE_CREATE2 or MLOG_FILE_RENAME2
@param[in] apply whether to apply the record
@return pointer to next redo log record
@retval NULL if this log record was truncated */
static
const byte*
fil_name_parse(
byte* ptr,
const byte* end,
const page_id_t page_id,
mlog_id_t type,
bool apply)
{
if (type == MLOG_FILE_CREATE2) {
if (end < ptr + 4) {
return(NULL);
}
ptr += 4;
}
if (end < ptr + 2) {
return(NULL);
}
ulint len = mach_read_from_2(ptr);
ptr += 2;
if (end < ptr + len) {
return(NULL);
}
/* MLOG_FILE_* records should only be written for
user-created tablespaces. The name must be long enough
and end in .ibd. */
bool corrupt = is_predefined_tablespace(page_id.space())
|| len < sizeof "/a.ibd\0"
|| (!page_id.page_no() != !memcmp(ptr + len - 5, DOT_IBD, 5));
if (!corrupt && !memchr(ptr, OS_PATH_SEPARATOR, len)) {
if (byte* c = static_cast<byte*>
(memchr(ptr, OS_PATH_SEPARATOR_ALT, len))) {
ut_ad(c >= ptr);
ut_ad(c < ptr + len);
do {
*c = OS_PATH_SEPARATOR;
} while ((c = static_cast<byte*>
(memchr(ptr, OS_PATH_SEPARATOR_ALT,
len - ulint(c - ptr)))) != NULL);
} else {
corrupt = true;
}
}
byte* end_ptr = ptr + len;
switch (type) {
default:
ut_ad(0); // the caller checked this
/* fall through */
case MLOG_FILE_NAME:
if (corrupt) {
ib::error() << "MLOG_FILE_NAME incorrect:" << ptr;
recv_sys.found_corrupt_log = true;
break;
}
fil_name_process(
reinterpret_cast<char*>(ptr), len, page_id.space(),
false);
break;
case MLOG_FILE_DELETE:
if (corrupt) {
ib::error() << "MLOG_FILE_DELETE incorrect:" << ptr;
recv_sys.found_corrupt_log = true;
break;
}
fil_name_process(reinterpret_cast<char*>(ptr), len,
page_id.space(), true);
/* fall through */
case MLOG_FILE_CREATE2:
if (page_id.page_no()) {
ut_ad(page_id.page_no()
== SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
ut_a(srv_is_undo_tablespace(page_id.space()));
compile_time_assert(
UT_ARR_SIZE(recv_sys.truncated_undo_spaces)
== TRX_SYS_MAX_UNDO_SPACES);
recv_sys_t::trunc& t = recv_sys.truncated_undo_spaces[
page_id.space() - srv_undo_space_id_start];
t.lsn = recv_sys.recovered_lsn;
t.pages = uint32_t(page_id.page_no());
} else if (log_file_op) {
log_file_op(page_id.space(),
type == MLOG_FILE_CREATE2 ? ptr - 4 : NULL,
ptr, len, NULL, 0);
}
break;
case MLOG_FILE_RENAME2:
if (corrupt) {
ib::error() << "MLOG_FILE_RENAME2 incorrect:" << ptr;
recv_sys.found_corrupt_log = true;
}
/* The new name follows the old name. */
byte* new_name = end_ptr + 2;
if (end < new_name) {
return(NULL);
}
ulint new_len = mach_read_from_2(end_ptr);
if (end < end_ptr + 2 + new_len) {
return(NULL);
}
end_ptr += 2 + new_len;
corrupt = corrupt
|| new_len < sizeof "/a.ibd\0"
|| memcmp(new_name + new_len - 5, DOT_IBD, 5) != 0;
if (!corrupt && !memchr(new_name, OS_PATH_SEPARATOR, new_len)) {
if (byte* c = static_cast<byte*>
(memchr(new_name, OS_PATH_SEPARATOR_ALT,
new_len))) {
ut_ad(c >= new_name);
ut_ad(c < new_name + new_len);
do {
*c = OS_PATH_SEPARATOR;
} while ((c = static_cast<byte*>
(memchr(ptr, OS_PATH_SEPARATOR_ALT,
new_len
- ulint(c - new_name))))
!= NULL);
} else {
corrupt = true;
}
}
if (corrupt) {
ib::error() << "MLOG_FILE_RENAME2 new_name incorrect:" << ptr
<< " new_name: " << new_name;
recv_sys.found_corrupt_log = true;
break;
}
fil_name_process(
reinterpret_cast<char*>(ptr), len,
page_id.space(), false);
fil_name_process(
reinterpret_cast<char*>(new_name), new_len,
page_id.space(), false);
if (log_file_op) {
log_file_op(page_id.space(), NULL,
ptr, len, new_name, new_len);
}
if (!apply) {
break;
}
if (!fil_op_replay_rename(
page_id.space(), page_id.page_no(),
reinterpret_cast<const char*>(ptr),
reinterpret_cast<const char*>(new_name))) {
recv_sys.found_corrupt_fs = true;
}
}
return(end_ptr);
}
/** Clean up after recv_sys_t::create() */
void recv_sys_t::close()
{
ut_ad(this == &recv_sys);
ut_ad(!recv_writer_thread_active);
if (is_initialised()) {
dblwr.pages.clear();
ut_d(mutex_enter(&mutex));
clear();
ut_d(mutex_exit(&mutex));
if (flush_start) {
os_event_destroy(flush_start);
}
if (flush_end) {
os_event_destroy(flush_end);
}
if (buf) {
ut_free_dodump(buf, buf_size);
buf = NULL;
}
buf_size = 0;
mutex_free(&writer_mutex);
mutex_free(&mutex);
}
recv_spaces.clear();
mlog_init.clear();
}
/** Bring the file-scope recovery state variables back to their
initial values. */
void
recv_sys_var_init(void)
{
	/* Recovery status flags */
	recv_recovery_on = false;
	recv_needed_recovery = false;
	recv_lsn_checks_on = false;
	recv_no_ibuf_operations = false;

	/* Bookkeeping of the log record parser */
	recv_previous_parsed_rec_type = MLOG_SINGLE_REC_FLAG;
	recv_previous_parsed_rec_offset = 0;
	recv_previous_parsed_rec_is_multi = 0;

	/* Largest page LSN encountered so far */
	recv_max_page_lsn = 0;
}
/******************************************************************//**
recv_writer thread tasked with flushing dirty pages from the buffer
pools.
The thread loops until shutdown starts or recovery is no longer on.
In each round it waits for buf_flush_event (at most 100ms), and then,
holding recv_sys.writer_mutex, requests a BUF_FLUSH_LRU flush via
recv_sys.flush_start and waits for recv_sys.flush_end.
@return a dummy parameter */
extern "C"
os_thread_ret_t
DECLARE_THREAD(recv_writer_thread)(
/*===============================*/
void* arg MY_ATTRIBUTE((unused)))
/*!< in: a dummy parameter required by
os_thread_create */
{
my_thread_init();
/* The caller must not start this thread in read-only mode. */
ut_ad(!srv_read_only_mode);
#ifdef UNIV_PFS_THREAD
pfs_register_thread(recv_writer_thread_key);
#endif /* UNIV_PFS_THREAD */
#ifdef UNIV_DEBUG_THREAD_CREATION
ib::info() << "recv_writer thread running, id "
<< os_thread_pf(os_thread_get_curr_id());
#endif /* UNIV_DEBUG_THREAD_CREATION */
while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
/* Wait till we get a signal to clean the LRU list.
Bounded by max wait time of 100ms. */
int64_t sig_count = os_event_reset(buf_flush_event);
os_event_wait_time_low(buf_flush_event, 100000, sig_count);
mutex_enter(&recv_sys.writer_mutex);
/* Recovery has ended: release the mutex and leave the loop. */
if (!recv_recovery_is_on()) {
mutex_exit(&recv_sys.writer_mutex);
break;
}
/* Flush pages from end of LRU if required */
/* Reset flush_end before signalling flush_start, so that the
wait below only returns for a completion that is signalled
after this point. */
os_event_reset(recv_sys.flush_end);
recv_sys.flush_type = BUF_FLUSH_LRU;
os_event_set(recv_sys.flush_start);
os_event_wait(recv_sys.flush_end);
mutex_exit(&recv_sys.writer_mutex);
}
/* Announce that the thread is about to terminate. */
recv_writer_thread_active = false;
my_thread_end();
/* We count the number of threads in os_thread_exit().
A created thread should always use that to exit and not
use return() to exit. */
os_thread_exit();
OS_THREAD_DUMMY_RETURN;
}
/** Set up the redo log recovery subsystem: create its mutexes and
(unless in read-only mode) its flush events, allocate the parsing
buffer, and reset all progress counters. */
void recv_sys_t::create()
{
	ut_ad(this == &recv_sys);
	ut_ad(!is_initialised());
	ut_ad(!flush_start);
	ut_ad(!flush_end);

	/* Synchronization objects */
	mutex_create(LATCH_ID_RECV_SYS, &mutex);
	mutex_create(LATCH_ID_RECV_WRITER, &writer_mutex);

	if (!srv_read_only_mode) {
		flush_start = os_event_create(0);
		flush_end = os_event_create(0);
	}

	/* Batch state */
	flush_type = BUF_FLUSH_LRU;
	apply_log_recs = false;
	apply_batch_on = false;

	/* Limit on buffered log: a third of the buffer pool pages */
	max_log_blocks = buf_pool_get_n_pages() / 3;

	/* Parsing buffer */
	buf = static_cast<byte*>(ut_malloc_dontdump(RECV_PARSING_BUF_SIZE));
	buf_size = RECV_PARSING_BUF_SIZE;
	len = 0;

	/* Scan and parse progress */
	parse_start_lsn = 0;
	scanned_lsn = 0;
	scanned_checkpoint_no = 0;
	recovered_offset = 0;
	recovered_lsn = 0;

	/* Nothing has been found corrupted yet */
	found_corrupt_log = false;
	found_corrupt_fs = false;

	mlog_checkpoint_lsn = 0;
	progress_time = time(NULL);
	recv_max_page_lsn = 0;

	memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces);
	last_stored_lsn = 0;
	UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU);
}
/** Clear a fully processed set of stored redo log records. */
inline void recv_sys_t::clear()
{
ut_ad(mutex_own(&mutex));
apply_log_recs= false;
apply_batch_on= false;
ut_ad(!after_apply || !UT_LIST_GET_LAST(blocks));
pages.clear();
for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; )
{
buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block);
ut_ad(buf_block_get_state(block) == BUF_BLOCK_MEMORY);
UT_LIST_REMOVE(blocks, block);
buf_block_free(block);
block= prev_block;
}
}
/** Release most of the recovery data structures: the buffered page
log and the parsing buffer. Unless the server is in read-only mode,
flush_start is signalled as well. */
void recv_sys_t::debug_free()
{
	ut_ad(this == &recv_sys);
	ut_ad(is_initialised());

	mutex_enter(&mutex);

	pages.clear();
	ut_free_dodump(buf, buf_size);
	buf = NULL;

	if (srv_read_only_mode) {
		mutex_exit(&mutex);
		return;
	}

	/* Wake up the page cleaner, so that it can make progress. */
	ut_ad(!recv_recovery_is_on());
	ut_ad(!recv_writer_thread_active);
	os_event_reset(buf_flush_event);
	os_event_set(flush_start);

	mutex_exit(&mutex);
}
inline size_t recv_sys_t::get_free_len() const
{
if (const buf_block_t* block= UT_LIST_GET_FIRST(blocks))
{
if (const size_t used= static_cast<uint16_t>(block->page.access_time))
return srv_page_size - used;
ut_ad(srv_page_size == 65536);
}
return 0;
}
/** Allocate memory for a redo log snippet from the first block of
the list, or from a newly allocated block if the request does not fit.
block->page.access_time is used for bookkeeping: its low 16 bits hold
the ALIGNMENT-rounded offset of the first unused byte in the block, and
its high 16 bits count the allocations made from the block
(free() decrements that count).
@param len number of bytes to allocate (nonzero, at most srv_page_size)
@param store_recv if set, a new block is started unless
sizeof(recv_t::data) + 1 further bytes would still fit after this allocation
@return pointer to len bytes of memory, aligned to ALIGNMENT */
inline byte* recv_sys_t::alloc(size_t len, bool store_recv)
{
ut_ad(mutex_own(&mutex));
ut_ad(len);
ut_ad(len <= srv_page_size);
buf_block_t *block= UT_LIST_GET_FIRST(blocks);
if (UNIV_UNLIKELY(!block))
{
create_block:
/* Start a new block: one allocation, occupying the first
(aligned) len bytes of the frame. */
block= buf_block_alloc();
block->page.access_time= 1U << 16 |
ut_calc_align<uint16_t>(static_cast<uint16_t>(len), ALIGNMENT);
static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
UT_LIST_ADD_FIRST(blocks, block);
UNIV_MEM_INVALID(block->frame, len);
UNIV_MEM_FREE(block->frame + len, srv_page_size - len);
return my_assume_aligned<ALIGNMENT>(block->frame);
}
size_t free_offset= static_cast<uint16_t>(block->page.access_time);
ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT));
if (UNIV_UNLIKELY(!free_offset))
{
/* NOTE(review): presumably the 16-bit offset wrapped around to 0
because a 64KiB block is completely full -- confirm. */
ut_ad(srv_page_size == 65536);
goto create_block;
}
ut_ad(free_offset <= srv_page_size);
/* From here on, free_offset is the end of the new allocation. */
free_offset+= len;
if (store_recv && free_offset + sizeof(recv_t::data) + 1 > srv_page_size)
goto create_block;
if (free_offset > srv_page_size)
goto create_block;
/* Count one more allocation and store the new aligned free offset. */
block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
ut_calc_align<uint16_t>(static_cast<uint16_t>(free_offset), ALIGNMENT);
UNIV_MEM_ALLOC(block->frame + free_offset - len, len);
return my_assume_aligned<ALIGNMENT>(block->frame + free_offset - len);
}
/** Free a redo log snippet.
The buffer pool chunks are searched for the block whose frame contains
the snippet. The allocation count in the high 16 bits of
block->page.access_time is decremented, and the block is removed from
the list and freed once that count reaches zero.
@param data buffer returned by alloc() */
inline void recv_sys_t::free(const void *data)
{
ut_ad(!ut_align_offset(data, ALIGNMENT));
/* Only the start of the page frame is needed from here on. */
data= page_align(data);
ut_ad(mutex_own(&mutex));
#ifdef UNIV_DEBUG
/* MDEV-14481 FIXME: To prevent race condition with buf_pool_resize(),
we must acquire and hold the buffer pool mutex here. */
extern volatile bool buf_pool_resizing;
extern volatile bool buf_pool_withdrawing;
ut_ad(!buf_pool_resizing);
ut_ad(!buf_pool_withdrawing);
#endif
buf_chunk_t *chunk= buf_pool->chunks;
for (auto i= buf_pool->n_chunks; i--; chunk++)
{
/* Skip the chunks whose frames do not cover the address. */
if (data < chunk->blocks->frame)
continue;
/* Index of the block within the chunk */
const size_t offs= (reinterpret_cast<const byte*>(data) -
chunk->blocks->frame) >> srv_page_size_shift;
if (offs >= chunk->size)
continue;
buf_block_t *block= &chunk->blocks[offs];
ut_ad(block->frame == data);
ut_ad(buf_block_get_state(block) == BUF_BLOCK_MEMORY);
ut_ad(static_cast<uint16_t>(block->page.access_time - 1) <
srv_page_size);
ut_ad(block->page.access_time >= 1U << 16);
/* Drop one allocation; release the block with the last one. */
if (!((block->page.access_time -= 1U << 16) >> 16))
{
UT_LIST_REMOVE(blocks, block);
buf_block_free(block);
}
return;
}
/* The address does not belong to any buffer pool chunk. */
ut_ad(0);
}
/** Read a log segment to log_sys.buf.
The segment is read in pieces that do not extend past the end of a file
(file_size), and every OS_FILE_LOG_BLOCK_SIZE block is validated:
block number, checksum, decryption (if the log is encrypted) and data
length. Reading stops at the first block that fails validation.
The caller must hold log_sys.mutex; both LSNs must be multiples of
OS_FILE_LOG_BLOCK_SIZE, and the area must not be empty.
@param[in,out] start_lsn in: read area start,
out: the last read valid lsn
@param[in] end_lsn read area end
@return whether no invalid blocks (e.g checksum mismatch) were found */
bool log_t::files::read_log_seg(lsn_t* start_lsn, lsn_t end_lsn)
{
ulint len;
bool success = true;
ut_ad(log_sys.mutex.is_owned());
ut_ad(!(*start_lsn % OS_FILE_LOG_BLOCK_SIZE));
ut_ad(!(end_lsn % OS_FILE_LOG_BLOCK_SIZE));
/* buf keeps advancing across the iterations of the loop below. */
byte* buf = log_sys.buf;
loop:
lsn_t source_offset = calc_lsn_offset(*start_lsn);
ut_a(end_lsn - *start_lsn <= ULINT_MAX);
len = (ulint) (end_lsn - *start_lsn);
ut_ad(len != 0);
const bool at_eof = (source_offset % file_size) + len > file_size;
if (at_eof) {
/* If the above condition is true then len (which is ulint)
is > the expression below, so the typecast is ok */
len = ulint(file_size - (source_offset % file_size));
}
log_sys.n_log_ios++;
MONITOR_INC(MONITOR_LOG_IO);
ut_a((source_offset >> srv_page_size_shift) <= ULINT_MAX);
log_sys.log.read(static_cast<size_t>(source_offset), {buf, len});
/* Validate the blocks that were just read, one at a time. */
for (ulint l = 0; l < len; l += OS_FILE_LOG_BLOCK_SIZE,
buf += OS_FILE_LOG_BLOCK_SIZE,
(*start_lsn) += OS_FILE_LOG_BLOCK_SIZE) {
const ulint block_number = log_block_get_hdr_no(buf);
if (block_number != log_block_convert_lsn_to_no(*start_lsn)) {
/* Garbage or an incompletely written log block.
We will not report any error, because this can
happen when InnoDB was killed while it was
writing redo log. We simply treat this as an
abrupt end of the redo log. */
fail:
/* Make the "*start_lsn != end_lsn" test below fail, so
that nothing more is read. *start_lsn is left at the
start of the invalid block. */
end_lsn = *start_lsn;
success = false;
break;
}
ulint crc = log_block_calc_checksum_crc32(buf);
ulint cksum = log_block_get_checksum(buf);
/* Debug injection: fake a checksum mismatch on the first block. */
DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch", {
static int block_counter;
if (block_counter++ == 0) {
cksum = crc + 1;
}
});
if (crc != cksum) {
ib::error() << "Invalid log block checksum."
<< " block: " << block_number
<< " checkpoint no: "
<< log_block_get_checkpoint_no(buf)
<< " expected: " << crc
<< " found: " << cksum;
goto fail;
}
if (is_encrypted()
&& !log_crypt(buf, *start_lsn,
OS_FILE_LOG_BLOCK_SIZE,
LOG_DECRYPT)) {
goto fail;
}
/* The data length must cover at least the block header, and
must either equal the block size or not exceed the trailer
offset. */
ulint dl = log_block_get_data_len(buf);
if (dl < LOG_BLOCK_HDR_SIZE
|| (dl != OS_FILE_LOG_BLOCK_SIZE
&& dl > log_sys.trailer_offset())) {
recv_sys.found_corrupt_log = true;
goto fail;
}
}
if (recv_sys.report(time(NULL))) {
ib::info() << "Read redo log up to LSN=" << *start_lsn;
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"Read redo log up to LSN=" LSN_PF,
*start_lsn);
}
/* More to read: the area was cut short at the end of a file. */
if (*start_lsn != end_lsn) {
goto loop;
}
return(success);
}
/********************************************************//**
Copies a log segment from the most up-to-date log group to the other log
groups, so that they all contain the latest log data. Also writes the info
about the latest checkpoint to the groups, and inits the fields in the group
memory structs to up-to-date values. */
static
void
recv_synchronize_groups()
{
const lsn_t recovered_lsn = recv_sys.recovered_lsn;
/* Read the last recovered log block to the recovery system buffer:
the block is always incomplete */
lsn_t start_lsn = ut_uint64_align_down(recovered_lsn,
OS_FILE_LOG_BLOCK_SIZE);
log_sys.log.read_log_seg(&start_lsn,
start_lsn + OS_FILE_LOG_BLOCK_SIZE);
log_sys.log.set_fields(recovered_lsn);
/* Copy the checkpoint info to the log; remember that we have
incremented checkpoint_no by one, and the info will not be written
over the max checkpoint info, thus making the preservation of max
checkpoint info on disk certain */
if (!srv_read_only_mode) {
log_write_checkpoint_info(0);
log_mutex_enter();
}
}
/** Check the consistency of a log header block.
@param[in] log header block
@return true if ok */
static
bool
recv_check_log_header_checksum(
const byte* buf)
{
return(log_block_get_checksum(buf)
== log_block_calc_checksum_crc32(buf));
}
/** Find the latest checkpoint in the format-0 log header.
Both checkpoint slots are read. A slot is skipped if one of its two
checksums does not match or if its checkpoint cannot be decrypted.
The LSN and the LSN offset of the slot with the highest checkpoint
number are stored in log_sys.log.
@param[out] max_field LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
(not assigned if no valid checkpoint is found)
@return error code or DB_SUCCESS
@retval DB_ERROR if neither slot holds a valid checkpoint */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
recv_find_max_checkpoint_0(ulint* max_field)
{
ib_uint64_t max_no = 0;
ib_uint64_t checkpoint_no;
byte* buf = log_sys.checkpoint_buf;
ut_ad(log_sys.log.format == 0);
/** Offset of the first checkpoint checksum */
static const uint CHECKSUM_1 = 288;
/** Offset of the second checkpoint checksum */
static const uint CHECKSUM_2 = CHECKSUM_1 + 4;
/** Most significant bits of the checkpoint offset */
static const uint OFFSET_HIGH32 = CHECKSUM_2 + 12;
/** Least significant bits of the checkpoint offset */
static const uint OFFSET_LOW32 = 16;
bool found = false;
/* Visit exactly the two slots LOG_CHECKPOINT_1 and LOG_CHECKPOINT_2. */
for (ulint field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
log_header_read(field);
/* The first checksum covers the bytes before CHECKSUM_1; the
second one covers the bytes from LOG_CHECKPOINT_LSN up to
CHECKSUM_2. */
if (static_cast<uint32_t>(ut_fold_binary(buf, CHECKSUM_1))
!= mach_read_from_4(buf + CHECKSUM_1)
|| static_cast<uint32_t>(
ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
CHECKSUM_2 - LOG_CHECKPOINT_LSN))
!= mach_read_from_4(buf + CHECKSUM_2)) {
DBUG_LOG("ib_log",
"invalid pre-10.2.2 checkpoint " << field);
continue;
}
checkpoint_no = mach_read_from_8(
buf + LOG_CHECKPOINT_NO);
if (!log_crypt_101_read_checkpoint(buf)) {
ib::error() << "Decrypting checkpoint failed";
continue;
}
DBUG_PRINT("ib_log",
("checkpoint " UINT64PF " at " LSN_PF " found",
checkpoint_no,
mach_read_from_8(buf + LOG_CHECKPOINT_LSN)));
/* With ">=", the second slot wins if both slots carry the same
checkpoint number. */
if (checkpoint_no >= max_no) {
found = true;
*max_field = field;
max_no = checkpoint_no;
log_sys.log.set_lsn(mach_read_from_8(
buf + LOG_CHECKPOINT_LSN));
/* The 64-bit offset is stored as two separate 32-bit halves. */
log_sys.log.set_lsn_offset(
lsn_t(mach_read_from_4(buf + OFFSET_HIGH32))
<< 32
| mach_read_from_4(buf + OFFSET_LOW32));
}
}
if (found) {
return(DB_SUCCESS);
}
ib::error() << "Upgrade after a crash is not supported."
" This redo log was created before MariaDB 10.2.2,"
" and we did not find a valid checkpoint."
" Please follow the instructions at"
" https://mariadb.com/kb/en/library/upgrading/";
return(DB_ERROR);
}
/** Check whether a redo log in the pre-MySQL 5.7.9/MariaDB 10.2.2
format is clean, and if so, mark it for upgrading.
The log is considered clean if the data length of the log block that
contains the checkpoint LSN ends exactly at that LSN.
@param[in]	lsn	checkpoint LSN
@param[in]	crypt	whether the log might be encrypted
@return error code
@retval	DB_SUCCESS	if the redo log is clean
@retval	DB_CORRUPTION	if the checksum of the log block is wrong
			and the block cannot be decrypted either
@retval	DB_ERROR	if the redo log is dirty */
static dberr_t recv_log_format_0_recover(lsn_t lsn, bool crypt)
{
	static const char* NO_UPGRADE_RECOVERY_MSG =
		"Upgrade after a crash is not supported."
		" This redo log was created before MariaDB 10.2.2";

	log_mutex_enter();
	const lsn_t	source_offset = log_sys.log.calc_lsn_offset(lsn);
	log_mutex_exit();

	byte*		buf = log_sys.buf;

	/* Read the log block that contains the checkpoint LSN. */
	log_sys.log.read(source_offset & ~(OS_FILE_LOG_BLOCK_SIZE - 1),
			 {buf, OS_FILE_LOG_BLOCK_SIZE});

	if (log_block_calc_checksum_format_0(buf)
	    != log_block_get_checksum(buf)
	    && !log_crypt_101_read_block(buf)) {
		ib::error() << NO_UPGRADE_RECOVERY_MSG
			    << ", and it appears corrupted.";
		return(DB_CORRUPTION);
	}

	if (log_block_get_data_len(buf)
	    != (source_offset & (OS_FILE_LOG_BLOCK_SIZE - 1))) {
		/* There is log beyond the checkpoint: the log is dirty. */
		if (crypt) {
			ib::error() << "Cannot decrypt log for upgrading."
				" The encrypted log was created"
				" before MariaDB 10.2.2.";
			return DB_ERROR;
		}

		ib::error() << NO_UPGRADE_RECOVERY_MSG << ".";
		return(DB_ERROR);
	}

	/* The log is clean. Mark it for upgrading. */
	srv_log_file_size = 0;
	recv_sys.parse_start_lsn = recv_sys.recovered_lsn
		= recv_sys.scanned_lsn
		= recv_sys.mlog_checkpoint_lsn = lsn;
	log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn
		= log_sys.lsn = log_sys.write_lsn
		= log_sys.current_flush_lsn = log_sys.flushed_to_disk_lsn
		= lsn;
	log_sys.next_checkpoint_no = 0;
	return(DB_SUCCESS);
}
/** Locate the most recent valid checkpoint in the redo log header.
On success, the checkpoint LSN, its offset and the checkpoint number
are stored in log_sys.
@param[out]	max_field	LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2
@return error code or DB_SUCCESS */
dberr_t
recv_find_max_checkpoint(ulint* max_field)
{
	byte* const	buf = log_sys.checkpoint_buf;
	ib_uint64_t	max_no = 0;

	*max_field = 0;

	log_header_read(0);

	/* The first redo log format (version 0) carries neither a
	subformat nor a header page checksum. */
	log_sys.log.format = mach_read_from_4(buf + LOG_HEADER_FORMAT);

	const bool	format_0 = log_sys.log.format == log_t::FORMAT_3_23;

	log_sys.log.subformat = format_0
		? 0
		: mach_read_from_4(buf + LOG_HEADER_SUBFORMAT);

	if (!format_0 && !recv_check_log_header_checksum(buf)) {
		ib::error() << "Invalid redo log header checksum.";
		return DB_CORRUPTION;
	}

	if (format_0) {
		return recv_find_max_checkpoint_0(max_field);
	}

	char	creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1];

	memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator);
	/* Force NUL termination of the creator string. */
	creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR] = 0;

	switch (log_sys.log.format) {
	case log_t::FORMAT_10_2:
	case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED:
	case log_t::FORMAT_10_3:
	case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED:
	case log_t::FORMAT_10_4:
	case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED:
		break;
	default:
		ib::error() << "Unsupported redo log format."
			" The redo log was created with " << creator << ".";
		return DB_ERROR;
	}

	/* Examine both checkpoint slots and remember the newest one. */
	const ulint	checkpoint_fields[] = {
		LOG_CHECKPOINT_1, LOG_CHECKPOINT_2
	};

	for (const ulint field : checkpoint_fields) {
		log_header_read(field);

		const ulint	crc32 = log_block_calc_checksum_crc32(buf);
		const ulint	cksum = log_block_get_checksum(buf);

		if (crc32 != cksum) {
			DBUG_PRINT("ib_log",
				   ("invalid checkpoint,"
				    " at " ULINTPF
				    ", checksum " ULINTPFx
				    " expected " ULINTPFx,
				    field, cksum, crc32));
			continue;
		}

		if (log_sys.is_encrypted()
		    && !log_crypt_read_checkpoint_buf(buf)) {
			ib::error() << "Reading checkpoint"
				" encryption info failed.";
			continue;
		}

		const ib_uint64_t	checkpoint_no = mach_read_from_8(
			buf + LOG_CHECKPOINT_NO);

		DBUG_PRINT("ib_log",
			   ("checkpoint " UINT64PF " at " LSN_PF " found",
			    checkpoint_no, mach_read_from_8(
				    buf + LOG_CHECKPOINT_LSN)));

		if (checkpoint_no < max_no) {
			continue;
		}

		*max_field = field;
		max_no = checkpoint_no;
		log_sys.log.set_lsn(mach_read_from_8(
					    buf + LOG_CHECKPOINT_LSN));
		log_sys.log.set_lsn_offset(mach_read_from_8(
						   buf + LOG_CHECKPOINT_OFFSET));
		log_sys.next_checkpoint_no = checkpoint_no;
	}

	if (*max_field != 0) {
		return DB_SUCCESS;
	}

	/* Before 10.2.2, we could get here during database
	initialization if we created an ib_logfile0 file that
	was filled with zeroes, and were killed. After
	10.2.2, we would reject such a file already earlier,
	when checking the file header. */
	ib::error() << "No valid checkpoint found"
		" (corrupted redo log)."
		" You can try --innodb-force-recovery=6"
		" as a last resort.";
	return DB_ERROR;
}
/** Parse the body of a single redo log record, and apply it to a page
if a buffer block is supplied.
@param[in]	type		redo log entry type
@param[in]	ptr		redo log record body
@param[in]	end_ptr		end of buffer
@param[in]	page_id		page identifier
@param[in]	apply		whether to apply the record
@param[in,out]	block		buffer block, or NULL if
				a page log record should not be applied
				or if it is a MLOG_FILE_ operation
@param[in,out]	mtr		mini-transaction, or NULL if
				a page log record should not be applied
@return log record end, NULL if not a complete record
(recv_sys.found_corrupt_log is additionally set when the record is
recognized as corrupted) */
static
const byte*
recv_parse_or_apply_log_rec_body(
	mlog_id_t	type,
	const byte*	ptr,
	const byte*	end_ptr,
	const page_id_t	page_id,
	bool		apply,
	buf_block_t*	block,
	mtr_t*		mtr)
{
	ut_ad(!block == !mtr);
	ut_ad(!apply || recv_sys.mlog_checkpoint_lsn);

	/* Records that do not refer to the contents of a page
	are handled first. */
	switch (type) {
	case MLOG_FILE_NAME:
	case MLOG_FILE_DELETE:
	case MLOG_FILE_CREATE2:
	case MLOG_FILE_RENAME2:
		ut_ad(block == NULL);
		/* Collect the file names when parsing the log,
		before applying any log records. */
		return fil_name_parse(const_cast<byte*>(ptr), end_ptr,
				      page_id, type, apply);
	case MLOG_INDEX_LOAD:
		if (end_ptr < ptr + 8) {
			return(NULL);
		}
		return(ptr + 8);
	case MLOG_TRUNCATE:
		ib::error() << "Cannot crash-upgrade from "
			"old-style TRUNCATE TABLE";
		recv_sys.found_corrupt_log = true;
		return NULL;
	default:
		break;
	}

	/* Dummy index created by mlog_parse_index(); freed at the end. */
	dict_index_t*	index = NULL;
	page_t*		page;
	page_zip_des_t*	page_zip;
#ifdef UNIV_DEBUG
	uint16_t	page_type;
#endif /* UNIV_DEBUG */

	if (block) {
		/* Applying a page log record. */
		ut_ad(apply);
		page = block->frame;
		page_zip = buf_block_get_page_zip(block);
		ut_d(page_type = fil_page_get_type(page));
	} else if (apply
		   && !is_predefined_tablespace(page_id.space())
		   && recv_spaces.find(page_id.space()) == recv_spaces.end()) {
		if (recv_sys.recovered_lsn < recv_sys.mlog_checkpoint_lsn) {
			/* We have not seen all records between the
			checkpoint and MLOG_CHECKPOINT. There should be
			a MLOG_FILE_DELETE for this tablespace later. */
			recv_spaces.insert(
				std::make_pair(page_id.space(),
					       file_name_t("", false)));
			goto parse_log;
		}

		ib::error() << "Missing MLOG_FILE_NAME or MLOG_FILE_DELETE"
			" for redo log record " << type << page_id << " at "
			<< recv_sys.recovered_lsn << ".";
		recv_sys.found_corrupt_log = true;
		return(NULL);
	} else {
parse_log:
		/* Parsing a page log record. */
		page = NULL;
		page_zip = NULL;
		ut_d(page_type = FIL_PAGE_TYPE_ALLOCATED);
	}

	/* Start of the record body; needed by cases that advance ptr
	before they are done with the payload. */
	const byte*	old_ptr = ptr;

	switch (type) {
	case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES:
	case MLOG_MEMSET:
#ifdef UNIV_DEBUG
		if (page && page_type == FIL_PAGE_TYPE_ALLOCATED
		    && end_ptr >= ptr + 2) {
			/* It is OK to set FIL_PAGE_TYPE and certain
			list node fields on an empty page.  Any other
			write is not OK. */

			/* NOTE: There may be bogus assertion failures for
			dict_hdr_create(), trx_rseg_header_create(),
			trx_sys_create_doublewrite_buf(), and
			trx_sysf_create().
			These are only called during database creation. */
			ulint	offs = mach_read_from_2(ptr);

			switch (type) {
			default:
				ut_error;
			case MLOG_1BYTE:
				ut_ad(offs == FIL_PAGE_TYPE + 1);
				break;
			case MLOG_2BYTES:
				/* Note that this can fail when the
				redo log has been written with something
				older than InnoDB Plugin 1.0.4. */
				ut_ad(offs == FIL_PAGE_TYPE
				      || srv_is_undo_tablespace(
					      page_id.space())
				      || offs == IBUF_TREE_SEG_HEADER
				      + IBUF_HEADER + FSEG_HDR_OFFSET
				      || offs == PAGE_BTR_IBUF_FREE_LIST
				      + PAGE_HEADER + FIL_ADDR_BYTE
				      || offs == PAGE_BTR_IBUF_FREE_LIST
				      + PAGE_HEADER + FIL_ADDR_BYTE
				      + FIL_ADDR_SIZE
				      || offs == PAGE_BTR_SEG_LEAF
				      + PAGE_HEADER + FSEG_HDR_OFFSET
				      || offs == PAGE_BTR_SEG_TOP
				      + PAGE_HEADER + FSEG_HDR_OFFSET
				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
				      + PAGE_HEADER + FIL_ADDR_BYTE
				      + 0 /*FLST_PREV*/
				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
				      + PAGE_HEADER + FIL_ADDR_BYTE
				      + FIL_ADDR_SIZE /*FLST_NEXT*/);
				break;
			case MLOG_4BYTES:
				/* Note that this can fail when the
				redo log has been written with something
				older than InnoDB Plugin 1.0.4. */
				ut_ad(0
				      /* fil_crypt_rotate_page() writes this */
				      || offs == FIL_PAGE_SPACE_ID
				      || srv_is_undo_tablespace(
					      page_id.space())
				      || offs == IBUF_TREE_SEG_HEADER
				      + IBUF_HEADER + FSEG_HDR_SPACE
				      || offs == IBUF_TREE_SEG_HEADER
				      + IBUF_HEADER + FSEG_HDR_PAGE_NO
				      || offs == PAGE_BTR_IBUF_FREE_LIST
				      + PAGE_HEADER/* flst_init */
				      || offs == PAGE_BTR_IBUF_FREE_LIST
				      + PAGE_HEADER + FIL_ADDR_PAGE
				      || offs == PAGE_BTR_IBUF_FREE_LIST
				      + PAGE_HEADER + FIL_ADDR_PAGE
				      + FIL_ADDR_SIZE
				      || offs == PAGE_BTR_SEG_LEAF
				      + PAGE_HEADER + FSEG_HDR_PAGE_NO
				      || offs == PAGE_BTR_SEG_LEAF
				      + PAGE_HEADER + FSEG_HDR_SPACE
				      || offs == PAGE_BTR_SEG_TOP
				      + PAGE_HEADER + FSEG_HDR_PAGE_NO
				      || offs == PAGE_BTR_SEG_TOP
				      + PAGE_HEADER + FSEG_HDR_SPACE
				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
				      + PAGE_HEADER + FIL_ADDR_PAGE
				      + 0 /*FLST_PREV*/
				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
				      + PAGE_HEADER + FIL_ADDR_PAGE
				      + FIL_ADDR_SIZE /*FLST_NEXT*/);
				break;
			}
		}
#endif /* UNIV_DEBUG */
		ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip);
		/* A 4-byte write to page 0 may have changed a tablespace
		header field that is cached in fil_space_t; refresh the
		cached copy from the page. */
		if (ptr && page && !page_id.page_no() && type == MLOG_4BYTES) {
			switch (ulint offs = mach_read_from_2(old_ptr)) {
				fil_space_t*	space;
				ulint		val;
			default:
				break;
			case FSP_HEADER_OFFSET + FSP_SPACE_FLAGS:
			case FSP_HEADER_OFFSET + FSP_SIZE:
			case FSP_HEADER_OFFSET + FSP_FREE_LIMIT:
			case FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN:
				space = fil_space_get(page_id.space());
				ut_a(space != NULL);
				val = mach_read_from_4(page + offs);

				switch (offs) {
				case FSP_HEADER_OFFSET + FSP_SPACE_FLAGS:
					space->flags = val;
					break;
				case FSP_HEADER_OFFSET + FSP_SIZE:
					space->size_in_header = val;
					break;
				case FSP_HEADER_OFFSET + FSP_FREE_LIMIT:
					space->free_limit = val;
					break;
				case FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN:
					space->free_len = val;
					ut_ad(val == flst_get_len(
						      page + offs));
					break;
				}
			}
		}
		break;
	case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type == MLOG_COMP_REC_INSERT,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = page_cur_parse_insert_rec(false, ptr, end_ptr,
							block, index, mtr);
		}
		break;
	case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type == MLOG_COMP_REC_CLUST_DELETE_MARK,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = btr_cur_parse_del_mark_set_clust_rec(
				ptr, end_ptr, page, page_zip, index, mtr);
		}
		break;
	case MLOG_REC_SEC_DELETE_MARK:
		ut_ad(!page || fil_page_type_is_index(page_type));
		ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr,
							 page, page_zip, mtr);
		break;
	case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type == MLOG_COMP_REC_UPDATE_IN_PLACE,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page,
							    page_zip, index);
		}
		break;
	case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE:
	case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type == MLOG_COMP_LIST_END_DELETE
				     || type == MLOG_COMP_LIST_START_DELETE,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = page_parse_delete_rec_list(type, ptr, end_ptr,
							 block, index, mtr);
		}
		break;
	case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type == MLOG_COMP_LIST_END_COPY_CREATED,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = page_parse_copy_rec_list_to_created_page(
				ptr, end_ptr, block, index, mtr);
		}
		break;
	case MLOG_PAGE_REORGANIZE:
	case MLOG_COMP_PAGE_REORGANIZE:
	case MLOG_ZIP_PAGE_REORGANIZE:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type != MLOG_PAGE_REORGANIZE,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = btr_parse_page_reorganize(
				ptr, end_ptr, index,
				type == MLOG_ZIP_PAGE_REORGANIZE,
				block, mtr);
		}
		break;
	case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE:
		/* Allow anything in page_type when creating a page. */
		ut_a(!page_zip);
		page_parse_create(block, type == MLOG_COMP_PAGE_CREATE, false);
		break;
	case MLOG_PAGE_CREATE_RTREE: case MLOG_COMP_PAGE_CREATE_RTREE:
		page_parse_create(block, type == MLOG_COMP_PAGE_CREATE_RTREE,
				  true);
		break;
	case MLOG_UNDO_INSERT:
		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
		ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page);
		break;
	case MLOG_UNDO_ERASE_END:
		if (page) {
			ut_ad(page_type == FIL_PAGE_UNDO_LOG);
			/* Zero-fill the page from the first free byte
			up to the page trailer. */
			uint16_t first_free = mach_read_from_2(
				TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + page);
			memset(page + first_free, 0,
			       (srv_page_size - FIL_PAGE_DATA_END)
			       - first_free);
		}
		break;
	case MLOG_UNDO_INIT:
		/* Allow anything in page_type when creating a page. */
		ptr = trx_undo_parse_page_init(ptr, end_ptr, page);
		break;
	case MLOG_UNDO_HDR_REUSE:
		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
		ptr = trx_undo_parse_page_header_reuse(ptr, end_ptr, page);
		break;
	case MLOG_UNDO_HDR_CREATE:
		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
		ptr = trx_undo_parse_page_header(ptr, end_ptr, block, mtr);
		break;
	case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK:
		ut_ad(!page || fil_page_type_is_index(page_type));
		/* On a compressed page, MLOG_COMP_REC_MIN_MARK
		will be followed by MLOG_COMP_REC_DELETE
		or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL)
		in the same mini-transaction. */
		ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip);
		ptr = btr_parse_set_min_rec_mark(
			ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK,
			block, mtr);
		break;
	case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE:
		ut_ad(!page || fil_page_type_is_index(page_type));

		if (NULL != (ptr = mlog_parse_index(
				     ptr, end_ptr,
				     type == MLOG_COMP_REC_DELETE,
				     &index))) {
			ut_a(!page
			     || (ibool)!!page_is_comp(page)
			     == dict_table_is_comp(index->table));
			ptr = page_cur_parse_delete_rec(ptr, end_ptr,
							block, index, mtr);
		}
		break;
	case MLOG_IBUF_BITMAP_INIT:
		/* Allow anything in page_type when creating a page. */
		if (block) ibuf_bitmap_init_apply(block);
		break;
	case MLOG_INIT_FILE_PAGE2:
		/* Allow anything in page_type when creating a page. */
		if (block) fsp_apply_init_file_page(block);
		break;
	case MLOG_INIT_FREE_PAGE:
		/* The page can be zero-filled and its previous
		contents can be ignored. We do not write or apply
		this record yet. */
		break;
	case MLOG_ZIP_WRITE_STRING:
		ut_ad(!page_zip
		      || !fil_page_get_type(page_zip->data)
		      || fil_page_get_type(page_zip->data) == FIL_PAGE_INDEX
		      || fil_page_get_type(page_zip->data) == FIL_PAGE_RTREE);
		/* Layout: 2 bytes offset, 2 bytes length, payload. */
		if (ptr + 4 > end_ptr) {
			goto truncated;
		} else {
			const ulint ofs = mach_read_from_2(ptr);
			const ulint len = mach_read_from_2(ptr + 2);
			if (ofs < FIL_PAGE_PREV || !len) {
				goto corrupted;
			}
			ptr += 4 + len;
			if (ptr > end_ptr) {
				goto truncated;
			}
			if (!page_zip) {
				break;
			}
			ut_ad(ofs + len <= block->zip_size());
			memcpy(page_zip->data + ofs, old_ptr + 4, len);
			if (ofs >= FIL_PAGE_TYPE +2
			    || ofs + len < FIL_PAGE_TYPE + 2) {
				break;
			}
			/* Ensure that buf_flush_init_for_writing()
			will treat the page as an index page, and
			not overwrite the compressed page with the
			contents of the uncompressed page. */
			memcpy_aligned<2>(&page[FIL_PAGE_TYPE],
					  &page_zip->data[FIL_PAGE_TYPE], 2);
		}
		break;
	case MLOG_WRITE_STRING:
		ut_ad(!page_zip
		      || fil_page_get_type(page_zip->data)
		      <= FIL_PAGE_TYPE_ZBLOB2);
		/* Only inspect the header and the payload once the
		whole candidate crypt_info record (4 header bytes plus
		11 + MY_AES_BLOCK_SIZE payload bytes) is known to be
		inside the buffer. An incomplete record is left for
		mlog_parse_string() to handle below, instead of
		interpreting bytes beyond end_ptr. */
		if (page_id.page_no()
		    || end_ptr < ptr + (4 + 11 + MY_AES_BLOCK_SIZE)
		    || mach_read_from_2(ptr + 2)
		    != 11 + MY_AES_BLOCK_SIZE) {
			/* Not writing crypt_info, or incomplete record */
		} else if (fil_space_t* space
			   = fil_space_acquire_silent(page_id.space())) {
			if (mach_read_from_2(ptr)
			    == FSP_HEADER_OFFSET + XDES_ARR_OFFSET + MAGIC_SZ
			    + space->physical_size() * XDES_SIZE
			    / FSP_EXTENT_SIZE
			    && (ptr[4] == CRYPT_SCHEME_UNENCRYPTED
				|| ptr[4] == CRYPT_SCHEME_1)
			    && ptr[5] == MY_AES_BLOCK_SIZE
			    && ptr[6 + MY_AES_BLOCK_SIZE + 4 + 4]
			    <= FIL_ENCRYPTION_OFF) {
				/* from fil_space_crypt_t::write_page0() */
				fil_crypt_parse(space, ptr + 4);
			}
			space->release();
		}
		ptr = mlog_parse_string(ptr, end_ptr, page, page_zip);
		break;
	case MLOG_ZIP_WRITE_NODE_PTR:
		ut_ad(!page || fil_page_type_is_index(page_type));
		ptr = page_zip_parse_write_node_ptr(ptr, end_ptr,
						    page, page_zip);
		break;
	case MLOG_ZIP_WRITE_BLOB_PTR:
		ut_ad(!page || fil_page_type_is_index(page_type));
		ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr,
						    page, page_zip);
		break;
	case MLOG_ZIP_WRITE_HEADER:
		ut_ad(!page || fil_page_type_is_index(page_type));
		ptr = page_zip_parse_write_header(ptr, end_ptr,
						  page, page_zip);
		break;
	case MLOG_ZIP_PAGE_COMPRESS:
		/* Allow anything in page_type when creating a page. */
		ptr = page_zip_parse_compress(ptr, end_ptr, block);
		break;
	case MLOG_ZIP_PAGE_COMPRESS_NO_DATA:
		if (NULL != (ptr = mlog_parse_index(
				ptr, end_ptr, TRUE, &index))) {

			ut_a(!page || ((ibool)!!page_is_comp(page)
				== dict_table_is_comp(index->table)));

			/* One more byte (passed to page_zip_compress())
			must follow the index information. */
			if (end_ptr == ptr) {
				ptr = NULL;
				break;
			}

			if (page &&
			    !page_zip_compress(block, index, *ptr, mtr)) {
				ut_error;
			}

			ptr++;
		}
		break;
	case MLOG_ZIP_WRITE_TRX_ID:
		/* This must be a clustered index leaf page. */
		ut_ad(!page || page_type == FIL_PAGE_INDEX);
		ptr = page_zip_parse_write_trx_id(ptr, end_ptr,
						  page, page_zip);
		break;
	case MLOG_FILE_WRITE_CRYPT_DATA:
		dberr_t err;
		ptr = fil_parse_write_crypt_data(ptr, end_ptr, &err);

		if (err != DB_SUCCESS) {
			recv_sys.found_corrupt_log = TRUE;
		}
		break;
	default:
		ib::error() << "Incorrect log record type "
			<< ib::hex(unsigned(type));
corrupted:
		recv_sys.found_corrupt_log = true;
truncated:
		ptr = NULL;
	}

	/* Release the dummy index and table, if any were created. */
	if (index) {
		dict_table_t*	table = index->table;

		dict_mem_index_free(index);
		dict_mem_table_free(table);
	}

	return(ptr);
}
/** Buffer a redo log record so that it can be applied to its page later.
@param type      record type
@param page_id   page identifier
@param body      record body
@param rec_end   end of record
@param lsn       start LSN of the mini-transaction
@param end_lsn   end LSN of the mini-transaction */
inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id,
                            const byte* body, const byte* rec_end, lsn_t lsn,
                            lsn_t end_lsn)
{
  ut_ad(type != MLOG_FILE_DELETE);
  ut_ad(type != MLOG_FILE_CREATE2);
  ut_ad(type != MLOG_FILE_RENAME2);
  ut_ad(type != MLOG_FILE_NAME);
  ut_ad(type != MLOG_DUMMY_RECORD);
  ut_ad(type != MLOG_CHECKPOINT);
  ut_ad(type != MLOG_INDEX_LOAD);
  ut_ad(type != MLOG_TRUNCATE);

  /* Look up the per-page record list, creating it if necessary. */
  const std::pair<map::iterator, bool> inserted=
    pages.insert(map::value_type(page_id, page_recv_t()));
  page_recv_t &page_recs= inserted.first->second;
  ut_ad(inserted.second == page_recs.log.empty());

  /* These record types (re)initialize the page: earlier records are
  dropped, and the initialization is registered in mlog_init. */
  if (type == MLOG_INIT_FILE_PAGE2 ||
      type == MLOG_ZIP_PAGE_COMPRESS ||
      type == MLOG_INIT_FREE_PAGE)
  {
    page_recs.will_not_read();
    mlog_init.add(page_id, lsn);
  }

  size_t remaining= static_cast<size_t>(rec_end - body);

  recv_t *rec= new (alloc(sizeof *rec, true))
    recv_t(static_cast<uint32_t>(remaining), type, lsn, end_lsn);
  page_recs.log.append(rec);

  /* Store the log record body in limited-size chunks, because the
  heap grows into the buffer pool. At least one chunk is always
  created, even for an empty body. */
  recv_t::data_t *last= nullptr;
  do
  {
    const size_t chunk= std::min(remaining,
                                 get_free_len() - sizeof(recv_t::data));
    recv_t::data_t *d= new (alloc(sizeof(recv_t::data) + chunk))
      recv_t::data_t(body, chunk);

    if (last)
      last->append(d);
    else
      rec->data= d;
    last= d;

    body+= chunk;
    remaining-= chunk;
  }
  while (remaining);
}
/** Discard the leading log records of a page that are older than a
given LSN.
@param start_lsn oldest log sequence number to preserve
@return whether all the log for the page was trimmed */
inline bool page_recv_t::recs_t::trim(lsn_t start_lsn)
{
  for (;;)
  {
    log_rec_t *rec= head;
    if (!rec)
      break;
    if (rec->lsn >= start_lsn)
      /* The first record to preserve was reached. */
      return false;
    log_rec_t *following= rec->next;
    static_cast<const recv_t*>(rec)->free();
    head= following;
  }

  /* Every record was freed; the list is now empty. */
  tail= nullptr;
  return true;
}
inline void page_recv_t::recs_t::clear()
{
ut_ad(mutex_own(&recv_sys.mutex));
for (const log_rec_t *l= head; l; )
{
const log_rec_t *next= l->next;
static_cast<const recv_t*>(l)->free();
l= next;
}
head= tail= nullptr;
}
/** Mark the page as one that will not be read from the data file,
and discard any redo log records buffered for it so far. */
inline void page_recv_t::will_not_read()
{
  ut_ad(state == RECV_NOT_PROCESSED || state == RECV_WILL_NOT_READ);
  log.clear();
  state= RECV_WILL_NOT_READ;
}
/** Copy the body of a log record, which is stored as a chain of chunks,
into one contiguous buffer.
@param[out]	buf	buffer of length at least recv.len
@param[in]	recv	log record */
static ATTRIBUTE_COLD
void
recv_data_copy_to_buf(byte* buf, const recv_t& recv)
{
	ulint			remaining = recv.len;
	const recv_t::data_t*	chunk = recv.data;

	for (;;) {
		/* The payload of a chunk starts right after its header
		and extends at most to the end of the page-sized area
		that the chunk resides in. */
		ulint		offset = page_offset(chunk + 1);
		const ulint	room = (srv_page_size - offset);
		const ulint	n = std::min(remaining, room);

		memcpy(buf, reinterpret_cast<const byte*>(chunk + 1), n);

		chunk = chunk->next;
		buf += n;
		remaining -= n;

		if (!remaining) {
			break;
		}
	}
}
/** Apply the hashed log records to the page, if the page lsn is less than the
lsn of a log record.
The caller must hold recv_sys.mutex. The mutex is released while the
records are being applied and is held again when this function returns.
The entry p is marked RECV_BEING_PROCESSED; it is not removed from
recv_sys.pages here.
@param[in,out] block buffer pool page
@param[in,out] mtr mini-transaction; committed before returning
@param[in,out] p recovery address
@param[in,out] init page initialization operation, or NULL */
static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
const recv_sys_t::map::iterator& p,
mlog_init_t::init* init = NULL)
{
page_t* page;
page_zip_des_t* page_zip;
ut_ad(mutex_own(&recv_sys.mutex));
ut_ad(recv_sys.apply_log_recs);
ut_ad(recv_needed_recovery);
ut_ad(!init || init->created);
ut_ad(!init || init->lsn);
ut_ad(block->page.id == p->first);
ut_ad(!p->second.is_being_processed());
if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
ib::info() << "Applying log to page " << block->page.id;
}
DBUG_LOG("ib_log", "Applying log to page " << block->page.id);
/* Claim the page before releasing the mutex. */
p->second.state = page_recv_t::RECV_BEING_PROCESSED;
mutex_exit(&recv_sys.mutex);
page = block->frame;
page_zip = buf_block_get_page_zip(block);
/* The LSN stored on the page before any record is applied. */
const lsn_t page_lsn = mach_read_from_8(page + FIL_PAGE_LSN);
bool free_page = false;
/* start_lsn stays 0 unless at least one record other than
MLOG_INIT_FREE_PAGE is applied. */
lsn_t start_lsn = 0, end_lsn = 0;
ut_d(lsn_t recv_start_lsn = 0);
const lsn_t init_lsn = init ? init->lsn : 0;
for (const log_rec_t* l : p->second.log) {
ut_ad(l->lsn);
ut_ad(end_lsn <= l->lsn);
end_lsn = l->lsn;
ut_ad(end_lsn <= log_sys.log.scanned_lsn);
const recv_t* recv = static_cast<const recv_t*>(l);
ut_ad(recv->start_lsn);
ut_ad(recv_start_lsn < recv->start_lsn);
ut_d(recv_start_lsn = recv->start_lsn);
if (recv->start_lsn < page_lsn) {
/* Ignore this record, because there are later changes
for this page. */
DBUG_LOG("ib_log", "apply skip "
<< get_mlog_string(recv->type)
<< " LSN " << recv->start_lsn << " < "
<< page_lsn);
} else if (recv->start_lsn < init_lsn) {
/* Ignore this record, because it is older than the
page initialization operation. */
DBUG_LOG("ib_log", "init skip "
<< get_mlog_string(recv->type)
<< " LSN " << recv->start_lsn << " < "
<< init_lsn);
} else {
if (recv->type == MLOG_INIT_FREE_PAGE) {
/* This does not really modify the page. */
free_page = true;
} else if (!start_lsn) {
start_lsn = recv->start_lsn;
}
if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
ib::info() << "apply " << recv->start_lsn
<< ":" << recv->type
<< " len " << recv->len
<< " page " << block->page.id;
}
DBUG_LOG("ib_log", "apply " << recv->start_lsn << ": "
<< get_mlog_string(recv->type)
<< " len " << recv->len
<< " page " << block->page.id);
ulint data_offset = page_offset(recv->data + 1);
byte* buf;
const byte* recs;
if (srv_page_size - data_offset < recv->len) {
/* We have to copy the record body to
a separate buffer */
recs = buf = static_cast<byte*>
(ut_malloc_nokey(recv->len));
recv_data_copy_to_buf(buf, *recv);
} else {
/* The record body is used in place. */
buf = NULL;
recs = reinterpret_cast<const byte*>
(recv->data + 1);
}
/* NOTE(review): the return value (NULL for an incomplete
or corrupted record) is not checked here. */
recv_parse_or_apply_log_rec_body(
recv->type, recs, recs + recv->len,
block->page.id, true, block, &mtr);
ut_free(buf);
/* Stamp the page header and trailer with the LSN
computed for this record. */
end_lsn = recv->start_lsn + recv->len;
mach_write_to_8(FIL_PAGE_LSN + page, end_lsn);
mach_write_to_8(srv_page_size
- FIL_PAGE_END_LSN_OLD_CHKSUM
+ page, end_lsn);
if (UNIV_LIKELY_NULL(page_zip)) {
/* Keep the LSN of the compressed copy in sync. */
memcpy_aligned<8>(FIL_PAGE_LSN
+ page_zip->data,
FIL_PAGE_LSN + page, 8);
if (fil_page_index_page_check(page)
&& !page_zip_decompress(page_zip, page,
true)) {
ib::error() << "corrupted page "
<< block->page.id;
}
}
}
}
#ifdef UNIV_ZIP_DEBUG
ut_ad(!fil_page_index_page_check(page)
|| !page_zip
|| page_zip_validate_low(page_zip, page, NULL, FALSE));
#endif /* UNIV_ZIP_DEBUG */
if (start_lsn) {
log_flush_order_mutex_enter();
buf_flush_note_modification(block, start_lsn, end_lsn);
log_flush_order_mutex_exit();
} else if (free_page && init) {
/* There have been no operations other than MLOG_INIT_FREE_PAGE.
Any buffered changes must not be merged. A subsequent
buf_page_create() from a user thread should discard
any buffered changes. */
init->created = false;
ut_ad(!mtr.has_modifications());
}
/* Make sure that committing mtr does not change the modification
lsn values of page */
mtr.discard_modifications();
mtr.commit();
/* Read the clock before re-acquiring recv_sys.mutex. */
time_t now = time(NULL);
mutex_enter(&recv_sys.mutex);
/* Note that this compares against the LSN that was on the page
before the log records were applied. */
if (recv_max_page_lsn < page_lsn) {
recv_max_page_lsn = page_lsn;
}
ut_ad(p->second.is_being_processed());
ut_ad(!recv_sys.pages.empty());
/* Progress reporting, when recv_sys.report() asks for it. */
if (recv_sys.report(now)) {
const ulint n = recv_sys.pages.size();
ib::info() << "To recover: " << n << " pages from log";
service_manager_extend_timeout(
INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
}
}
/** Drop all buffered redo log records of a corrupted page.
This function should only be called when innodb_force_recovery is set.
@param page_id corrupted page identifier */
ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id)
{
  mutex_enter(&mutex);

  const map::iterator found= pages.find(page_id);
  const bool have_records= found != pages.end();

  if (have_records)
  {
    /* Free the records before removing the map entry. */
    found->second.log.clear();
    pages.erase(found);
  }

  mutex_exit(&mutex);
}
/** Apply the buffered redo log, if any, to a page that has just been
read from a data file.
@param[in,out]	bpage	buffer pool page */
void recv_recover_page(buf_page_t* bpage)
{
	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

	mtr_t	mtr;

	mtr.start();
	mtr.set_log_mode(MTR_LOG_NONE);

	buf_block_t* const	block = reinterpret_cast<buf_block_t*>(bpage);

	/* Move the ownership of the x-latch on the page to
	this OS thread, so that we can acquire a second
	x-latch on it.  This is needed for the operations to
	the page to pass the debug checks. */
	rw_lock_x_lock_move_ownership(&block->lock);
	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
	rw_lock_x_lock(&block->lock);
	mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);

	/* Set when the page-level overload has run; that overload
	commits the mini-transaction itself. */
	bool	applied = false;

	mutex_enter(&recv_sys.mutex);

	if (recv_sys.apply_log_recs) {
		const recv_sys_t::map::iterator	it
			= recv_sys.pages.find(bpage->id);

		if (it != recv_sys.pages.end()
		    && !it->second.is_being_processed()) {
			recv_recover_page(block, mtr, it);
			it->second.log.clear();
			recv_sys.pages.erase(it);
			applied = true;
		}
	}

	if (!applied) {
		/* Nothing to apply: just release the latch. */
		mtr.commit();
	}

	mutex_exit(&recv_sys.mutex);
	ut_ad(mtr.has_committed());
}
/** Request reads of the pages that have buffered log records and that
lie in the read-ahead area around a given page number.
The caller holds recv_sys.mutex; it is released for the duration of
the read request and re-acquired afterwards.
@param[in]	page_id	page id */
static void recv_read_in_area(page_id_t page_id)
{
	compile_time_assert(ut_is_2pow(RECV_READ_AHEAD_AREA));

	ulint	page_nos[RECV_READ_AHEAD_AREA];
	ulint	n_pages = 0;

	/* Round down to the start of the read-ahead area. */
	page_id.set_page_no(ut_2pow_round(page_id.page_no(),
					  RECV_READ_AHEAD_AREA));
	const ulint	up_limit = page_id.page_no() + RECV_READ_AHEAD_AREA;

	recv_sys_t::map::iterator	it
		= recv_sys.pages.lower_bound(page_id);

	while (it != recv_sys.pages.end()
	       && it->first.space() == page_id.space()
	       && it->first.page_no() < up_limit) {
		page_recv_t&	recs = it->second;

		/* Pick only unprocessed pages that are not found by
		buf_page_hash_get(). */
		if (recs.state == page_recv_t::RECV_NOT_PROCESSED
		    && !buf_page_hash_get(it->first)) {
			recs.state = page_recv_t::RECV_BEING_READ;
			page_nos[n_pages++] = it->first.page_no();
		}

		++it;
	}

	if (!n_pages) {
		return;
	}

	mutex_exit(&recv_sys.mutex);
	buf_read_recv_pages(FALSE, page_id.space(), page_nos, n_pages);
	mutex_enter(&recv_sys.mutex);
}
/** Apply recv_sys.pages to persistent data pages.
Waits for any batch that is already in progress, then walks
recv_sys.pages, applying the buffered records to each page (or
requesting a read of the page), and finally waits until the map
has been emptied. Returns early, without completing the batch, if
a corrupted log or file system is flagged.
@param[in] last_batch whether redo log writes are possible */
void recv_apply_hashed_log_recs(bool last_batch)
{
ut_ad(srv_operation == SRV_OPERATION_NORMAL
|| srv_operation == SRV_OPERATION_RESTORE
|| srv_operation == SRV_OPERATION_RESTORE_EXPORT);
mutex_enter(&recv_sys.mutex);
/* Wait for a batch that is already running, polling every 0.5 s. */
while (recv_sys.apply_batch_on) {
bool abort = recv_sys.found_corrupt_log;
mutex_exit(&recv_sys.mutex);
if (abort) {
return;
}
os_thread_sleep(500000);
mutex_enter(&recv_sys.mutex);
}
/* Unless this is the last batch, the caller holds the log mutex. */
ut_ad(!last_batch == log_mutex_own());
recv_no_ibuf_operations = !last_batch
|| srv_operation == SRV_OPERATION_RESTORE
|| srv_operation == SRV_OPERATION_RESTORE_EXPORT;
ut_d(recv_no_log_write = recv_no_ibuf_operations);
mtr_t mtr;
if (recv_sys.pages.empty()) {
goto done;
}
if (!log_sys.log.subformat && !srv_force_recovery
&& srv_undo_tablespaces_open) {
ib::error() << "Recovery of separately logged"
" TRUNCATE operations is no longer supported."
" Set innodb_force_recovery=1"
" if no *trunc.log files exist";
recv_sys.found_corrupt_log = true;
mutex_exit(&recv_sys.mutex);
return;
} else {
const char* msg = last_batch
? "Starting final batch to recover "
: "Starting a batch to recover ";
const ulint n = recv_sys.pages.size();
ib::info() << msg << n << " pages from redo log.";
sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log",
msg, n);
}
recv_sys.apply_log_recs = true;
recv_sys.apply_batch_on = true;
/* Trim the log of undo tablespaces for which a truncation
LSN was recorded. */
for (ulint id = srv_undo_tablespaces_open; id--;) {
const recv_sys_t::trunc& t= recv_sys.truncated_undo_spaces[id];
if (t.lsn) {
recv_sys.trim(page_id_t(id + srv_undo_space_id_start,
t.pages), t.lsn);
}
}
for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
p != recv_sys.pages.end();) {
const page_id_t page_id = p->first;
page_recv_t& recs = p->second;
ut_ad(!recs.log.empty());
switch (recs.state) {
case page_recv_t::RECV_BEING_READ:
case page_recv_t::RECV_BEING_PROCESSED:
/* The page is already being handled; move on. */
p++;
continue;
case page_recv_t::RECV_NOT_PROCESSED:
apply:
mtr.start();
mtr.set_log_mode(MTR_LOG_NONE);
if (buf_block_t* block = buf_page_get_gen(
page_id, 0, RW_X_LATCH, NULL,
BUF_GET_IF_IN_POOL,
__FILE__, __LINE__, &mtr, NULL)) {
/* The page is in the buffer pool: apply right away. */
buf_block_dbg_add_level(
block, SYNC_NO_ORDER_CHECK);
recv_recover_page(block, mtr, p);
ut_ad(mtr.has_committed());
} else {
/* The page is not in the buffer pool: request reads
for it and its neighbours. */
mtr.commit();
recv_read_in_area(page_id);
break;
}
ignore:
{
/* Advance p before erasing the current entry. */
recv_sys_t::map::iterator r = p++;
r->second.log.clear();
recv_sys.pages.erase(r);
}
continue;
case page_recv_t::RECV_WILL_NOT_READ:
mlog_init_t::init& i = mlog_init.last(page_id);
const lsn_t end_lsn = recs.log.last()->lsn;
if (end_lsn < i.lsn) {
DBUG_LOG("ib_log", "skip log for page "
<< page_id
<< " LSN " << end_lsn
<< " < " << i.lsn);
goto ignore;
}
fil_space_t* space = fil_space_acquire_for_io(
page_id.space());
if (!space) {
goto ignore;
}
if (space->enable_lsn) {
do_read:
/* Fall back to the regular path that fetches the page. */
space->release_for_io();
recs.state = page_recv_t::RECV_NOT_PROCESSED;
goto apply;
}
/* Determine if a tablespace could be
for an internal table for FULLTEXT INDEX.
For those tables, no MLOG_INDEX_LOAD record
used to be written when redo logging was
disabled. Hence, we cannot optimize
away page reads when crash-upgrading
from MariaDB versions before 10.4,
because all the redo log records for
initializing and modifying the page in
the past could be older than the page
in the data file.
The check is too broad, causing all
tables whose names start with FTS_ to
skip the optimization. */
if ((log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
!= log_t::FORMAT_10_4
&& strstr(space->name, "/FTS_")) {
goto do_read;
}
mtr.start();
mtr.set_log_mode(MTR_LOG_NONE);
buf_block_t* block = buf_page_create(
page_id, space->zip_size(), &mtr);
/* Look the page up again after buf_page_create(). */
p = recv_sys.pages.find(page_id);
if (p == recv_sys.pages.end()) {
/* The page happened to exist
in the buffer pool, or it was
just being read in. Before
buf_page_get_with_no_latch()
returned, all changes must have
been applied to the page already. */
/* NOTE(review): the call made above is buf_page_create();
the function named in the preceding comment does not
appear in this block. */
mtr.commit();
} else {
ut_ad(&recs == &p->second);
i.created = true;
buf_block_dbg_add_level(
block, SYNC_NO_ORDER_CHECK);
mtr.x_latch_at_savepoint(0, block);
recv_recover_page(block, mtr, p, &i);
ut_ad(mtr.has_committed());
p->second.log.clear();
recv_sys.pages.erase(p);
}
space->release_for_io();
}
/* Re-position the iterator by key: recv_read_in_area() and
recv_recover_page() release recv_sys.mutex temporarily. */
p = recv_sys.pages.lower_bound(page_id);
}
/* Wait until all the pages have been processed */
while (!recv_sys.pages.empty()) {
const bool abort = recv_sys.found_corrupt_log
|| recv_sys.found_corrupt_fs;
if (recv_sys.found_corrupt_fs && !srv_force_recovery) {
ib::info() << "Set innodb_force_recovery=1"
" to ignore corrupted pages.";
}
mutex_exit(&(recv_sys.mutex));
if (abort) {
return;
}
os_thread_sleep(500000);
mutex_enter(&(recv_sys.mutex));
}
done:
if (!last_batch) {
/* Flush all the file pages to disk and invalidate them in
the buffer pool */
mutex_exit(&(recv_sys.mutex));
log_mutex_exit();
/* Stop the recv_writer thread from issuing any LRU
flush batches. */
mutex_enter(&recv_sys.writer_mutex);
/* Wait for any currently run batch to end. */
buf_flush_wait_LRU_batch_end();
/* Request a BUF_FLUSH_LIST batch and wait for it to finish. */
os_event_reset(recv_sys.flush_end);
recv_sys.flush_type = BUF_FLUSH_LIST;
os_event_set(recv_sys.flush_start);
os_event_wait(recv_sys.flush_end);
buf_pool_invalidate();
/* Allow batches from recv_writer thread. */
mutex_exit(&recv_sys.writer_mutex);
/* Re-acquire the mutexes in the order log mutex, recv_sys.mutex. */
log_mutex_enter();
mutex_enter(&(recv_sys.mutex));
mlog_init.reset();
} else if (!recv_no_ibuf_operations) {
/* We skipped this in buf_page_create(). */
mlog_init.mark_ibuf_exist(mtr);
}
ut_d(recv_sys.after_apply= true;);
recv_sys.clear();
mutex_exit(&recv_sys.mutex);
}
/** Try to parse one redo log record from the parsing buffer.
The one-byte records MLOG_MULTI_REC_END and MLOG_DUMMY_RECORD and the
fixed-size MLOG_CHECKPOINT are recognized directly; every other record
is passed to mlog_parse_initial_log_record() and then to
recv_parse_or_apply_log_rec_body().
@param[out]	type		log record type
@param[in]	ptr		pointer to a buffer
@param[in]	end_ptr		end of the buffer
@param[out]	space		tablespace identifier
@param[out]	page_no		page number
@param[in]	apply		whether to apply MLOG_FILE_* records
@param[out]	body		start of log record body
@return length of the record
@retval 0 if the record was not complete, or if an invalid type byte
was found (then recv_sys.found_corrupt_log is also set) */
static
ulint
recv_parse_log_rec(
	mlog_id_t*	type,
	const byte*	ptr,
	const byte*	end_ptr,
	ulint*		space,
	ulint*		page_no,
	bool		apply,
	const byte**	body)
{
	*body = NULL;

	UNIV_MEM_INVALID(type, sizeof *type);
	UNIV_MEM_INVALID(space, sizeof *space);
	UNIV_MEM_INVALID(page_no, sizeof *page_no);
	UNIV_MEM_INVALID(body, sizeof *body);

	if (ptr == end_ptr) {
		/* Nothing left in the buffer. */
		return 0;
	}

	switch (*ptr) {
	case MLOG_MULTI_REC_END:
	case MLOG_DUMMY_RECORD:
		/* The type byte is the whole record. */
		*type = static_cast<mlog_id_t>(*ptr);
		return 1;
	case MLOG_CHECKPOINT:
		if (end_ptr >= ptr + SIZE_OF_MLOG_CHECKPOINT) {
			*type = static_cast<mlog_id_t>(*ptr);
			return SIZE_OF_MLOG_CHECKPOINT;
		}
		/* The record is not completely in the buffer yet. */
		return 0;
	case MLOG_MULTI_REC_END | MLOG_SINGLE_REC_FLAG:
	case MLOG_DUMMY_RECORD | MLOG_SINGLE_REC_FLAG:
	case MLOG_CHECKPOINT | MLOG_SINGLE_REC_FLAG:
		/* These types must never carry the single-record flag. */
		ib::error() << "Incorrect log record type "
			<< ib::hex(unsigned(*ptr));
		recv_sys.found_corrupt_log = true;
		return 0;
	}

	const byte* const	body_start = mlog_parse_initial_log_record(
		ptr, end_ptr, type, space, page_no);

	*body = body_start;

	if (UNIV_UNLIKELY(!body_start)) {
		return 0;
	}

	const byte* const	rec_end = recv_parse_or_apply_log_rec_body(
		*type, body_start, end_ptr, page_id_t(*space, *page_no),
		apply, NULL, NULL);

	if (UNIV_UNLIKELY(rec_end == NULL)) {
		return 0;
	}

	const ulint	rec_len = ulint(rec_end - ptr);

	if (*page_no != 0 || *type != MLOG_4BYTES
	    || !apply
	    || mach_read_from_2(body_start)
	    != FSP_HEADER_OFFSET + FSP_SIZE) {
		return rec_len;
	}

	/* This is a 4-byte write to the FSP_SIZE field on page 0:
	remember the size that was written. The value follows the
	2-byte page offset in the record body. */
	const byte*	size_ptr = body_start + 2;
	const ulint	size = mach_parse_compressed(&size_ptr, end_ptr);

	recv_spaces_t::iterator	it = recv_spaces.find(*space);

	ut_ad(!recv_sys.mlog_checkpoint_lsn
	      || *space == TRX_SYS_SPACE
	      || srv_is_undo_tablespace(*space)
	      || it != recv_spaces.end());

	if (it != recv_spaces.end() && !it->second.space) {
		it->second.size = size;
	}

	fil_space_set_recv_size(*space, size);

	return rec_len;
}
/** Calculate the new value for lsn when more data is added to the log.
For every log block payload that the added data fills up, the bytes of
a log block that are not payload are added on top of the data length.
@param[in]	lsn	old lsn
@param[in]	len	this many bytes of data is added,
			log block headers not included
@return the lsn after the data has been added */
static
lsn_t
recv_calc_lsn_on_data_add(
	lsn_t		lsn,
	ib_uint64_t	len)
{
	/* Payload bytes already consumed in the block that lsn points to */
	const unsigned	used_payload
		= (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
	const unsigned	block_payload = log_sys.payload_size();

	ut_ad(used_payload < block_payload);

	const lsn_t	blocks_filled = (len + used_payload) / block_payload;
	const lsn_t	overhead = blocks_filled
		* (OS_FILE_LOG_BLOCK_SIZE - block_payload);

	return lsn + len + overhead;
}
/** Prints diagnostic info of corrupt log.
@param[in] ptr pointer to corrupt log record
@param[in] type type of the log record (could be garbage)
@param[in] space tablespace ID (could be garbage)
@param[in] page_no page number (could be garbage)
@return whether processing should continue */
static
bool
recv_report_corrupt_log(
const byte* ptr,
int type,
ulint space,
ulint page_no)
{
ib::error() <<
"############### CORRUPT LOG RECORD FOUND ##################";
const ulint ptr_offset = ulint(ptr - recv_sys.buf);
ib::info() << "Log record type " << type << ", page " << space << ":"
<< page_no << ". Log parsing proceeded successfully up to "
<< recv_sys.recovered_lsn << ". Previous log record type "
<< recv_previous_parsed_rec_type << ", is multi "
<< recv_previous_parsed_rec_is_multi << " Recv offset "
<< ptr_offset << ", prev "
<< recv_previous_parsed_rec_offset;
ut_ad(ptr <= recv_sys.buf + recv_sys.len);
const ulint limit = 100;
const ulint prev_offset = std::min(recv_previous_parsed_rec_offset,
ptr_offset);
const ulint before = std::min(prev_offset, limit);
const ulint after = std::min(recv_sys.len - ptr_offset, limit);
ib::info() << "Hex dump starting " << before << " bytes before and"
" ending " << after << " bytes after the corrupted record:";
const byte* start = recv_sys.buf + prev_offset - before;
ut_print_buf(stderr, start, ulint(ptr - start) + after);
putc('\n', stderr);
if (!srv_force_recovery) {
ib::info() << "Set innodb_force_recovery to ignore this error.";
return(false);
}
ib::warn() << "The log file may have been corrupt and it is possible"
" that the log scan did not proceed far enough in recovery!"
" Please run CHECK TABLE on your InnoDB tables to check"
" that they are ok! If mysqld crashes after this recovery; "
<< FORCE_RECOVERY_MSG;
return(true);
}
/** Handle a MLOG_INDEX_LOAD record: pass the LSN to the recv_spaces
entry of the tablespace (if there is one), and invoke the
log_optimized_ddl_op callback (if one is set).
@param[in]	space_id	tablespace id
@param[in]	page_no		page number (not used here)
@param[in]	lsn		log sequence number */
ATTRIBUTE_COLD static void
recv_mlog_index_load(ulint space_id, ulint page_no, lsn_t lsn)
{
	const recv_spaces_t::iterator	known = recv_spaces.find(space_id);

	if (known != recv_spaces.end()) {
		(*known).second.mlog_index_load(lsn);
	}

	if (!log_optimized_ddl_op) {
		return;
	}

	log_optimized_ddl_op(space_id);
}
/** Check whether the number of buffered redo log blocks has reached
max_log_blocks. If it has, switch *store to STORE_NO; when *store was
STORE_YES, remember recovered_lsn in last_stored_lsn first.
@param[in,out]	store	whether to store page operations
@return whether the memory is exhausted
@retval false	if *store is already STORE_NO */
inline bool recv_sys_t::is_memory_exhausted(store_t *store)
{
  const bool exhausted= *store != STORE_NO &&
    UT_LIST_GET_LEN(blocks) >= max_log_blocks;

  if (exhausted)
  {
    if (*store == STORE_YES)
      last_stored_lsn= recovered_lsn;
    *store= STORE_NO;
    DBUG_PRINT("ib_log",("Ran out of memory and last stored lsn " LSN_PF
                         " last stored offset " ULINTPF "\n",
                         recovered_lsn, recovered_offset));
  }

  return exhausted;
}
/** Parse log records from a buffer and optionally store them to a
hash table to wait merging to file pages.

The buffer recv_sys.buf is consumed from recv_sys.recovered_offset.
A record that is MLOG_CHECKPOINT, MLOG_DUMMY_RECORD or carries
MLOG_SINGLE_REC_FLAG is handled on its own. Anything else is treated as
the start of a multi-record mini-transaction, which is first scanned up
to MLOG_MULTI_REC_END without applying anything, and only then parsed a
second time to apply and store its records.
@param[in]	checkpoint_lsn	the LSN of the latest checkpoint
@param[in]	store		whether to store page operations
@param[in]	apply		whether to apply the records
@return whether MLOG_CHECKPOINT record was seen the first time,
or corruption was noticed */
bool recv_parse_log_recs(lsn_t checkpoint_lsn, store_t* store, bool apply)
{
	bool		single_rec;
	ulint		len;
	lsn_t		new_recovered_lsn;
	lsn_t		old_lsn;
	/* recv_parse_log_rec() does not assign its output parameters on
	every failure path (for example, when it rejects an invalid type
	byte), but the values are passed on to recv_report_corrupt_log().
	Initialize them, so that the diagnostics never read
	indeterminate values. */
	mlog_id_t	type = MLOG_SINGLE_REC_FLAG;
	ulint		space = 0;
	ulint		page_no = 0;
	const byte*	body;
	const bool	last_phase = (*store == STORE_IF_EXISTS);

	ut_ad(log_mutex_own());
	ut_ad(mutex_own(&recv_sys.mutex));
	ut_ad(recv_sys.parse_start_lsn != 0);
loop:
	const byte* ptr = recv_sys.buf + recv_sys.recovered_offset;
	const byte* end_ptr = recv_sys.buf + recv_sys.len;

	if (ptr == end_ptr) {
		return(false);
	}

	/* Check for memory overflow and ignore the parsing of remaining
	redo log records if InnoDB ran out of memory */
	if (recv_sys.is_memory_exhausted(store) && last_phase) {
		return false;
	}

	switch (*ptr) {
	case MLOG_CHECKPOINT:
	case MLOG_DUMMY_RECORD:
		single_rec = true;
		break;
	default:
		single_rec = !!(*ptr & MLOG_SINGLE_REC_FLAG);
	}

	if (single_rec) {
		/* The mtr did not modify multiple pages */

		old_lsn = recv_sys.recovered_lsn;

		/* Try to parse a log record, fetching its type, space id,
		page no, and a pointer to the body of the log record */

		len = recv_parse_log_rec(&type, ptr, end_ptr, &space,
					 &page_no, apply, &body);

		if (recv_sys.found_corrupt_log) {
			recv_report_corrupt_log(ptr, type, space, page_no);
			return(true);
		}

		if (recv_sys.found_corrupt_fs) {
			return(true);
		}

		if (len == 0) {
			/* The record is not completely in the buffer. */
			return(false);
		}

		new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len);

		if (new_recovered_lsn > recv_sys.scanned_lsn) {
			/* The log record filled a log block, and we require
			that also the next log block should have been scanned
			in */

			return(false);
		}

		recv_previous_parsed_rec_type = type;
		recv_previous_parsed_rec_offset = recv_sys.recovered_offset;
		recv_previous_parsed_rec_is_multi = 0;

		recv_sys.recovered_offset += len;
		recv_sys.recovered_lsn = new_recovered_lsn;

		switch (type) {
			lsn_t	lsn;
		case MLOG_DUMMY_RECORD:
			/* Do nothing */
			break;
		case MLOG_CHECKPOINT:
			compile_time_assert(SIZE_OF_MLOG_CHECKPOINT == 1 + 8);
			/* The checkpoint LSN follows the type byte. */
			lsn = mach_read_from_8(ptr + 1);

			if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
				fprintf(stderr,
					"MLOG_CHECKPOINT(" LSN_PF ") %s at "
					LSN_PF "\n", lsn,
					lsn != checkpoint_lsn ? "ignored"
					: recv_sys.mlog_checkpoint_lsn
					? "reread" : "read",
					recv_sys.recovered_lsn);
			}

			DBUG_PRINT("ib_log",
				   ("MLOG_CHECKPOINT(" LSN_PF ") %s at "
				    LSN_PF,
				    lsn,
				    lsn != checkpoint_lsn ? "ignored"
				    : recv_sys.mlog_checkpoint_lsn
				    ? "reread" : "read",
				    recv_sys.recovered_lsn));

			if (lsn == checkpoint_lsn) {
				if (recv_sys.mlog_checkpoint_lsn) {
					/* Seen before: this is a rescan. */
					ut_ad(recv_sys.mlog_checkpoint_lsn
					      <= recv_sys.recovered_lsn);
					break;
				}
				/* First time: report it to the caller. */
				recv_sys.mlog_checkpoint_lsn
					= recv_sys.recovered_lsn;
				return(true);
			}
			break;
		default:
			switch (*store) {
			case STORE_NO:
				break;
			case STORE_IF_EXISTS:
				if (!fil_space_get_size(space)) {
					break;
				}
				/* fall through */
			case STORE_YES:
				recv_sys.add(
					type, page_id_t(space, page_no), body,
					ptr + len, old_lsn,
					recv_sys.recovered_lsn);
			}
			/* fall through */
		case MLOG_INDEX_LOAD:
			if (type == MLOG_INDEX_LOAD) {
				recv_mlog_index_load(space, page_no, old_lsn);
			}
			/* fall through */
		case MLOG_FILE_NAME:
		case MLOG_FILE_DELETE:
		case MLOG_FILE_CREATE2:
		case MLOG_FILE_RENAME2:
		case MLOG_TRUNCATE:
			/* These were already handled by
			recv_parse_log_rec() and
			recv_parse_or_apply_log_rec_body(). */
			DBUG_PRINT("ib_log",
				("scan " LSN_PF ": log rec %s"
				" len " ULINTPF
				" page " ULINTPF ":" ULINTPF,
				old_lsn, get_mlog_string(type),
				len, space, page_no));
		}
	} else {
		/* Check that all the records associated with the single mtr
		are included within the buffer */

		ulint	total_len	= 0;
		ulint	n_recs		= 0;
		bool	only_mlog_file	= true;
		ulint	mlog_rec_len	= 0;

		for (;;) {
			/* First pass: parse only (apply=false). */
			len = recv_parse_log_rec(
				&type, ptr, end_ptr, &space, &page_no,
				false, &body);

			if (recv_sys.found_corrupt_log) {
corrupted_log:
				recv_report_corrupt_log(
					ptr, type, space, page_no);
				return(true);
			}

			if (ptr == end_ptr) {
			} else if (type == MLOG_CHECKPOINT
				   || (*ptr & MLOG_SINGLE_REC_FLAG)) {
				/* Such a record must not occur inside a
				multi-record mini-transaction. */
				recv_sys.found_corrupt_log = true;
				goto corrupted_log;
			}

			if (recv_sys.found_corrupt_fs) {
				return(true);
			}

			if (len == 0) {
				return(false);
			}

			recv_previous_parsed_rec_type = type;
			recv_previous_parsed_rec_offset
				= recv_sys.recovered_offset + total_len;
			recv_previous_parsed_rec_is_multi = 1;

			/* MLOG_FILE_NAME redo log records doesn't make changes
			to persistent data. If only MLOG_FILE_NAME redo
			log record exists then reset the parsing buffer pointer
			by changing recovered_lsn and recovered_offset. */
			if (type != MLOG_FILE_NAME && only_mlog_file == true) {
				only_mlog_file = false;
			}

			if (only_mlog_file) {
				new_recovered_lsn = recv_calc_lsn_on_data_add(
					recv_sys.recovered_lsn, len);
				mlog_rec_len += len;
				recv_sys.recovered_offset += len;
				recv_sys.recovered_lsn = new_recovered_lsn;
			}

			total_len += len;
			n_recs++;

			ptr += len;

			if (type == MLOG_MULTI_REC_END) {
				DBUG_PRINT("ib_log",
					   ("scan " LSN_PF
					    ": multi-log end"
					    " total_len " ULINTPF
					    " n=" ULINTPF,
					    recv_sys.recovered_lsn,
					    total_len, n_recs));
				/* The leading MLOG_FILE_NAME records were
				already accounted for in recovered_lsn and
				recovered_offset above. */
				total_len -= mlog_rec_len;
				break;
			}

			DBUG_PRINT("ib_log",
				   ("scan " LSN_PF ": multi-log rec %s"
				    " len " ULINTPF
				    " page " ULINTPF ":" ULINTPF,
				    recv_sys.recovered_lsn,
				    get_mlog_string(type), len, space, page_no));
		}

		new_recovered_lsn = recv_calc_lsn_on_data_add(
			recv_sys.recovered_lsn, total_len);

		if (new_recovered_lsn > recv_sys.scanned_lsn) {
			/* The log record filled a log block, and we require
			that also the next log block should have been scanned
			in */

			return(false);
		}

		/* Add all the records to the hash table */

		ptr = recv_sys.buf + recv_sys.recovered_offset;

		for (;;) {
			old_lsn = recv_sys.recovered_lsn;
			/* This will apply MLOG_FILE_ records. We
			had to skip them in the first scan, because we
			did not know if the mini-transaction was
			completely recovered (until MLOG_MULTI_REC_END). */
			len = recv_parse_log_rec(
				&type, ptr, end_ptr, &space, &page_no,
				apply, &body);

			if (recv_sys.found_corrupt_log
			    && !recv_report_corrupt_log(
				    ptr, type, space, page_no)) {
				return(true);
			}

			if (recv_sys.found_corrupt_fs) {
				return(true);
			}

			ut_a(len != 0);
			ut_a(!(*ptr & MLOG_SINGLE_REC_FLAG));

			recv_sys.recovered_offset += len;
			recv_sys.recovered_lsn
				= recv_calc_lsn_on_data_add(old_lsn, len);

			switch (type) {
			case MLOG_MULTI_REC_END:
				/* Found the end mark for the records */
				goto loop;
			case MLOG_INDEX_LOAD:
				recv_mlog_index_load(space, page_no, old_lsn);
				break;
			case MLOG_FILE_NAME:
			case MLOG_FILE_DELETE:
			case MLOG_FILE_CREATE2:
			case MLOG_FILE_RENAME2:
			case MLOG_TRUNCATE:
				/* These were already handled by
				recv_parse_log_rec() and
				recv_parse_or_apply_log_rec_body(). */
				break;
			default:
				switch (*store) {
				case STORE_NO:
					break;
				case STORE_IF_EXISTS:
					if (!fil_space_get_size(space)) {
						break;
					}
					/* fall through */
				case STORE_YES:
					/* The end LSN is that of the whole
					mini-transaction. */
					recv_sys.add(
						type,
						page_id_t(space, page_no),
						body, ptr + len,
						old_lsn,
						new_recovered_lsn);
				}
			}

			ptr += len;
		}
	}

	goto loop;
}
/** Adds data from a new log block to the parsing buffer of recv_sys if
recv_sys.parse_start_lsn is non-zero.
@param[in] log_block log block to add
@param[in] scanned_lsn lsn of how far we were able to find
data in this log block
@return true if more data added */
bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn)
{
ulint more_len;
ulint data_len;
ulint start_offset;
ulint end_offset;
ut_ad(scanned_lsn >= recv_sys.scanned_lsn);
if (!recv_sys.parse_start_lsn) {
/* Cannot start parsing yet because no start point for
it found */
return(false);
}
data_len = log_block_get_data_len(log_block);
if (recv_sys.parse_start_lsn >= scanned_lsn) {
return(false);
} else if (recv_sys.scanned_lsn >= scanned_lsn) {
return(false);
} else if (recv_sys.parse_start_lsn > recv_sys.scanned_lsn) {
more_len = (ulint) (scanned_lsn - recv_sys.parse_start_lsn);
} else {
more_len = (ulint) (scanned_lsn - recv_sys.scanned_lsn);
}
if (more_len == 0) {
return(false);
}
ut_ad(data_len >= more_len);
start_offset = data_len - more_len;
if (start_offset < LOG_BLOCK_HDR_SIZE) {
start_offset = LOG_BLOCK_HDR_SIZE;
}
end_offset = std::min<ulint>(data_len, log_sys.trailer_offset());
ut_ad(start_offset <= end_offset);
if (start_offset < end_offset) {
memcpy(recv_sys.buf + recv_sys.len,
log_block + start_offset, end_offset - start_offset);
recv_sys.len += end_offset - start_offset;
ut_a(recv_sys.len <= RECV_PARSING_BUF_SIZE);
}
return(true);
}
/** Moves the parsing buffer data left to the buffer start. */
void recv_sys_justify_left_parsing_buf()
{
memmove(recv_sys.buf, recv_sys.buf + recv_sys.recovered_offset,
recv_sys.len - recv_sys.recovered_offset);
recv_sys.len -= recv_sys.recovered_offset;
recv_sys.recovered_offset = 0;
}
/** Scan redo log from a buffer and stores new log data to the parsing buffer.
Parse and hash the log records if new data found.
Apply log records automatically when the hash table becomes full.
The buffer is walked one log block (OS_FILE_LOG_BLOCK_SIZE bytes) at a
time; the parsing by recv_parse_log_recs() is invoked once, after the
walk, while holding recv_sys.mutex.
@param[in,out]	store			whether the records should be
					stored into recv_sys.pages; this is
					reset if just debug checking is
					needed, or when the max_log_blocks in
					recv_sys runs out
@param[in]	log_block		log segment
@param[in]	checkpoint_lsn		latest checkpoint LSN
@param[in]	start_lsn		buffer start LSN
@param[in]	end_lsn			buffer end LSN
@param[in,out]	contiguous_lsn		it is known that all groups contain
					contiguous log data upto this lsn
@param[out]	group_scanned_lsn	scanning succeeded upto this lsn;
					NOTE: not assigned on the two early
					"return(true)" paths inside the loop
@return true if not able to scan any more in this log group */
static bool recv_scan_log_recs(
store_t* store,
const byte* log_block,
lsn_t checkpoint_lsn,
lsn_t start_lsn,
lsn_t end_lsn,
lsn_t* contiguous_lsn,
lsn_t* group_scanned_lsn)
{
lsn_t scanned_lsn = start_lsn;
bool finished = false;
ulint data_len;
bool more_data = false;
/* Records are applied only once MLOG_CHECKPOINT has been found. */
bool apply = recv_sys.mlog_checkpoint_lsn != 0;
ulint recv_parsing_buf_size = RECV_PARSING_BUF_SIZE;
/* Remember the initial mode; *store may be changed below. */
const bool last_phase = (*store == STORE_IF_EXISTS);
ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_ad(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
ut_ad(end_lsn >= start_lsn + OS_FILE_LOG_BLOCK_SIZE);
const byte* const log_end = log_block
+ ulint(end_lsn - start_lsn);
do {
ut_ad(!finished);
if (log_block_get_flush_bit(log_block)) {
/* This block was a start of a log flush operation:
we know that the previous flush operation must have
been completed for all log groups before this block
can have been flushed to any of the groups. Therefore,
we know that log data is contiguous up to scanned_lsn
in all non-corrupt log groups. */
if (scanned_lsn > *contiguous_lsn) {
*contiguous_lsn = scanned_lsn;
}
}
data_len = log_block_get_data_len(log_block);
if (scanned_lsn + data_len > recv_sys.scanned_lsn
&& log_block_get_checkpoint_no(log_block)
< recv_sys.scanned_checkpoint_no
&& (recv_sys.scanned_checkpoint_no
- log_block_get_checkpoint_no(log_block)
> 0x80000000UL)) {
/* Garbage from a log buffer flush which was made
before the most recent database recovery */
finished = true;
break;
}
if (!recv_sys.parse_start_lsn
&& (log_block_get_first_rec_group(log_block) > 0)) {
/* We found a point from which to start the parsing
of log records */
recv_sys.parse_start_lsn = scanned_lsn
+ log_block_get_first_rec_group(log_block);
recv_sys.scanned_lsn = recv_sys.parse_start_lsn;
recv_sys.recovered_lsn = recv_sys.parse_start_lsn;
}
scanned_lsn += data_len;
/* Detect a block that holds nothing but the MLOG_CHECKPOINT
record of the latest checkpoint. */
if (data_len == LOG_BLOCK_HDR_SIZE + SIZE_OF_MLOG_CHECKPOINT
&& scanned_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT
&& log_block[LOG_BLOCK_HDR_SIZE] == MLOG_CHECKPOINT
&& checkpoint_lsn == mach_read_from_8(LOG_BLOCK_HDR_SIZE
+ 1 + log_block)) {
/* The redo log is logically empty. */
ut_ad(recv_sys.mlog_checkpoint_lsn == 0
|| recv_sys.mlog_checkpoint_lsn
== checkpoint_lsn);
recv_sys.mlog_checkpoint_lsn = checkpoint_lsn;
DBUG_PRINT("ib_log", ("found empty log; LSN=" LSN_PF,
scanned_lsn));
finished = true;
break;
}
if (scanned_lsn > recv_sys.scanned_lsn) {
ut_ad(!srv_log_files_created);
if (!recv_needed_recovery) {
recv_needed_recovery = true;
if (srv_read_only_mode) {
ib::warn() << "innodb_read_only"
" prevents crash recovery";
/* Early exit: *group_scanned_lsn is left
unassigned. */
return(true);
}
ib::info() << "Starting crash recovery from"
" checkpoint LSN="
<< recv_sys.scanned_lsn;
}
/* We were able to find more log data: add it to the
parsing buffer if parse_start_lsn is already
non-zero */
DBUG_EXECUTE_IF(
"reduce_recv_parsing_buf",
recv_parsing_buf_size
= (70 * 1024);
);
/* Require room for 4 more log blocks in the
parsing buffer. */
if (recv_sys.len + 4 * OS_FILE_LOG_BLOCK_SIZE
>= recv_parsing_buf_size) {
ib::error() << "Log parsing buffer overflow."
" Recovery may have failed!";
recv_sys.found_corrupt_log = true;
if (!srv_force_recovery) {
ib::error()
<< "Set innodb_force_recovery"
" to ignore this error.";
/* Early exit: *group_scanned_lsn is left
unassigned. */
return(true);
}
} else if (!recv_sys.found_corrupt_log) {
more_data = recv_sys_add_to_parsing_buf(
log_block, scanned_lsn);
}
recv_sys.scanned_lsn = scanned_lsn;
recv_sys.scanned_checkpoint_no
= log_block_get_checkpoint_no(log_block);
}
/* During last phase of scanning, there can be redo logs
left in recv_sys.buf to parse & store it in recv_sys.heap */
if (last_phase
&& recv_sys.recovered_lsn < recv_sys.scanned_lsn) {
more_data = true;
}
if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
/* Log data for this group ends here */
finished = true;
break;
} else {
log_block += OS_FILE_LOG_BLOCK_SIZE;
}
} while (log_block < log_end);
*group_scanned_lsn = scanned_lsn;
mutex_enter(&recv_sys.mutex);
if (more_data && !recv_sys.found_corrupt_log) {
/* Try to parse more log records */
if (recv_parse_log_recs(checkpoint_lsn, store, apply)) {
/* Either corruption was noticed, or MLOG_CHECKPOINT
was seen for the first time. */
ut_ad(recv_sys.found_corrupt_log
|| recv_sys.found_corrupt_fs
|| recv_sys.mlog_checkpoint_lsn
== recv_sys.recovered_lsn);
finished = true;
goto func_exit;
}
/* This may switch *store to STORE_NO; the return value is
not needed here. */
recv_sys.is_memory_exhausted(store);
if (recv_sys.recovered_offset > recv_parsing_buf_size / 4) {
/* Move parsing buffer data to the buffer start */
recv_sys_justify_left_parsing_buf();
}
/* Need to re-parse the redo log which're stored
in recv_sys.buf */
if (last_phase && *store == STORE_NO) {
finished = false;
}
}
func_exit:
mutex_exit(&recv_sys.mutex);
return(finished);
}
/** Read the redo log in segments of at most RECV_SCAN_SIZE bytes,
starting from *contiguous_lsn, and pass each segment to
recv_scan_log_recs(). The parsing state in recv_sys is reset first.
@param[in]	checkpoint_lsn	latest checkpoint log sequence number
@param[in,out]	contiguous_lsn	log sequence number
until which all redo log has been scanned
@param[in]	last_phase	whether changes
can be applied to the tablespaces
@return whether rescan is needed (not everything was stored)
@retval false	if corruption was noticed */
static
bool
recv_group_scan_log_recs(
	lsn_t		checkpoint_lsn,
	lsn_t*		contiguous_lsn,
	bool		last_phase)
{
	DBUG_ENTER("recv_group_scan_log_recs");
	DBUG_ASSERT(!last_phase || recv_sys.mlog_checkpoint_lsn > 0);

	/* Reset the parsing state. */
	mutex_enter(&recv_sys.mutex);
	recv_sys.len = 0;
	recv_sys.recovered_offset = 0;
	recv_sys.clear();
	srv_start_lsn = *contiguous_lsn;
	recv_sys.parse_start_lsn = *contiguous_lsn;
	recv_sys.scanned_lsn = *contiguous_lsn;
	recv_sys.recovered_lsn = *contiguous_lsn;
	recv_sys.scanned_checkpoint_no = 0;
	recv_previous_parsed_rec_type = MLOG_SINGLE_REC_FLAG;
	recv_previous_parsed_rec_offset	= 0;
	recv_previous_parsed_rec_is_multi = 0;
	ut_ad(recv_max_page_lsn == 0);
	ut_ad(last_phase || !recv_writer_thread_active);
	mutex_exit(&recv_sys.mutex);

	/* Nothing is stored until MLOG_CHECKPOINT has been found. */
	store_t	store;

	if (recv_sys.mlog_checkpoint_lsn == 0) {
		store = STORE_NO;
	} else if (last_phase) {
		store = STORE_IF_EXISTS;
	} else {
		store = STORE_YES;
	}

	/* Start reading at a log block boundary. */
	const lsn_t	block_start = ut_uint64_align_down(
		*contiguous_lsn, OS_FILE_LOG_BLOCK_SIZE);

	*contiguous_lsn = block_start;
	lsn_t	end_lsn = block_start;
	log_sys.log.scanned_lsn = block_start;

	ut_d(recv_sys.after_apply = last_phase);

	for (;;) {
		if (last_phase && store == STORE_NO) {
			/* The memory ran out: apply what was stored. */
			store = STORE_IF_EXISTS;
			recv_apply_hashed_log_recs(false);
			/* Rescan the redo logs from last stored lsn */
			end_lsn = recv_sys.recovered_lsn;
		}

		const lsn_t	start_lsn = ut_uint64_align_down(
			end_lsn, OS_FILE_LOG_BLOCK_SIZE);

		end_lsn = start_lsn;
		log_sys.log.read_log_seg(&end_lsn, start_lsn + RECV_SCAN_SIZE);

		if (end_lsn == start_lsn) {
			/* Nothing more could be read. */
			break;
		}

		if (recv_scan_log_recs(&store, log_sys.buf, checkpoint_lsn,
				       start_lsn, end_lsn, contiguous_lsn,
				       &log_sys.log.scanned_lsn)) {
			break;
		}
	}

	if (recv_sys.found_corrupt_log || recv_sys.found_corrupt_fs) {
		DBUG_RETURN(false);
	}

	DBUG_PRINT("ib_log", ("%s " LSN_PF " completed",
			      last_phase ? "rescan" : "scan",
			      log_sys.log.scanned_lsn));

	DBUG_RETURN(store == STORE_NO);
}
/** Report a tablespace that was not found although redo log exists
for its pages.
@param[in]	err	previous error code
@param[in]	i	tablespace descriptor
@return new error code
@retval DB_TABLESPACE_NOT_FOUND	if err was DB_SUCCESS, no backup is
being restored and innodb_force_recovery is not set
@retval err	otherwise */
static
dberr_t
recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i)
{
	const bool	restoring
		= srv_operation == SRV_OPERATION_RESTORE
		|| srv_operation == SRV_OPERATION_RESTORE_EXPORT;

	if (restoring) {
		/* When restoring a backup, only warn about file names
		that contain TEMP_TABLE_PATH_PREFIX. */
		if (i->second.name.find(TEMP_TABLE_PATH_PREFIX) != std::string::npos) {
			ib::warn() << "Tablespace " << i->first << " was not"
				" found at " << i->second.name << " when"
				" restoring a (partial?) backup. All redo log"
				" for this file will be ignored!";
		}

		return err;
	}

	if (srv_force_recovery != 0) {
		ib::warn() << "Tablespace " << i->first << " was not"
			" found at " << i->second.name << ", and"
			" innodb_force_recovery was set. All redo log"
			" for this tablespace will be ignored!";

		return err;
	}

	ib::error() << "Tablespace " << i->first << " was not"
		" found at " << i->second.name << ".";

	if (err == DB_SUCCESS) {
		/* Print the hint only for the first failure. */
		ib::error() << "Set innodb_force_recovery=1 to"
			" ignore this and to permanently lose"
			" all changes to the tablespace.";
		err = DB_TABLESPACE_NOT_FOUND;
	}

	return err;
}
/** Discard the buffered redo log of tablespaces that are deleted or
missing, and report the missing ones.
@param[in]	rescan			rescan of redo logs is needed
					if hash table ran out of memory
@param[out]	missing_tablespace	missing tablespace exists or not;
					NOTE: left untouched when an error
					is returned
@return error code or DB_SUCCESS. */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
recv_validate_tablespace(bool rescan, bool& missing_tablespace)
{
	dberr_t	err = DB_SUCCESS;

	mutex_enter(&recv_sys.mutex);

	/* First pass: walk the buffered pages. */
	recv_sys_t::map::iterator	p = recv_sys.pages.begin();

	while (p != recv_sys.pages.end()) {
		ut_ad(!p->second.log.empty());

		const ulint	space = p->first.space();

		if (is_predefined_tablespace(space)) {
			++p;
			continue;
		}

		recv_spaces_t::iterator	i = recv_spaces.find(space);
		ut_ad(i != recv_spaces.end());

		switch (i->second.status) {
		case file_name_t::NORMAL:
			++p;
			continue;
		case file_name_t::MISSING:
			/* Report it once, and from now on treat the
			tablespace as deleted. */
			err = recv_init_missing_space(err, i);
			i->second.status = file_name_t::DELETED;
			/* fall through */
		case file_name_t::DELETED:
			/* Drop the log records of this page. */
			p->second.log.clear();
			recv_sys.pages.erase(p++);
			continue;
		}

		ut_ad(0);
	}

	if (err == DB_SUCCESS) {
		/* When rescan is not needed, recv_sys.pages will contain the
		entire redo log. If rescan is needed or innodb_force_recovery
		is set, we can ignore missing tablespaces. */
		for (recv_spaces_t::const_iterator it = recv_spaces.begin();
		     it != recv_spaces.end(); ++it) {
			if (it->second.status != file_name_t::MISSING) {
				continue;
			}

			missing_tablespace = true;

			if (srv_force_recovery > 0) {
				ib::warn() << "Tablespace " << it->first
					<<" was not found at " << it->second.name
					<<", and innodb_force_recovery was set."
					<<" All redo log for this tablespace"
					<<" will be ignored!";
			} else if (!rescan) {
				ib::info() << "Tablespace " << it->first
					<< " was not found at '"
					<< it->second.name << "', but there"
					<<" were no modifications either.";
			}
		}

		if (!rescan || srv_force_recovery > 0) {
			missing_tablespace = false;
		}
	}

	mutex_exit(&recv_sys.mutex);
	return err;
}
/** Check if all tablespaces were found for crash recovery.
If any tablespace is deleted or missing, recv_validate_tablespace()
is invoked.
@param[in]	rescan			rescan of redo logs is needed
@param[out]	missing_tablespace	missing table exists
@return error code or DB_SUCCESS
@retval DB_CORRUPTION	if a tablespace without a file and without
a file name was encountered */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
{
	bool	need_validation = false;

	ut_ad(!srv_read_only_mode);
	ut_ad(recv_needed_recovery);

	for (recv_spaces_t::value_type& rs : recv_spaces) {
		recv_spaces_t::mapped_type&	entry = rs.second;

		ut_ad(!is_predefined_tablespace(rs.first));
		ut_ad(entry.status != file_name_t::DELETED
		      || !entry.space);

		if (entry.status == file_name_t::DELETED) {
			/* The tablespace was deleted,
			so we can ignore any redo log for it. */
			need_validation = true;
		} else if (entry.space != NULL) {
			/* The tablespace was found, and there
			are some redo log records for it. */
			fil_names_dirty(entry.space);
			entry.space->enable_lsn = entry.enable_lsn;
		} else if (entry.name.empty()) {
			ib::error() << "Missing MLOG_FILE_NAME"
				" or MLOG_FILE_DELETE"
				" before MLOG_CHECKPOINT for tablespace "
				<< rs.first;
			recv_sys.found_corrupt_log = true;
			return DB_CORRUPTION;
		} else {
			/* A file name is known, but no file was found. */
			entry.status = file_name_t::MISSING;
			need_validation = true;
		}

		ut_ad(entry.status == file_name_t::DELETED
		      || !entry.name.empty());
	}

	return need_validation
		? recv_validate_tablespace(rescan, missing_tablespace)
		: DB_SUCCESS;
}
/** Start recovering from a redo log checkpoint.
@see recv_recovery_from_checkpoint_finish
@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN
of first system tablespace page
@return error code or DB_SUCCESS */
dberr_t
recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
{
ulint max_cp_field;
lsn_t checkpoint_lsn;
bool rescan;
ib_uint64_t checkpoint_no;
lsn_t contiguous_lsn;
byte* buf;
dberr_t err = DB_SUCCESS;
ut_ad(srv_operation == SRV_OPERATION_NORMAL
|| srv_operation == SRV_OPERATION_RESTORE
|| srv_operation == SRV_OPERATION_RESTORE_EXPORT);
ut_d(mutex_enter(&buf_pool->flush_list_mutex));
ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
ut_d(mutex_exit(&buf_pool->flush_list_mutex));
/* Initialize red-black tree for fast insertions into the
flush_list during recovery process. */
buf_flush_init_flush_rbt();
if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
ib::info() << "innodb_force_recovery=6 skips redo log apply";
return(DB_SUCCESS);
}
recv_recovery_on = true;
log_mutex_enter();
err = recv_find_max_checkpoint(&max_cp_field);
if (err != DB_SUCCESS) {
srv_start_lsn = recv_sys.recovered_lsn = log_sys.lsn;
log_mutex_exit();
return(err);
}
log_header_read(max_cp_field);
buf = log_sys.checkpoint_buf;
checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
/* Start reading the log from the checkpoint lsn. The variable
contiguous_lsn contains an lsn up to which the log is known to
be contiguously written. */
recv_sys.mlog_checkpoint_lsn = 0;
ut_ad(RECV_SCAN_SIZE <= srv_log_buffer_size);
const lsn_t end_lsn = mach_read_from_8(
buf + LOG_CHECKPOINT_END_LSN);
ut_ad(recv_sys.pages.empty());
contiguous_lsn = checkpoint_lsn;
switch (log_sys.log.format) {
case 0:
log_mutex_exit();
return recv_log_format_0_recover(checkpoint_lsn,
buf[20 + 32 * 9] == 2);
default:
if (end_lsn == 0) {
break;
}
if (end_lsn >= checkpoint_lsn) {
contiguous_lsn = end_lsn;
break;
}
recv_sys.found_corrupt_log = true;
log_mutex_exit();
return(DB_ERROR);
}
/* Look for MLOG_CHECKPOINT. */
recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false);
/* The first scan should not have stored or applied any records. */
ut_ad(recv_sys.pages.empty());
ut_ad(!recv_sys.found_corrupt_fs);
if (srv_read_only_mode && recv_needed_recovery) {
log_mutex_exit();
return(DB_READ_ONLY);
}
if (recv_sys.found_corrupt_log && !srv_force_recovery) {
log_mutex_exit();
ib::warn() << "Log scan aborted at LSN " << contiguous_lsn;
return(DB_ERROR);
}
if (recv_sys.mlog_checkpoint_lsn == 0) {
lsn_t scan_lsn = log_sys.log.scanned_lsn;
if (!srv_read_only_mode && scan_lsn != checkpoint_lsn) {
log_mutex_exit();
ib::error err;
err << "Missing MLOG_CHECKPOINT";
if (end_lsn) {
err << " at " << end_lsn;
}
err << " between the checkpoint " << checkpoint_lsn
<< " and the end " << scan_lsn << ".";
return(DB_ERROR);
}
log_sys.log.scanned_lsn = checkpoint_lsn;
rescan = false;
} else {
contiguous_lsn = checkpoint_lsn;
rescan = recv_group_scan_log_recs(
checkpoint_lsn, &contiguous_lsn, false);
if ((recv_sys.found_corrupt_log && !srv_force_recovery)
|| recv_sys.found_corrupt_fs) {
log_mutex_exit();
return(DB_ERROR);
}
}
/* NOTE: we always do a 'recovery' at startup, but only if
there is something wrong we will print a message to the
user about recovery: */
if (flush_lsn == checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT
&& recv_sys.mlog_checkpoint_lsn == checkpoint_lsn) {
/* The redo log is logically empty. */
} else if (checkpoint_lsn != flush_lsn) {
ut_ad(!srv_log_files_created);
if (checkpoint_lsn + SIZE_OF_MLOG_CHECKPOINT < flush_lsn) {
ib::warn() << "Are you sure you are using the"
" right ib_logfiles to start up the database?"
" Log sequence number in the ib_logfiles is "
<< checkpoint_lsn << ", less than the"
" log sequence number in the first system"
" tablespace file header, " << flush_lsn << ".";
}
if (!recv_needed_recovery) {
ib::info() << "The log sequence number " << flush_lsn
<< " in the system tablespace does not match"
" the log sequence number " << checkpoint_lsn
<< " in the ib_logfiles!";
if (srv_read_only_mode) {
ib::error() << "innodb_read_only"
" prevents crash recovery";
log_mutex_exit();
return(DB_READ_ONLY);
}
recv_needed_recovery = true;
}
}
log_sys.lsn = recv_sys.recovered_lsn;
if (recv_needed_recovery) {
bool missing_tablespace = false;
err = recv_init_crash_recovery_spaces(
rescan, missing_tablespace);
if (err != DB_SUCCESS) {
log_mutex_exit();
return(err);
}
/* If there is any missing tablespace and a rescan is needed,
then there is a possibility that the hash table will not contain
the redo log records of all space ids. Rescan the remaining
unstored redo log for the validation of the missing tablespace. */
ut_ad(rescan || !missing_tablespace);
while (missing_tablespace) {
DBUG_PRINT("ib_log", ("Rescan of redo log to validate "
"the missing tablespace. Scan "
"from last stored LSN " LSN_PF,
recv_sys.last_stored_lsn));
lsn_t recent_stored_lsn = recv_sys.last_stored_lsn;
rescan = recv_group_scan_log_recs(
checkpoint_lsn, &recent_stored_lsn, false);
ut_ad(!recv_sys.found_corrupt_fs);
missing_tablespace = false;
err = recv_sys.found_corrupt_log
? DB_ERROR
: recv_validate_tablespace(
rescan, missing_tablespace);
if (err != DB_SUCCESS) {
log_mutex_exit();
return err;
}
rescan = true;
}
if (srv_operation == SRV_OPERATION_NORMAL) {
buf_dblwr_process();
}
ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN);
/* Spawn the background thread to flush dirty pages
from the buffer pools. */
recv_writer_thread_active = true;
os_thread_create(recv_writer_thread, 0, 0);
if (rescan) {
contiguous_lsn = checkpoint_lsn;
recv_group_scan_log_recs(
checkpoint_lsn, &contiguous_lsn, true);
if ((recv_sys.found_corrupt_log
&& !srv_force_recovery)
|| recv_sys.found_corrupt_fs) {
log_mutex_exit();
return(DB_ERROR);
}
}
} else {
ut_ad(!rescan || recv_sys.pages.empty());
}
if (log_sys.log.scanned_lsn < checkpoint_lsn
|| log_sys.log.scanned_lsn < recv_max_page_lsn) {
ib::error() << "We scanned the log up to "
<< log_sys.log.scanned_lsn
<< ". A checkpoint was at " << checkpoint_lsn << " and"
" the maximum LSN on a database page was "
<< recv_max_page_lsn << ". It is possible that the"
" database is now corrupt!";
}
if (recv_sys.recovered_lsn < checkpoint_lsn) {
log_mutex_exit();
ib::error() << "Recovered only to lsn:"
<< recv_sys.recovered_lsn << " checkpoint_lsn: " << checkpoint_lsn;
return(DB_ERROR);
}
log_sys.next_checkpoint_lsn = checkpoint_lsn;
log_sys.next_checkpoint_no = checkpoint_no + 1;
recv_synchronize_groups();
if (!recv_needed_recovery) {
ut_a(checkpoint_lsn == recv_sys.recovered_lsn);
} else {
srv_start_lsn = recv_sys.recovered_lsn;
}
log_sys.buf_free = ulong(log_sys.lsn % OS_FILE_LOG_BLOCK_SIZE);
log_sys.buf_next_to_write = log_sys.buf_free;
log_sys.write_lsn = log_sys.lsn;
log_sys.last_checkpoint_lsn = checkpoint_lsn;
if (!srv_read_only_mode && srv_operation == SRV_OPERATION_NORMAL) {
/* Write a MLOG_CHECKPOINT marker as the first thing,
before generating any other redo log. This ensures
that subsequent crash recovery will be possible even
if the server were killed soon after this. */
fil_names_clear(log_sys.last_checkpoint_lsn, true);
}
MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
log_sys.lsn - log_sys.last_checkpoint_lsn);
log_sys.next_checkpoint_no = ++checkpoint_no;
mutex_enter(&recv_sys.mutex);
recv_sys.apply_log_recs = true;
mutex_exit(&recv_sys.mutex);
log_mutex_exit();
recv_lsn_checks_on = true;
/* The database is now ready to start almost normal processing of user
transactions: transaction rollbacks and the application of the log
records in the hash table can be run in background. */
return(DB_SUCCESS);
}
/** Complete recovery from a checkpoint. */
void
recv_recovery_from_checkpoint_finish(void)
{
/* Make sure that the recv_writer thread is done. This is
required because it grabs various mutexes and we want to
ensure that when we enable sync_order_checks there is no
mutex currently held by any thread. */
mutex_enter(&recv_sys.writer_mutex);
/* Free the resources of the recovery system */
recv_recovery_on = false;
/* By acquring the mutex we ensure that the recv_writer thread
won't trigger any more LRU batches. Now wait for currently
in progress batches to finish. */
buf_flush_wait_LRU_batch_end();
mutex_exit(&recv_sys.writer_mutex);
ulint count = 0;
while (recv_writer_thread_active) {
++count;
os_thread_sleep(100000);
if (srv_print_verbose_log && count > 600) {
ib::info() << "Waiting for recv_writer to"
" finish flushing of buffer pool";
count = 0;
}
}
recv_sys.debug_free();
/* Free up the flush_rbt. */
buf_flush_free_flush_rbt();
/* Enable innodb_sync_debug checks */
ut_d(sync_check_enable());
}
/** Find the newest doublewrite copy of a page.
Among the stored copies whose page number and tablespace identifier
match, the one with the greatest FIL_PAGE_LSN is chosen. A copy is only
accepted if its LSN is strictly greater than the best one seen so far
(initially 0), so a copy whose LSN reads as 0 is never returned, and
among copies with equal LSN the first one encountered wins.
@param[in]	space_id	tablespace identifier
@param[in]	page_no		page number
@return page frame of the matching copy with the greatest LSN
@retval NULL if no page was found */
const byte*
recv_dblwr_t::find_page(ulint space_id, ulint page_no)
{
  const byte *newest= NULL;
  lsn_t newest_lsn= 0;

  for (const byte *frame : pages)
  {
    if (page_get_page_no(frame) == page_no &&
        page_get_space_id(frame) == space_id)
    {
      const lsn_t frame_lsn= mach_read_from_8(frame + FIL_PAGE_LSN);

      if (frame_lsn > newest_lsn)
      {
        newest_lsn= frame_lsn;
        newest= frame;
      }
    }
  }

  return newest;
}
#ifndef DBUG_OFF
/** Translate a redo log record type into its symbolic name.
@param[in]	type	redo log record type
@return the name of the matching enumerator, as a string literal
@retval NULL	if type matches none of the cases listed below
(reached only after DBUG_ASSERT(0)) */
static const char* get_mlog_string(mlog_id_t type)
{
/* Expands to a case label that returns the spelling of its own
enumerator. The switch deliberately has no default label. */
#define MLOG_NAME_CASE(id)	case id: return(#id)

	switch (type) {
	MLOG_NAME_CASE(MLOG_SINGLE_REC_FLAG);
	MLOG_NAME_CASE(MLOG_1BYTE);
	MLOG_NAME_CASE(MLOG_2BYTES);
	MLOG_NAME_CASE(MLOG_4BYTES);
	MLOG_NAME_CASE(MLOG_8BYTES);
	MLOG_NAME_CASE(MLOG_REC_INSERT);
	MLOG_NAME_CASE(MLOG_REC_CLUST_DELETE_MARK);
	MLOG_NAME_CASE(MLOG_REC_SEC_DELETE_MARK);
	MLOG_NAME_CASE(MLOG_REC_UPDATE_IN_PLACE);
	MLOG_NAME_CASE(MLOG_REC_DELETE);
	MLOG_NAME_CASE(MLOG_LIST_END_DELETE);
	MLOG_NAME_CASE(MLOG_LIST_START_DELETE);
	MLOG_NAME_CASE(MLOG_LIST_END_COPY_CREATED);
	MLOG_NAME_CASE(MLOG_PAGE_REORGANIZE);
	MLOG_NAME_CASE(MLOG_PAGE_CREATE);
	MLOG_NAME_CASE(MLOG_UNDO_INSERT);
	MLOG_NAME_CASE(MLOG_UNDO_ERASE_END);
	MLOG_NAME_CASE(MLOG_UNDO_INIT);
	MLOG_NAME_CASE(MLOG_UNDO_HDR_REUSE);
	MLOG_NAME_CASE(MLOG_UNDO_HDR_CREATE);
	MLOG_NAME_CASE(MLOG_REC_MIN_MARK);
	MLOG_NAME_CASE(MLOG_IBUF_BITMAP_INIT);
	MLOG_NAME_CASE(MLOG_ZIP_WRITE_STRING);
	MLOG_NAME_CASE(MLOG_WRITE_STRING);
	MLOG_NAME_CASE(MLOG_MULTI_REC_END);
	MLOG_NAME_CASE(MLOG_DUMMY_RECORD);
	MLOG_NAME_CASE(MLOG_FILE_DELETE);
	MLOG_NAME_CASE(MLOG_COMP_REC_MIN_MARK);
	MLOG_NAME_CASE(MLOG_COMP_PAGE_CREATE);
	MLOG_NAME_CASE(MLOG_COMP_REC_INSERT);
	MLOG_NAME_CASE(MLOG_COMP_REC_CLUST_DELETE_MARK);
	MLOG_NAME_CASE(MLOG_COMP_REC_UPDATE_IN_PLACE);
	MLOG_NAME_CASE(MLOG_COMP_REC_DELETE);
	MLOG_NAME_CASE(MLOG_COMP_LIST_END_DELETE);
	MLOG_NAME_CASE(MLOG_COMP_LIST_START_DELETE);
	MLOG_NAME_CASE(MLOG_COMP_LIST_END_COPY_CREATED);
	MLOG_NAME_CASE(MLOG_COMP_PAGE_REORGANIZE);
	MLOG_NAME_CASE(MLOG_FILE_CREATE2);
	MLOG_NAME_CASE(MLOG_ZIP_WRITE_NODE_PTR);
	MLOG_NAME_CASE(MLOG_ZIP_WRITE_BLOB_PTR);
	MLOG_NAME_CASE(MLOG_ZIP_WRITE_HEADER);
	MLOG_NAME_CASE(MLOG_ZIP_PAGE_COMPRESS);
	MLOG_NAME_CASE(MLOG_ZIP_PAGE_COMPRESS_NO_DATA);
	MLOG_NAME_CASE(MLOG_ZIP_PAGE_REORGANIZE);
	MLOG_NAME_CASE(MLOG_ZIP_WRITE_TRX_ID);
	MLOG_NAME_CASE(MLOG_FILE_RENAME2);
	MLOG_NAME_CASE(MLOG_FILE_NAME);
	MLOG_NAME_CASE(MLOG_CHECKPOINT);
	MLOG_NAME_CASE(MLOG_PAGE_CREATE_RTREE);
	MLOG_NAME_CASE(MLOG_COMP_PAGE_CREATE_RTREE);
	MLOG_NAME_CASE(MLOG_INIT_FILE_PAGE2);
	MLOG_NAME_CASE(MLOG_INDEX_LOAD);
	MLOG_NAME_CASE(MLOG_TRUNCATE);
	MLOG_NAME_CASE(MLOG_MEMSET);
	MLOG_NAME_CASE(MLOG_INIT_FREE_PAGE);
	MLOG_NAME_CASE(MLOG_FILE_WRITE_CRYPT_DATA);
	}

#undef MLOG_NAME_CASE

	/* No case matched the value of type. */
	DBUG_ASSERT(0);
	return(NULL);
}
#endif /* !DBUG_OFF */