mirror of
https://github.com/MariaDB/server.git
synced 2026-05-14 19:07:15 +02:00
The error handling for internal 2pc transactions (eg. RocksDB/Spider) would incorrectly try to handle the engine binlog_unlog() during rollback, in binlog_post_rollback(); this should instead be handled solely in log_and_order() and unlog(). This could trigger for example in parallel replication error handling, causing assertions when wrongly entering XA code paths. Also fix a couple bugs found during debug: - Don't send format description even to the slave from before the starting GTID position, as that can cause the slave to wrongly drop temporary tables. - When looking up the initial GTID position for a new dump thread, wait for the necessary part of the binlog to become durable before reading it. - Don't error when searching the initial GTID position if reaching EOF of the durable portion, instead search back to an earlier GTID state record. - A rare race in the test framework that could fail to kill off lingering dump threads before RESET MASTER. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
4627 lines
140 KiB
C++
4627 lines
140 KiB
C++
/*****************************************************************************
|
|
|
|
Copyright (c) 2024, Kristian Nielsen
|
|
|
|
This program is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free Software
|
|
Foundation; version 2 of the License.
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along with
|
|
this program; if not, write to the Free Software Foundation, Inc.,
|
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
|
|
|
|
*****************************************************************************/
|
|
|
|
/**************************************************//**
|
|
@file handler/innodb_binlog.cc
|
|
InnoDB implementation of binlog.
|
|
*******************************************************/
|
|
|
|
/*
|
|
Need MYSQL_SERVER defined to be able to use THD_ENTER_COND from sql_class.h
|
|
to make my_cond_wait() killable.
|
|
*/
|
|
#define MYSQL_SERVER 1
|
|
#include <my_global.h>
|
|
#include "sql_class.h"
|
|
|
|
#include "innodb_binlog.h"
|
|
#include "mtr0log.h"
|
|
#include "fsp0fsp.h"
|
|
#include "trx0trx.h"
|
|
#include "log0log.h"
|
|
#include "small_vector.h"
|
|
|
|
#include "mysys_err.h"
|
|
#include "my_compr_int.h"
|
|
#include "rpl_gtid_base.h"
|
|
#include "handler_binlog_reader.h"
|
|
#include "log.h"
|
|
|
|
|
|
class ibb_xid_hash;
|
|
|
|
|
|
static int innodb_binlog_inited= 0;
|
|
|
|
pending_lsn_fifo ibb_pending_lsn_fifo;
|
|
uint32_t innodb_binlog_size_in_pages;
|
|
const char *innodb_binlog_directory;
|
|
|
|
/** Current write position in active binlog file. */
|
|
uint32_t binlog_cur_page_no;
|
|
uint32_t binlog_cur_page_offset;
|
|
|
|
/**
|
|
Server setting for how often to dump a (differential) binlog state at the
|
|
start of the page, to speed up finding the initial GTID position, read-only.
|
|
*/
|
|
ulonglong innodb_binlog_state_interval;
|
|
|
|
/**
|
|
Differential binlog state in the currently active binlog tablespace, relative
|
|
to the state at the start.
|
|
*/
|
|
rpl_binlog_state_base binlog_diff_state;
|
|
|
|
static std::thread binlog_prealloc_thr_obj;
|
|
static bool prealloc_thread_end= false;
|
|
|
|
/**
|
|
Mutex around purge operations, including earliest_binlog_file_no and
|
|
total_binlog_used_size.
|
|
*/
|
|
mysql_mutex_t purge_binlog_mutex;
|
|
|
|
/** The earliest binlog tablespace file. Used in binlog purge. */
|
|
static uint64_t earliest_binlog_file_no;
|
|
|
|
/**
|
|
The total space in use by binlog tablespace files. Maintained in-memory to
|
|
not have to stat(2) every file for every new binlog tablespace allocated in
|
|
case of --max-binlog-total-size.
|
|
|
|
Initialized at server startup (and in RESET MASTER), and updated as binlog
|
|
files are pre-allocated and purged.
|
|
*/
|
|
size_t total_binlog_used_size;
|
|
|
|
static bool purge_warning_given= false;
|
|
|
|
/** References to pending XA PREPARED transactions in the binlog. */
|
|
static ibb_xid_hash *ibb_xa_xid_hash;
|
|
|
|
#ifdef UNIV_PFS_THREAD
|
|
mysql_pfs_key_t binlog_prealloc_thread_key;
|
|
#endif
|
|
|
|
|
|
/**
|
|
Structure holding context for out-of-band chunks of binlogged event group.
|
|
*/
|
|
struct binlog_oob_context {
|
|
struct savepoint;
|
|
/*
|
|
Structure used to encapsulate the data to be binlogged in an out-of-band
|
|
chunk, for use by fsp_binlog_write_rec().
|
|
*/
|
|
struct chunk_data_oob : public chunk_data_base {
|
|
/*
|
|
Need room for 5 numbers:
|
|
node index
|
|
left child file_no
|
|
left child offset
|
|
right child file_no
|
|
right child offset
|
|
*/
|
|
static constexpr uint32_t max_buffer= 5*COMPR_INT_MAX64;
|
|
uint64_t sofar;
|
|
uint64_t main_len;
|
|
const byte *main_data;
|
|
uint32_t header_len;
|
|
byte header_buf[max_buffer];
|
|
|
|
chunk_data_oob(uint64_t idx,
|
|
uint64_t left_file_no, uint64_t left_offset,
|
|
uint64_t right_file_no, uint64_t right_offset,
|
|
const byte *data, size_t data_len);
|
|
virtual ~chunk_data_oob() {};
|
|
virtual std::pair<uint32_t, bool> copy_data(byte *p, uint32_t max_len) final;
|
|
};
|
|
|
|
bool binlog_node(uint32_t node, uint64_t new_idx,
|
|
uint32_t left_node, uint32_t right_node,
|
|
chunk_data_oob *oob_data, LF_PINS *pins, mtr_t *mtr);
|
|
bool create_stmt_start_point();
|
|
savepoint *create_savepoint();
|
|
void rollback_to_savepoint(savepoint *savepoint);
|
|
void rollback_to_stmt_start();
|
|
|
|
/*
|
|
Pending binlog write for the ibb_pending_lsn_fifo.
|
|
pending_file_no is ~0 when no write is pending.
|
|
*/
|
|
uint64_t pending_file_no;
|
|
uint64_t pending_offset;
|
|
lsn_t pending_lsn;
|
|
|
|
uint64_t first_node_file_no;
|
|
uint64_t first_node_offset;
|
|
LF_PINS *lf_pins;
|
|
savepoint *stmt_start_point;
|
|
savepoint *savepoint_stack;
|
|
/*
|
|
The secondary pointer is for when server layer binlogs both a
|
|
non-transactional and a transactional oob stream.
|
|
*/
|
|
binlog_oob_context *secondary_ctx;
|
|
uint32_t node_list_len;
|
|
uint32_t node_list_alloc_len;
|
|
/*
|
|
Set if we incremented refcount in first_node_file_no, so we need to
|
|
decrement again at commit record write or reset/rollback.
|
|
*/
|
|
bool pending_refcount;
|
|
/* Set when the transaction is sealed after writing an XA PREPARE record. */
|
|
bool is_xa_prepared;
|
|
/*
|
|
The node_list contains the root of each tree in the forest of perfect
|
|
binary trees.
|
|
*/
|
|
#ifdef _MSC_VER
|
|
/* Flexible array member is not standard C++, disable compiler warning. */
|
|
#pragma warning(disable : 4200)
|
|
#endif
|
|
struct node_info {
|
|
uint64_t file_no;
|
|
uint64_t offset;
|
|
uint64_t node_index;
|
|
uint32_t height;
|
|
} node_list [];
|
|
|
|
/* Saved oob state for implementing ROLLBACK TO SAVEPOINT. */
|
|
struct savepoint {
|
|
/* Maintain a stack of pending savepoints. */
|
|
savepoint *next;
|
|
uint32_t node_list_len;
|
|
uint32_t alloc_len;
|
|
struct node_info node_list[];
|
|
};
|
|
};
|
|
|
|
|
|
/**
|
|
A class for doing the post-order traversal of the forest of perfect binary
|
|
trees that make up the out-of-band data for a commit record.
|
|
*/
|
|
class innodb_binlog_oob_reader {
|
|
enum oob_states {
|
|
/* The initial state, about to visit the node for the first time. */
|
|
ST_initial,
|
|
/* State of leaf node while traversing the prior trees in the forest. */
|
|
ST_traversing_prior_trees,
|
|
/* State of non-leaf node while traversing its left sub-tree. */
|
|
ST_traversing_left_child,
|
|
/* State of non-leaf node while traversing its right sub-tree. */
|
|
ST_traversing_right_child,
|
|
/* State of node while reading out its data. */
|
|
ST_self
|
|
};
|
|
|
|
/*
|
|
Stack entry for one node currently taking part in post-order traversal.
|
|
We maintain a stack of pending nodes during the traversal, as the traversal
|
|
happens in a state machine rather than by recursion.
|
|
*/
|
|
struct stack_entry {
|
|
/* Saved position after reading header. */
|
|
binlog_chunk_reader::saved_position saved_pos;
|
|
/* The location of this node's OOB record. */
|
|
uint64_t file_no;
|
|
uint64_t offset;
|
|
/* Right child, to be traversed after left child. */
|
|
uint64_t right_file_no;
|
|
uint64_t right_offset;
|
|
/* Offset of real data in this node, after header. */
|
|
uint32_t header_len;
|
|
/* Amount of data read into rd_buf, and amount used to parse header. */
|
|
uint32_t rd_buf_len;
|
|
uint32_t rd_buf_sofar;
|
|
/* Current state in post-order traversal state machine. */
|
|
enum oob_states state;
|
|
/* Buffer for reading header. */
|
|
byte rd_buf[5*COMPR_INT_MAX64];
|
|
/*
|
|
True when the node is reached using only left child pointers, false
|
|
otherwise. Used to identify the left-most leaf in a tree which points to
|
|
a prior tree that must be traversed first.
|
|
*/
|
|
bool is_leftmost;
|
|
};
|
|
small_vector<stack_entry, 8>stack;
|
|
|
|
/* State machine current state. */
|
|
enum oob_states state;
|
|
|
|
public:
|
|
innodb_binlog_oob_reader();
|
|
~innodb_binlog_oob_reader();
|
|
|
|
void start_traversal(uint64_t file_no, uint64_t offset);
|
|
bool oob_traversal_done() { return stack.empty(); }
|
|
int read_data(binlog_chunk_reader *chunk_rd, uchar *buf, int max_len);
|
|
|
|
private:
|
|
void push_state(enum oob_states state, uint64_t file_no, uint64_t offset,
|
|
bool is_leftmost);
|
|
};
|
|
|
|
|
|
class ha_innodb_binlog_reader : public handler_binlog_reader {
|
|
enum reader_states {
|
|
ST_read_next_event_group, ST_read_oob_data, ST_read_commit_record
|
|
};
|
|
|
|
binlog_chunk_reader chunk_rd;
|
|
innodb_binlog_oob_reader oob_reader;
|
|
binlog_chunk_reader::saved_position saved_commit_pos;
|
|
|
|
/* Out-of-band data to read after commit record, if any. */
|
|
uint64_t oob_count;
|
|
uint64_t oob_last_file_no;
|
|
uint64_t oob_last_offset;
|
|
/* Any secondary out-of-band data to be also read. */
|
|
uint64_t oob_count2;
|
|
uint64_t oob_last_file_no2;
|
|
uint64_t oob_last_offset2;
|
|
/*
|
|
Originally requested starting file_no, from init_gtid_pos() or
|
|
init_legacy_pos(). Or ~0 if none.
|
|
*/
|
|
uint64_t requested_file_no;
|
|
/* Buffer to hold a page read directly from the binlog file. */
|
|
uchar *page_buf;
|
|
/* Keep track of pending bytes in the rd_buf. */
|
|
uint32_t rd_buf_len;
|
|
uint32_t rd_buf_sofar;
|
|
/* State for state machine reading chunks one by one. */
|
|
enum reader_states state;
|
|
|
|
/* Used to read the header of the commit record. */
|
|
byte rd_buf[5*COMPR_INT_MAX64];
|
|
private:
|
|
int read_data(uchar *buf, uint32_t len);
|
|
|
|
public:
|
|
ha_innodb_binlog_reader(bool wait_durable, uint64_t file_no= 0,
|
|
uint64_t offset= 0);
|
|
~ha_innodb_binlog_reader();
|
|
virtual int read_binlog_data(uchar *buf, uint32_t len) final;
|
|
virtual bool data_available() final;
|
|
virtual bool wait_available(THD *thd, const struct timespec *abstime) final;
|
|
virtual int init_gtid_pos(THD *thd, slave_connection_state *pos,
|
|
rpl_binlog_state_base *state) final;
|
|
virtual int init_legacy_pos(THD *thd, const char *filename,
|
|
ulonglong offset) final;
|
|
virtual void enable_single_file() final;
|
|
void seek_internal(uint64_t file_no, uint64_t offset);
|
|
};
|
|
|
|
|
|
/**
|
|
Class that keeps track of the oob references etc. for each
|
|
XA PREPAREd XID.
|
|
*/
|
|
class ibb_xid_hash {
|
|
public:
|
|
struct xid_elem {
|
|
XID xid;
|
|
uint64_t refcnt_file_no;
|
|
uint64_t oob_num_nodes;
|
|
uint64_t oob_first_file_no;
|
|
uint64_t oob_first_offset;
|
|
uint64_t oob_last_file_no;
|
|
uint64_t oob_last_offset;
|
|
};
|
|
HASH xid_hash;
|
|
mysql_mutex_t xid_mutex;
|
|
|
|
ibb_xid_hash();
|
|
~ibb_xid_hash();
|
|
bool add_xid(const XID *xid, const binlog_oob_context *c);
|
|
xid_elem *grab_xid(const XID *xid);
|
|
template <typename F> bool run_on_xid(const XID *xid, F callback);
|
|
};
|
|
|
|
|
|
struct chunk_data_cache : public chunk_data_base {
|
|
IO_CACHE *cache;
|
|
binlog_oob_context *oob_ctx;
|
|
my_off_t main_start;
|
|
size_t main_remain;
|
|
size_t gtid_remain;
|
|
uint32_t header_remain;
|
|
uint32_t header_sofar;
|
|
byte header_buf[10*COMPR_INT_MAX64];
|
|
|
|
chunk_data_cache(IO_CACHE *cache_arg,
|
|
handler_binlog_event_group_info *binlog_info)
|
|
: cache(cache_arg),
|
|
main_start(binlog_info->out_of_band_offset),
|
|
main_remain((size_t)(binlog_info->gtid_offset -
|
|
binlog_info->out_of_band_offset)),
|
|
header_sofar(0)
|
|
{
|
|
size_t end_offset= (size_t)my_b_tell(cache);
|
|
ut_ad(end_offset > binlog_info->out_of_band_offset);
|
|
ut_ad(binlog_info->gtid_offset >= binlog_info->out_of_band_offset);
|
|
ut_ad(end_offset >= binlog_info->gtid_offset);
|
|
gtid_remain= end_offset - (size_t)binlog_info->gtid_offset;
|
|
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
|
|
unsigned char *p= header_buf;
|
|
ut_ad(c);
|
|
oob_ctx= c;
|
|
if (UNIV_UNLIKELY(!c))
|
|
;
|
|
else if (UNIV_UNLIKELY(binlog_info->xa_xid != nullptr) &&
|
|
!binlog_info->internal_xa)
|
|
{
|
|
/*
|
|
For explicit user XA COMMIT, the commit record must point to the
|
|
OOB data previously saved in XA PREPARE.
|
|
*/
|
|
bool err= ibb_xa_xid_hash->run_on_xid(binlog_info->xa_xid,
|
|
[&p](const ibb_xid_hash::xid_elem *elem) -> bool {
|
|
if (UNIV_LIKELY(elem->oob_num_nodes > 0))
|
|
{
|
|
p= compr_int_write(p, elem->oob_num_nodes);
|
|
p= compr_int_write(p, elem->oob_first_file_no);
|
|
p= compr_int_write(p, elem->oob_first_offset);
|
|
p= compr_int_write(p, elem->oob_last_file_no);
|
|
p= compr_int_write(p, elem->oob_last_offset);
|
|
p= compr_int_write(p, 0);
|
|
}
|
|
else
|
|
p= compr_int_write(p, 0);
|
|
return false;
|
|
});
|
|
/*
|
|
The XID must always be found, else we have a serious
|
|
inconsistency between the server layer and binlog state.
|
|
In case of inconsistency, better crash than leave a corrupt
|
|
binlog.
|
|
*/
|
|
ut_a(!err);
|
|
ut_ad(binlog_info->engine_ptr2 == nullptr);
|
|
}
|
|
else if (c->node_list_len)
|
|
{
|
|
/*
|
|
Link to the out-of-band data. First store the number of nodes; then
|
|
store 2 x 2 numbers of file_no/offset for the first and last node.
|
|
|
|
There's a special case when we have to link two times out-of-band data,
|
|
due to mixing non-transactional and transactional stuff. In that case,
|
|
the non-transactional goes first. In the common case where there is no
|
|
dual oob references, we just store a single 0 count.
|
|
*/
|
|
binlog_oob_context *c2= c->secondary_ctx=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr2);
|
|
if (UNIV_UNLIKELY(c2 != nullptr) && c2->node_list_len)
|
|
{
|
|
uint32_t last2= c2->node_list_len-1;
|
|
uint64_t num_nodes2= c2->node_list[last2].node_index + 1;
|
|
p= compr_int_write(p, num_nodes2);
|
|
p= compr_int_write(p, c2->first_node_file_no);
|
|
p= compr_int_write(p, c2->first_node_offset);
|
|
p= compr_int_write(p, c2->node_list[last2].file_no);
|
|
p= compr_int_write(p, c2->node_list[last2].offset);
|
|
}
|
|
|
|
uint32_t last= c->node_list_len-1;
|
|
uint64_t num_nodes= c->node_list[last].node_index + 1;
|
|
p= compr_int_write(p, num_nodes);
|
|
p= compr_int_write(p, c->first_node_file_no);
|
|
p= compr_int_write(p, c->first_node_offset);
|
|
p= compr_int_write(p, c->node_list[last].file_no);
|
|
p= compr_int_write(p, c->node_list[last].offset);
|
|
if (UNIV_LIKELY(c2 == nullptr) || c2->node_list_len == 0)
|
|
p= compr_int_write(p, 0);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
No out-of-band data, marked with a single 0 count for nodes and no
|
|
first/last links.
|
|
*/
|
|
p= compr_int_write(p, 0);
|
|
}
|
|
header_remain= (uint32_t)(p - header_buf);
|
|
ut_ad((size_t)(p - header_buf) <= sizeof(header_buf));
|
|
|
|
ut_ad (cache->pos_in_file <= binlog_info->out_of_band_offset);
|
|
|
|
if (UNIV_UNLIKELY(binlog_info->internal_xa))
|
|
{
|
|
/*
|
|
Insert the XID for the internal 2-phase commit in the xid_hash,
|
|
incrementing the reference count. This will ensure we hold on to
|
|
the commit record until ibb_binlog_unlog() is called, at which point
|
|
the other participating storage engine(s) have durably committed.
|
|
*/
|
|
bool err= ibb_xa_xid_hash->add_xid(binlog_info->xa_xid, c);
|
|
ut_a(!err);
|
|
}
|
|
|
|
/* Start with the GTID event, which is put at the end of the IO_CACHE. */
|
|
my_bool res= reinit_io_cache(cache, READ_CACHE, binlog_info->gtid_offset, 0, 0);
|
|
ut_a(!res /* ToDo: Error handling. */);
|
|
}
|
|
~chunk_data_cache() { }
|
|
|
|
virtual std::pair<uint32_t, bool> copy_data(byte *p, uint32_t max_len) final
|
|
{
|
|
uint32_t size= 0;
|
|
|
|
if (UNIV_LIKELY(oob_ctx != nullptr) && oob_ctx->pending_refcount)
|
|
{
|
|
ibb_file_hash.oob_ref_dec(oob_ctx->first_node_file_no, oob_ctx->lf_pins);
|
|
oob_ctx->pending_refcount= false;
|
|
if (UNIV_UNLIKELY(oob_ctx->secondary_ctx != nullptr) &&
|
|
oob_ctx->secondary_ctx->pending_refcount)
|
|
{
|
|
ibb_file_hash.oob_ref_dec(oob_ctx->secondary_ctx->first_node_file_no,
|
|
oob_ctx->secondary_ctx->lf_pins);
|
|
oob_ctx->secondary_ctx->pending_refcount= false;
|
|
}
|
|
}
|
|
|
|
/* Write header data, if any still available. */
|
|
if (header_remain > 0)
|
|
{
|
|
size= header_remain > max_len ? max_len : (uint32_t)header_remain;
|
|
memcpy(p, header_buf + header_sofar, size);
|
|
header_remain-= size;
|
|
header_sofar+= size;
|
|
max_len-= size;
|
|
if (UNIV_UNLIKELY(max_len == 0))
|
|
{
|
|
ut_ad(gtid_remain + main_remain > 0);
|
|
return {size, false};
|
|
}
|
|
}
|
|
|
|
/* Write GTID data, if any still available. */
|
|
ut_ad(header_remain == 0);
|
|
if (gtid_remain > 0)
|
|
{
|
|
uint32_t size2= gtid_remain > max_len ? max_len : (uint32_t)gtid_remain;
|
|
int res2= my_b_read(cache, p + size, size2);
|
|
ut_a(!res2 /* ToDo: Error handling */);
|
|
gtid_remain-= size2;
|
|
if (gtid_remain == 0)
|
|
my_b_seek(cache, main_start); /* Move to read the rest of the events. */
|
|
max_len-= size2;
|
|
size+= size2;
|
|
if (max_len == 0)
|
|
return {size, gtid_remain + main_remain == 0};
|
|
}
|
|
|
|
/* Write remaining data. */
|
|
ut_ad(gtid_remain == 0);
|
|
if (main_remain == 0)
|
|
{
|
|
/*
|
|
This means that only GTID data is present, eg. when the main data was
|
|
already binlogged out-of-band.
|
|
*/
|
|
ut_ad(size > 0);
|
|
return {size, true};
|
|
}
|
|
uint32_t size2= main_remain > max_len ? max_len : (uint32_t)main_remain;
|
|
int res2= my_b_read(cache, p + size, size2);
|
|
ut_a(!res2 /* ToDo: Error handling */);
|
|
ut_ad(main_remain >= size2);
|
|
main_remain-= size2;
|
|
return {size + size2, main_remain == 0};
|
|
}
|
|
};
|
|
|
|
|
|
template<uint32_t bufsize_>
|
|
struct chunk_data_from_buf : public chunk_data_base {
|
|
static constexpr uint32_t bufsize= bufsize_;
|
|
|
|
uint32_t data_remain;
|
|
uint32_t data_sofar;
|
|
byte buffer[bufsize];
|
|
|
|
chunk_data_from_buf() : data_sofar(0)
|
|
{
|
|
/* data_remain must be initialized in derived class constructor. */
|
|
}
|
|
|
|
virtual std::pair<uint32_t, bool> copy_data(byte *p, uint32_t max_len) final
|
|
{
|
|
if (UNIV_UNLIKELY(data_remain <= 0))
|
|
return {0, true};
|
|
uint32_t size= data_remain > max_len ? max_len : data_remain;
|
|
memcpy(p, buffer + data_sofar, size);
|
|
data_remain-= size;
|
|
data_sofar+= size;
|
|
return {size, data_remain == 0};
|
|
}
|
|
~chunk_data_from_buf() { }
|
|
};
|
|
|
|
|
|
/**
|
|
Record data for the XA prepare record.
|
|
|
|
Size needed for the record data:
|
|
1 byte type/flag.
|
|
1 byte engine count.
|
|
4 bytes formatID
|
|
1 byte gtrid length
|
|
1 byte bqual length
|
|
128 bytes (max) gtrid and bqual strings.
|
|
*/
|
|
struct chunk_data_xa_prepare :
|
|
public chunk_data_from_buf<1 + 1 + 4 + 1 + 1 + 128> {
|
|
|
|
chunk_data_xa_prepare(const XID *xid, uchar engine_count)
|
|
{
|
|
/* ToDo: Need the correct data here, like the oob references. To be done when we start doing the XA crash recovery. */
|
|
buffer[0]= 42 /* ToDo */;
|
|
buffer[1]= engine_count;
|
|
int4store(&buffer[2], xid->formatID);
|
|
ut_a(xid->gtrid_length >= 0 && xid->gtrid_length <= 64);
|
|
buffer[6]= (uchar)xid->gtrid_length;
|
|
ut_a(xid->bqual_length >= 0 && xid->bqual_length <= 64);
|
|
buffer[7]= (uchar)xid->bqual_length;
|
|
memcpy(&buffer[8], &xid->data[0], xid->gtrid_length + xid->bqual_length);
|
|
data_remain=
|
|
static_cast<uint32_t>(8 + xid->gtrid_length + xid->bqual_length);
|
|
}
|
|
~chunk_data_xa_prepare() { }
|
|
};
|
|
|
|
|
|
/**
|
|
Record data for the XA COMMIT or XA ROLLBACK record.
|
|
|
|
Size needed for the record data:
|
|
1 byte type/flag.
|
|
4 bytes formatID
|
|
1 byte gtrid length
|
|
1 byte bqual length
|
|
128 bytes (max) gtrid and bqual strings.
|
|
*/
|
|
struct chunk_data_xa_complete :
|
|
public chunk_data_from_buf<1 + 4 + 1 + 1 + 128> {
|
|
|
|
chunk_data_xa_complete(const XID *xid, bool is_commit)
|
|
{
|
|
buffer[0]= (is_commit ? IBB_FL_XA_TYPE_COMMIT : IBB_FL_XA_TYPE_ROLLBACK);
|
|
int4store(&buffer[1], xid->formatID);
|
|
ut_a(xid->gtrid_length >= 0 && xid->gtrid_length <= 64);
|
|
buffer[5]= (uchar)xid->gtrid_length;
|
|
ut_a(xid->bqual_length >= 0 && xid->bqual_length <= 64);
|
|
buffer[6]= (uchar)xid->bqual_length;
|
|
memcpy(&buffer[7], &xid->data[0], xid->gtrid_length + xid->bqual_length);
|
|
data_remain=
|
|
static_cast<uint32_t>(7 + xid->gtrid_length + xid->bqual_length);
|
|
}
|
|
~chunk_data_xa_complete() { }
|
|
};
|
|
|
|
|
|
class gtid_search {
|
|
public:
|
|
gtid_search();
|
|
~gtid_search();
|
|
int find_gtid_pos(slave_connection_state *pos,
|
|
rpl_binlog_state_base *out_state, uint64_t *out_file_no,
|
|
uint64_t *out_offset);
|
|
private:
|
|
uint64_t cur_open_file_no;
|
|
uint64_t cur_open_file_length;
|
|
File cur_open_file;
|
|
};
|
|
|
|
|
|
/* Result of scanning for binlog files (filled in by scan_for_binlogs()). */
struct found_binlogs {
  uint64_t last_file_no, prev_file_no, earliest_file_no;
  size_t last_size, prev_size, total_size;
  int num_found;
  /* The defaulted constructor is there to silence -Wuninitialized warnings. */
  found_binlogs()= default;
};
|
|
|
|
|
|
/**
|
|
This structure holds the state needed during InnoDB recovery for recovering
|
|
binlog tablespace files.
|
|
*/
|
|
class binlog_recovery {
|
|
public:
|
|
struct found_binlogs scan_result;
|
|
byte *page_buf;
|
|
const char *binlog_dir;
|
|
/*
|
|
The current file number being recovered.
|
|
This starts out as the most recent existing non-empty binlog that has a
|
|
starting LSN no bigger than the recovery starting LSN. This should always be
|
|
one of the two most recent binlog files found at startup.
|
|
*/
|
|
uint64_t cur_file_no;
|
|
/* The physical length of cur_file_no file. */
|
|
uint64_t cur_phys_size;
|
|
/*
|
|
The starting LSN (as stored in the header of the binlog tablespace file).
|
|
No redo prior to this LSN should be applied to this file.
|
|
*/
|
|
lsn_t start_file_lsn;
|
|
/* Open file for cur_file_no, or -1 if not open. */
|
|
File cur_file_fh;
|
|
/* The sofar position of redo in cur_file_no (end point of previous redo). */
|
|
uint32_t cur_page_no;
|
|
uint32_t cur_page_offset;
|
|
|
|
/* The path to cur_file_no. */
|
|
char full_path[OS_FILE_MAX_PATH];
|
|
|
|
bool inited;
|
|
/*
|
|
Flag set in case of severe error and --innodb-force_recovery to completely
|
|
skip any binlog recovery.
|
|
*/
|
|
bool skip_recovery;
|
|
/*
|
|
Special case, if we start from completely empty (no non-empty binlog files).
|
|
This should recover into an empty binlog state.
|
|
*/
|
|
bool start_empty;
|
|
/*
|
|
Special case: The last two files are empty. Then we ignore the last empty
|
|
file and use the 2 previous files instead. The ignored file is deleted only
|
|
after successful recovery, to try to avoid destroying data in case of
|
|
recovery problems.
|
|
*/
|
|
bool ignore_last;
|
|
/*
|
|
Mark the case where the first binlog tablespace file we need to consider for
|
|
recovery has file LSN that is later than the first redo record; in this case
|
|
we need to skip records until the first one that applies to this file.
|
|
*/
|
|
bool skipping_early_lsn;
|
|
/*
|
|
Skip any initial records until the start of a page. We are guaranteed that
|
|
any page that needs to be recovered will have recovery data for the whole
|
|
page, and this way we never need to read-modify-write pages during recovery.
|
|
*/
|
|
bool skipping_partial_page;
|
|
|
|
bool init_recovery(bool space_id, uint32_t page_no, uint16_t offset,
|
|
lsn_t start_lsn, lsn_t lsn,
|
|
const byte *buf, size_t size) noexcept;
|
|
bool apply_redo(bool space_id, uint32_t page_no, uint16_t offset,
|
|
lsn_t start_lsn, lsn_t lsn,
|
|
const byte *buf, size_t size) noexcept;
|
|
int get_header(uint64_t file_no, lsn_t &out_lsn, bool &out_empty) noexcept;
|
|
bool init_recovery_from(uint64_t file_no, lsn_t file_lsn, uint32_t page_no,
|
|
uint16_t offset, lsn_t lsn,
|
|
const byte *buf, size_t size) noexcept;
|
|
void init_recovery_empty() noexcept;
|
|
void init_recovery_skip_all() noexcept;
|
|
void end_actions(bool recovery_successful) noexcept;
|
|
void release() noexcept;
|
|
bool open_cur_file() noexcept;
|
|
bool flush_page() noexcept;
|
|
void zero_out_cur_file();
|
|
bool close_file() noexcept;
|
|
bool next_file() noexcept;
|
|
bool next_page() noexcept;
|
|
bool update_page_from_record(uint16_t offset,
|
|
const byte *buf, size_t size) noexcept;
|
|
};
|
|
|
|
|
|
static binlog_recovery recover_obj;
|
|
|
|
|
|
static void innodb_binlog_prealloc_thread();
|
|
static int scan_for_binlogs(const char *binlog_dir, found_binlogs *binlog_files,
|
|
bool error_if_missing) noexcept;
|
|
static int innodb_binlog_discover();
|
|
static bool binlog_state_recover();
|
|
static void innodb_binlog_autopurge(uint64_t first_open_file_no, LF_PINS *pins);
|
|
|
|
|
|
/**
|
|
Read the header of a binlog tablespace file identified by file_no.
|
|
Sets the out_empty false if the file is empty or has checksum error (or
|
|
is missing).
|
|
Else sets out_empty true and sets out_lsn from the header.
|
|
|
|
Returns:
|
|
-1 error
|
|
0 File is missing (ENOENT) or has bad checksum on first page.
|
|
1 File found (but may be empty according to out_empty).
|
|
*/
|
|
int
|
|
get_binlog_header(const char *binlog_path, byte *page_buf,
|
|
lsn_t &out_lsn, bool &out_empty) noexcept
|
|
{
|
|
binlog_header_data header;
|
|
|
|
out_empty= true;
|
|
out_lsn= 0;
|
|
|
|
File fh= my_open(binlog_path, O_RDONLY | O_BINARY, MYF(0));
|
|
if (fh < (File)0)
|
|
return (my_errno == ENOENT ? 0 : -1);
|
|
size_t read= my_pread(fh, page_buf, ibb_page_size, 0, MYF(0));
|
|
my_close(fh, MYF(0));
|
|
if (UNIV_UNLIKELY(read == (size_t)-1))
|
|
return -1;
|
|
if (read == 0)
|
|
return 0;
|
|
/*
|
|
If the crc32 does not match, the page was not written properly, so treat
|
|
it as an empty file.
|
|
*/
|
|
const uint32_t payload= (uint32_t)ibb_page_size - BINLOG_PAGE_CHECKSUM;
|
|
uint32_t crc32= uint4korr(page_buf + payload);
|
|
if (UNIV_UNLIKELY(crc32 != my_crc32c(0, page_buf, payload)))
|
|
return 0;
|
|
|
|
fsp_binlog_extract_header_page(page_buf, &header);
|
|
if (header.is_invalid)
|
|
return 0;
|
|
if (!header.is_empty)
|
|
{
|
|
out_empty= false;
|
|
out_lsn= header.start_lsn;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
|
|
int
|
|
binlog_recovery::get_header(uint64_t file_no, lsn_t &out_lsn, bool &out_empty)
|
|
noexcept
|
|
{
|
|
char full_path[OS_FILE_MAX_PATH];
|
|
binlog_name_make(full_path, file_no, binlog_dir);
|
|
return get_binlog_header(full_path, page_buf, out_lsn, out_empty);
|
|
}
|
|
|
|
|
|
bool binlog_recovery::init_recovery(bool space_id, uint32_t page_no,
|
|
uint16_t offset,
|
|
lsn_t start_lsn, lsn_t end_lsn,
|
|
const byte *buf, size_t size) noexcept
|
|
{
|
|
/* Start by initializing resource pointers so we are safe to releaes(). */
|
|
cur_file_fh= (File)-1;
|
|
if (!(page_buf= static_cast<byte *>
|
|
(ut_malloc(ibb_page_size, mem_key_binlog))))
|
|
{
|
|
my_error(ER_OUTOFMEMORY, MYF(MY_WME), ibb_page_size);
|
|
return true;
|
|
}
|
|
memset(page_buf, 0, ibb_page_size);
|
|
inited= true;
|
|
/*
|
|
ToDo: It would be good to find a way to not duplicate this logic for
|
|
where the binlog tablespace filess are stored with the code in
|
|
innodb_binlog_init(). But it's a bit awkward, because InnoDB recovery
|
|
runs during plugin init, so not even available for the server to call
|
|
into until after recovery is done.
|
|
*/
|
|
binlog_dir= opt_binlog_directory;
|
|
if (!binlog_dir || !binlog_dir[0])
|
|
binlog_dir= ".";
|
|
if (scan_for_binlogs(binlog_dir, &scan_result, true) <= 0)
|
|
return true;
|
|
|
|
/*
|
|
Here we find the two most recent, non-empty binlogs to do recovery on.
|
|
Before we allocate binlog tablespace file N+2, we flush and fsync file N
|
|
to disk. This ensures that we only ever need to apply redo records to the
|
|
two most recent files during recovery.
|
|
|
|
A special case however arises if the two most recent binlog files are
|
|
both completely empty. Then we do not have any LSN to match against to
|
|
know if a redo record applies to one of these two files, or to an earlier
|
|
file with same value of bit 0 of the file_no. In this case, we ignore the
|
|
most recent file (deleting it later after successful recovery), and
|
|
consider instead the two prior files, the first of which is guaranteed to
|
|
have durably saved a starting LSN to use.
|
|
|
|
Hence the loop, which can only ever have one or two iterations.
|
|
|
|
A further special case is if there are fewer than two (or three if last
|
|
two are empty) files. If there are no files, or only empty files, then the
|
|
server must have stopped just after RESET MASTER (or just after
|
|
initializing the binlogs at first startup), and we should just start the
|
|
binlogs from scratch.
|
|
*/
|
|
ignore_last= false;
|
|
uint64_t file_no2= scan_result.last_file_no;
|
|
uint64_t file_no1= scan_result.prev_file_no;
|
|
int num_binlogs= scan_result.num_found;
|
|
for (;;)
|
|
{
|
|
lsn_t lsn1= 0, lsn2= 0;
|
|
bool is_empty1= true, is_empty2= true;
|
|
int res2= get_header(file_no2, lsn2, is_empty2);
|
|
|
|
if (num_binlogs == 0 ||
|
|
(num_binlogs == 1 && is_empty2))
|
|
{
|
|
init_recovery_empty();
|
|
return false;
|
|
}
|
|
if (num_binlogs == 1)
|
|
{
|
|
uint64_t start_file_no= file_no2;
|
|
/*
|
|
Only one binlog file found.
|
|
|
|
This first recovery record may apply to the previous file (which has
|
|
then presumably been purged since the last checkpoint). Or it may
|
|
apply to this file, or only to the following file.
|
|
*/
|
|
if (space_id != (file_no2 & 1) && start_lsn >= lsn2)
|
|
++start_file_no;
|
|
return init_recovery_from(start_file_no, lsn2, page_no, offset,
|
|
start_lsn, buf, size);
|
|
}
|
|
|
|
int res1= get_header(file_no1, lsn1, is_empty1);
|
|
|
|
if (res2 < 0 && !srv_force_recovery)
|
|
{
|
|
sql_print_error("InnoDB: I/O error reading binlog file number %" PRIu64,
|
|
file_no2);
|
|
return true;
|
|
}
|
|
if (res1 < 0 && !srv_force_recovery)
|
|
{
|
|
sql_print_error("InnoDB: I/O error reading binlog file number %" PRIu64,
|
|
file_no1);
|
|
return true;
|
|
}
|
|
if (is_empty1 && is_empty2)
|
|
{
|
|
if (!ignore_last)
|
|
{
|
|
ignore_last= true;
|
|
if (file_no2 > scan_result.earliest_file_no)
|
|
{
|
|
--file_no2;
|
|
if (file_no1 > scan_result.earliest_file_no)
|
|
--file_no1;
|
|
else
|
|
--num_binlogs;
|
|
}
|
|
else
|
|
--num_binlogs;
|
|
continue;
|
|
}
|
|
if (srv_force_recovery)
|
|
{
|
|
/*
|
|
If the last 3 files are empty, we cannot get an LSN to know which
|
|
records apply to each file. This should not happen unless there is
|
|
damage to the file system. If force recovery is requested, we must
|
|
simply do no recovery at all on the binlog files.
|
|
*/
|
|
sql_print_warning("InnoDB: Binlog tablespace file recovery is not "
|
|
"possible. Recovery is skipped due to "
|
|
"--innodb-force-recovery");
|
|
init_recovery_skip_all();
|
|
return false;
|
|
}
|
|
sql_print_error("InnoDB: Last 3 binlog tablespace files are all empty. "
|
|
"Recovery is not possible");
|
|
return true;
|
|
}
|
|
if (is_empty2)
|
|
lsn2= lsn1;
|
|
if (space_id == (file_no2 & 1) && start_lsn >= lsn1)
|
|
{
|
|
if (start_lsn < lsn2 && !srv_force_recovery)
|
|
{
|
|
sql_print_error("InnoDB: inconsistent space_id %d for lsn=%" LSN_PF,
|
|
(int)space_id, start_lsn);
|
|
return true;
|
|
}
|
|
return init_recovery_from(file_no2, lsn2,
|
|
page_no, offset, start_lsn, buf, size);
|
|
}
|
|
else
|
|
return init_recovery_from(file_no1, lsn1,
|
|
page_no, offset, start_lsn, buf, size);
|
|
/* NotReached. */
|
|
}
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_recovery::init_recovery_from(uint64_t file_no, lsn_t file_lsn,
|
|
uint32_t page_no, uint16_t offset,
|
|
lsn_t lsn, const byte *buf, size_t size)
|
|
noexcept
|
|
{
|
|
cur_file_no= file_no;
|
|
cur_phys_size= 0;
|
|
start_file_lsn= file_lsn;
|
|
cur_page_no= page_no;
|
|
cur_page_offset= 0;
|
|
skip_recovery= false;
|
|
start_empty= false;
|
|
skipping_partial_page= true;
|
|
if (lsn < start_file_lsn)
|
|
skipping_early_lsn= true;
|
|
else
|
|
{
|
|
skipping_early_lsn= false;
|
|
if (offset <= BINLOG_PAGE_DATA)
|
|
{
|
|
skipping_partial_page= false;
|
|
return update_page_from_record(offset, buf, size);
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
|
|
Initialize recovery from the state where there are no binlog files, or only
|
|
completely empty binlog files. In this case we have no file LSN to compare
|
|
redo records against.
|
|
|
|
This can only happen if we crash immediately after RESET MASTER (or fresh
|
|
server installation) as an initial file header is durably written to disk
|
|
before binlogging new data. Therefore we should skip _all_ redo records and
|
|
recover into a completely empty state.
|
|
*/
|
|
void
|
|
binlog_recovery::init_recovery_empty() noexcept
|
|
{
|
|
cur_file_no= 0;
|
|
cur_phys_size= 0;
|
|
start_file_lsn= (lsn_t)0;
|
|
cur_page_no= 0;
|
|
cur_page_offset= 0;
|
|
skip_recovery= false;
|
|
start_empty= true;
|
|
ignore_last= false;
|
|
skipping_early_lsn= false;
|
|
skipping_partial_page= true;
|
|
}
|
|
|
|
|
|
/**
  Mark binlog recovery as skipped entirely.

  With skip_recovery set, apply_redo() ignores every redo record, and
  end_actions() neither flushes pages nor deletes any files.
*/
void
binlog_recovery::init_recovery_skip_all() noexcept
{
  skip_recovery= true;
}
|
|
|
|
|
|
void
|
|
binlog_recovery::end_actions(bool recovery_successful) noexcept
|
|
{
|
|
char full_path[OS_FILE_MAX_PATH];
|
|
if (recovery_successful && !skip_recovery)
|
|
{
|
|
if (!start_empty)
|
|
{
|
|
if (cur_page_offset)
|
|
flush_page();
|
|
if (cur_file_fh > (File)-1)
|
|
zero_out_cur_file();
|
|
close_file();
|
|
++cur_file_no;
|
|
}
|
|
|
|
/*
|
|
Delete any binlog tablespace files following the last recovered file.
|
|
These files could be pre-allocated but never used files, or they could be
|
|
files that were written with data that was eventually not recovered due
|
|
to --innodb-flush-log-at-trx-commit=0|2.
|
|
*/
|
|
for (uint64_t i= cur_file_no;
|
|
scan_result.num_found >= 1 && i <= scan_result.last_file_no;
|
|
++i)
|
|
{
|
|
binlog_name_make(full_path, i, binlog_dir);
|
|
if (my_delete(full_path, MYF(MY_WME)))
|
|
sql_print_warning("InnoDB: Could not delete empty file '%s' ("
|
|
"error: %d)", full_path, my_errno);
|
|
}
|
|
}
|
|
release();
|
|
}
|
|
|
|
|
|
void
|
|
binlog_recovery::release() noexcept
|
|
{
|
|
if (cur_file_fh >= (File)0)
|
|
{
|
|
my_close(cur_file_fh, MYF(0));
|
|
cur_file_fh= (File)-1;
|
|
}
|
|
ut_free(page_buf);
|
|
page_buf= nullptr;
|
|
inited= false;
|
|
}
|
|
|
|
|
|
/**
  Open the binlog file with number cur_file_no for reading and writing,
  closing any previously open file first.

  On success, cur_phys_size is set to the current size of the file.

  @return false on success, true on error (an error is reported).
*/
bool
binlog_recovery::open_cur_file() noexcept
{
  if (cur_file_fh >= (File)0)
    my_close(cur_file_fh, MYF(0));

  binlog_name_make(full_path, cur_file_no, binlog_dir);
  cur_file_fh= my_open(full_path, O_RDWR | O_BINARY, MYF(0));
  if (cur_file_fh < (File)0)
  {
    /*
      If we are on page 0 and the binlog file does not exist, then we should
      create it (and recover its content).
      Otherwise, it is an error, we cannot recover it as we are missing the
      start of it.
    */
    const bool can_create= (my_errno == ENOENT && cur_page_no == 0);
    if (can_create)
      cur_file_fh= my_open(full_path, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
                           MYF(0));
    if (cur_file_fh < (File)0)
    {
      my_error(EE_FILENOTFOUND, MYF(MY_WME), full_path, my_errno);
      return true;
    }
  }

  /* Seeking to the end of the file gives its current physical size. */
  cur_phys_size= (uint64_t)my_seek(cur_file_fh, 0, MY_SEEK_END, MYF(0));
  return false;
}
|
|
|
|
|
|
bool
|
|
binlog_recovery::flush_page() noexcept
|
|
{
|
|
if (cur_file_fh < (File)0 &&
|
|
open_cur_file())
|
|
return true;
|
|
size_t res=
|
|
crc32_pwrite_page(cur_file_fh, page_buf, cur_page_no, MYF(MY_WME));
|
|
if (res != ibb_page_size)
|
|
return true;
|
|
cur_page_offset= 0;
|
|
memset(page_buf, 0, ibb_page_size);
|
|
return false;
|
|
}
|
|
|
|
|
|
void
|
|
binlog_recovery::zero_out_cur_file()
|
|
{
|
|
if (cur_file_fh < (File)0)
|
|
return;
|
|
|
|
/* Recover the original size from the current file. */
|
|
int res= crc32_pread_page(cur_file_fh, page_buf, 0, MYF(0));
|
|
if (res <= 0)
|
|
{
|
|
sql_print_warning("InnoDB: Could not read last binlog file during recovery");
|
|
return;
|
|
}
|
|
binlog_header_data header;
|
|
fsp_binlog_extract_header_page(page_buf, &header);
|
|
|
|
if (header.is_invalid)
|
|
{
|
|
sql_print_warning("InnoDB: Invalid header page in last binlog file "
|
|
"during recovery");
|
|
return;
|
|
}
|
|
if (header.is_empty)
|
|
{
|
|
sql_print_warning("InnoDB: Empty binlog file header found during recovery");
|
|
ut_ad(0);
|
|
return;
|
|
}
|
|
|
|
/* Fill up or truncate the file to its original size. */
|
|
if (my_chsize(cur_file_fh, (my_off_t)header.page_count << ibb_page_size_shift,
|
|
0, MYF(0)))
|
|
sql_print_warning("InnoDB: Could not change the size of last binlog file "
|
|
"during recovery (error: %d)", my_errno);
|
|
for (uint32_t i= cur_page_no + 1; i < header.page_count; ++i)
|
|
{
|
|
if (my_pread(cur_file_fh, page_buf, ibb_page_size,
|
|
(my_off_t)i << ibb_page_size_shift, MYF(0)) <
|
|
(size_t)ibb_page_size)
|
|
break;
|
|
/* Check if page already zeroed out. */
|
|
if (page_buf[0] == 0 && !memcmp(page_buf, page_buf+1, ibb_page_size - 1))
|
|
continue;
|
|
memset(page_buf, 0, ibb_page_size);
|
|
if (my_pwrite(cur_file_fh, page_buf, ibb_page_size,
|
|
(uint64_t)i << ibb_page_size_shift, MYF(MY_WME)) <
|
|
(size_t)ibb_page_size)
|
|
{
|
|
sql_print_warning("InnoDB: Error writing to last binlog file during "
|
|
"recovery (error code: %d)", my_errno);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_recovery::close_file() noexcept
|
|
{
|
|
if (cur_file_fh >= (File)0)
|
|
{
|
|
if (my_sync(cur_file_fh, MYF(MY_WME)))
|
|
return true;
|
|
my_close(cur_file_fh, (File)0);
|
|
cur_file_fh= (File)-1;
|
|
cur_phys_size= 0;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_recovery::next_file() noexcept
|
|
{
|
|
if (cur_page_offset && flush_page())
|
|
return true;
|
|
if (close_file())
|
|
return true;
|
|
++cur_file_no;
|
|
cur_page_no= 0;
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_recovery::next_page() noexcept
|
|
{
|
|
if (cur_page_offset && flush_page())
|
|
return true;
|
|
++cur_page_no;
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
  Apply one binlog redo record during recovery.

  Records are expected to arrive in order, with no gaps. The function tracks
  the current file, page and offset, moves on to the next page or file when
  the record indicates it, and copies the record data into the page buffer.
  Consistency violations are errors, unless srv_force_recovery is set, in
  which case they are tolerated.

  @param space_id   Compared against bit 0 of cur_file_no to decide whether
                    the record belongs to the current file.
  @param page_no    Page number the record applies to.
  @param offset     Offset within the page where the record data starts.
  @param start_lsn  LSN of the record, compared against start_file_lsn.
  @param end_lsn    Not used by this function.
  @param buf        The record data.
  @param size       Length of the data in BUF.

  @return false if the record was applied or deliberately skipped, true on
          error.
*/
bool
binlog_recovery::apply_redo(bool space_id, uint32_t page_no, uint16_t offset,
                            lsn_t start_lsn, lsn_t end_lsn,
                            const byte *buf, size_t size) noexcept
{
  /* Nothing to apply when recovery is skipped or starts from empty state. */
  if (UNIV_UNLIKELY(skip_recovery) || start_empty)
    return false;

  /*
    Until a record is seen that starts at (or before) the start of the page
    data, we are in the middle of a page whose start we do not have; skip.
  */
  if (skipping_partial_page)
  {
    if (offset > BINLOG_PAGE_DATA)
      return false;
    skipping_partial_page= false;
  }

  if (skipping_early_lsn)
  {
    if (start_lsn < start_file_lsn || space_id != (cur_file_no & 1))
      return false;  /* Skip record for earlier file that's already durable. */
    /* Now reset the current page to match the real starting point. */
    cur_page_no= page_no;
  }

  /* An LSN from before the start of the file is unexpected at this point. */
  if (UNIV_UNLIKELY(start_lsn < start_file_lsn))
  {
    ut_a(!skipping_early_lsn /* Was handled in condition above */);
    if (!srv_force_recovery)
    {
      sql_print_error("InnoDB: Unexpected LSN " LSN_PF " during recovery, "
                      "expected at least " LSN_PF, start_lsn, start_file_lsn);
      return true;
    }
    sql_print_warning("InnoDB: Ignoring unexpected LSN " LSN_PF " during "
                      "recovery, ", start_lsn);
    return false;
  }
  skipping_early_lsn= false;

  /* Test for moving to the next file. */
  if (space_id != (cur_file_no & 1))
  {
    /*
      Check that we recovered all of this file.
      It is incomplete if the current page is only partially filled, or if
      the file on disk has pages beyond the last one recovered.
    */
    if ( ( (cur_page_offset > BINLOG_PAGE_DATA &&
            cur_page_offset < ibb_page_size - BINLOG_PAGE_DATA_END) ||
           cur_page_no + (cur_page_offset > BINLOG_PAGE_DATA) <
           cur_phys_size >> ibb_page_size_shift) &&
         !srv_force_recovery)
    {
      sql_print_error("InnoDB: Missing recovery record at end of file_no=%"
                      PRIu64 ", LSN " LSN_PF, cur_file_no, start_lsn);
      return true;
    }

    /* Check that we recover from the start of the next file. */
    if ((page_no > 0 || offset > BINLOG_PAGE_DATA) && !srv_force_recovery)
    {
      sql_print_error("InnoDB: Missing recovery record at start of file_no=%"
                      PRIu64 ", LSN " LSN_PF, cur_file_no+1, start_lsn);
      return true;
    }

    if (next_file())
      return true;
  }
  /* Test for moving to the next page. */
  else if (page_no != cur_page_no)
  {
    /* The page we are leaving must not be only partially filled. */
    if (cur_page_offset > BINLOG_PAGE_DATA &&
        cur_page_offset < ibb_page_size - BINLOG_PAGE_DATA_END &&
        !srv_force_recovery)
    {
      sql_print_error("InnoDB: Missing recovery record in file_no=%"
                      PRIu64 ", page_no=%u, LSN " LSN_PF,
                      cur_file_no, cur_page_no, start_lsn);
      return true;
    }

    /* The new page must be the very next one, recovered from its start. */
    if ((page_no != cur_page_no + 1 || offset > BINLOG_PAGE_DATA) &&
        !srv_force_recovery)
    {
      sql_print_error("InnoDB: Missing recovery record in file_no=%"
                      PRIu64 ", page_no=%u, LSN " LSN_PF,
                      cur_file_no, cur_page_no + 1, start_lsn);
      return true;
    }

    if (next_page())
      return true;
  }
  /* Test no gaps in offset. */
  else if (offset != cur_page_offset &&
           offset > BINLOG_PAGE_DATA &&
           !srv_force_recovery)
  {
    sql_print_error("InnoDB: Missing recovery record in file_no=%"
                    PRIu64 ", page_no=%u, LSN " LSN_PF,
                    cur_file_no, cur_page_no, start_lsn);
    return true;
  }

  /* Reject a record that would reach the end of the page buffer. */
  if (offset + size >= ibb_page_size)
    return !srv_force_recovery;

  return update_page_from_record(offset, buf, size);
}
|
|
|
|
|
|
bool
|
|
binlog_recovery::update_page_from_record(uint16_t offset,
|
|
const byte *buf, size_t size) noexcept
|
|
{
|
|
memcpy(page_buf + offset, buf, size);
|
|
if (cur_page_no == 0 && offset == 0)
|
|
{
|
|
binlog_header_data header;
|
|
/*
|
|
This recovery record is for the file header page.
|
|
This record is special, it covers only the used part of the header page.
|
|
The reaminder of the page must be set to zeroes.
|
|
Additionally, there is an extra CRC corresponding to a minimum
|
|
page size of IBB_PAGE_SIZE_MIN, in anticipation for future configurable
|
|
page size.
|
|
*/
|
|
memset(page_buf + size, 0, ibb_page_size - (size + BINLOG_PAGE_DATA_END));
|
|
cur_page_offset= (uint32_t)ibb_page_size - BINLOG_PAGE_DATA_END;
|
|
uint32_t payload= IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM;
|
|
int4store(page_buf + payload, my_crc32c(0, page_buf, payload));
|
|
fsp_binlog_extract_header_page(page_buf, &header);
|
|
if (header.is_invalid)
|
|
{
|
|
sql_print_error("InnoDB: Corrupt or invalid file header found during "
|
|
"recovery of file number %" PRIu64, cur_file_no);
|
|
return !srv_force_recovery;
|
|
}
|
|
if (header.is_empty)
|
|
{
|
|
sql_print_error("InnoDB: Empty file header found during "
|
|
"recovery of file number %" PRIu64, cur_file_no);
|
|
return !srv_force_recovery;
|
|
}
|
|
if (header.file_no != cur_file_no)
|
|
{
|
|
sql_print_error("InnoDB: Inconsistency in file header during recovery. "
|
|
"The header in file number %" PRIu64 " is for file "
|
|
"number %" PRIu64, cur_file_no, header.file_no);
|
|
return !srv_force_recovery;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
cur_page_offset= offset + (uint32_t)size;
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
|
|
Check if this is an InnoDB binlog file name.
|
|
Return the index/file_no if so.
|
|
*/
|
|
bool
|
|
is_binlog_name(const char *name, uint64_t *out_idx)
|
|
{
|
|
const size_t base_len= sizeof(BINLOG_NAME_BASE) - 1; // Length without '\0' terminator
|
|
const size_t ext_len= sizeof(BINLOG_NAME_EXT) - 1;
|
|
|
|
if (0 != strncmp(name, BINLOG_NAME_BASE, base_len))
|
|
return false;
|
|
size_t name_len= strlen(name);
|
|
if (name_len < base_len + 1 + ext_len)
|
|
return false;
|
|
const char *ext_start= name + (name_len - ext_len);
|
|
if (0 != strcmp(ext_start, BINLOG_NAME_EXT))
|
|
return false;
|
|
if (!std::isdigit((unsigned char)(name[base_len])))
|
|
return false;
|
|
char *conv_end= nullptr;
|
|
unsigned long long idx= std::strtoull(name + base_len, &conv_end, 10);
|
|
if (idx == ULLONG_MAX || conv_end != ext_start)
|
|
return false;
|
|
|
|
*out_idx= (uint64_t)idx;
|
|
return true;
|
|
}
|
|
|
|
|
|
/**
  One-time initialization of the InnoDB binlog data structures.

  Sets innodb_binlog_inited to 1, which innodb_binlog_close() checks before
  tearing down what is set up here.
*/
void
innodb_binlog_startup_init()
{
  fsp_binlog_init();
  mysql_mutex_init(fsp_purge_binlog_mutex_key, &purge_binlog_mutex, nullptr);
  binlog_diff_state.init();
  /* Deleted again in innodb_binlog_close() at shutdown. */
  ibb_xa_xid_hash= new ibb_xid_hash();

  innodb_binlog_inited= 1;
}
|
|
|
|
|
|
static void
|
|
innodb_binlog_init_state()
|
|
{
|
|
first_open_binlog_file_no= ~(uint64_t)0;
|
|
for (uint32_t i= 0; i < 4; ++i)
|
|
{
|
|
binlog_cur_end_offset[i].store(~(uint64_t)0, std::memory_order_relaxed);
|
|
binlog_cur_durable_offset[i].store(~(uint64_t)0, std::memory_order_relaxed);
|
|
}
|
|
last_created_binlog_file_no= ~(uint64_t)0;
|
|
earliest_binlog_file_no= ~(uint64_t)0;
|
|
total_binlog_used_size= 0;
|
|
active_binlog_file_no.store(~(uint64_t)0, std::memory_order_release);
|
|
ibb_file_hash.earliest_oob_ref.store(0, std::memory_order_relaxed);
|
|
binlog_cur_page_no= 0;
|
|
binlog_cur_page_offset= BINLOG_PAGE_DATA;
|
|
current_binlog_state_interval=
|
|
(uint64_t)(innodb_binlog_state_interval >> ibb_page_size_shift);
|
|
ut_a(innodb_binlog_state_interval ==
|
|
(current_binlog_state_interval << ibb_page_size_shift));
|
|
}
|
|
|
|
|
|
/** Start the thread that pre-allocates new binlog files. */
|
|
static void
|
|
start_binlog_prealloc_thread()
|
|
{
|
|
prealloc_thread_end= false;
|
|
binlog_prealloc_thr_obj= std::thread{innodb_binlog_prealloc_thread};
|
|
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
while (last_created_binlog_file_no == ~(uint64_t)0) {
|
|
/* Wait for the first binlog file to be available. */
|
|
my_cond_wait(&active_binlog_cond, &active_binlog_mutex.m_mutex);
|
|
}
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
}
|
|
|
|
|
|
/**
  Write the initial header record to the file and durably sync it to disk in
  the binlog tablespace file and in the redo log.

  This is to ensure recovery can work correctly. This way, recovery will
  always find a non-empty file with an initial lsn to start recovery from.
  Except in the case where we crash right here; in this case recovery will
  find no binlog files at all and will know to recover to the empty state
  with no binlog files present.

  Note that the flush and sync below are hard-coded to file number 0.
*/
static void
binlog_sync_initial()
{
  chunk_data_flush dummy_data;
  mtr_t mtr{nullptr};
  LF_PINS *lf_pins= lf_hash_get_pins(&ibb_file_hash.hash);
  ut_a(lf_pins);
  mtr.start();
  /* Write a filler record inside the mini-transaction. */
  fsp_binlog_write_rec(&dummy_data, &mtr, FSP_BINLOG_TYPE_FILLER, lf_pins);
  uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
  mtr.commit();
  lf_hash_put_pins(lf_pins);
  /* Flush the redo log first, then the binlog file itself. */
  log_buffer_flush_to_disk(true);
  binlog_page_fifo->flush_up_to(0, 0);
  binlog_page_fifo->do_fdatasync(0);
  /* Record the commit LSN together with the current end offset of the file. */
  ibb_pending_lsn_fifo.add_to_fifo(mtr.commit_lsn(), file_no,
    binlog_cur_end_offset[file_no & 3].load(std::memory_order_relaxed));
}
|
|
|
|
|
|
void
|
|
ibb_set_max_size(size_t binlog_size)
|
|
{
|
|
uint64_t pages= binlog_size >> ibb_page_size_shift;
|
|
if (UNIV_LIKELY(pages > (uint64_t)UINT32_MAX)) {
|
|
pages= UINT32_MAX;
|
|
sql_print_warning("Requested max_binlog_size is larger than the maximum "
|
|
"InnoDB tablespace size, truncated to %llu",
|
|
(pages << ibb_page_size_shift));
|
|
} else if (pages < 4) {
|
|
pages= 4;
|
|
sql_print_warning("Requested max_binlog_size is smaller than the minimum "
|
|
"size supported by InnoDB, truncated to %llu",
|
|
(pages << ibb_page_size_shift));
|
|
}
|
|
innodb_binlog_size_in_pages= (uint32_t)pages;
|
|
}
|
|
|
|
|
|
/**
|
|
Open the InnoDB binlog implementation.
|
|
This is called from server binlog layer if the user configured the binlog to
|
|
use the innodb implementation (with --binlog-storage-engine=innodb).
|
|
*/
|
|
bool
|
|
innodb_binlog_init(size_t binlog_size, const char *directory)
|
|
{
|
|
ibb_set_max_size(binlog_size);
|
|
if (!directory || !directory[0])
|
|
directory= ".";
|
|
else if (strlen(directory) + BINLOG_NAME_MAX_LEN > OS_FILE_MAX_PATH)
|
|
{
|
|
sql_print_error("Specified binlog directory path '%s' is too long",
|
|
directory);
|
|
return true;
|
|
}
|
|
innodb_binlog_directory= directory;
|
|
|
|
innodb_binlog_init_state();
|
|
innodb_binlog_inited= 2;
|
|
|
|
/* Find any existing binlog files and continue writing in them. */
|
|
int res= innodb_binlog_discover();
|
|
if (res < 0)
|
|
{
|
|
/* ToDo: Need to think more on the error handling if the binlog cannot be opened. We may need to abort starting the server, at least for some errors? And/or in some cases maybe force ignore any existing unusable files and continue with a new binlog (but then maybe innodb_binlog_discover() should return 0 and print warnings in the error log?). */
|
|
return true;
|
|
}
|
|
if (res > 0)
|
|
{
|
|
/* We are continuing from existing binlogs. Recover the binlog state. */
|
|
if (binlog_state_recover())
|
|
return true;
|
|
}
|
|
|
|
start_binlog_prealloc_thread();
|
|
/*
|
|
We are creating binlogs anew from scratch.
|
|
Write and fsync the initial file-header, so that recovery will know where
|
|
to start in case of a crash.
|
|
*/
|
|
binlog_sync_initial();
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
/** Compute the (so far) last and last-but-one binlog files found. */
|
|
static void
|
|
process_binlog_name(found_binlogs *bls, uint64_t idx, size_t size)
|
|
{
|
|
if (bls->num_found == 0)
|
|
{
|
|
bls->earliest_file_no= idx;
|
|
bls->total_size= size;
|
|
}
|
|
else
|
|
{
|
|
if (idx < bls->earliest_file_no)
|
|
bls->earliest_file_no= idx;
|
|
bls->total_size+= size;
|
|
}
|
|
|
|
if (bls->num_found == 0 ||
|
|
idx > bls->last_file_no) {
|
|
if (bls->num_found >= 1 && idx == bls->last_file_no + 1) {
|
|
bls->prev_file_no= bls->last_file_no;
|
|
bls->prev_size= bls->last_size;
|
|
bls->num_found= 2;
|
|
} else {
|
|
bls->num_found= 1;
|
|
}
|
|
bls->last_file_no= idx;
|
|
bls->last_size= size;
|
|
} else if (bls->num_found == 1 && idx + 1 == bls->last_file_no) {
|
|
bls->num_found= 2;
|
|
bls->prev_file_no= idx;
|
|
bls->prev_size= size;
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
Scan the binlog directory for binlog files.
|
|
Returns:
|
|
1 Success
|
|
0 Binlog directory not found
|
|
-1 Other error
|
|
*/
|
|
static int
|
|
scan_for_binlogs(const char *binlog_dir, found_binlogs *binlog_files,
|
|
bool error_if_missing) noexcept
|
|
{
|
|
MY_DIR *dir= my_dir(binlog_dir, MYF(MY_WANT_STAT));
|
|
if (!dir)
|
|
{
|
|
if (my_errno != ENOENT || error_if_missing)
|
|
sql_print_error("Could not read the binlog directory '%s', error code %d",
|
|
binlog_dir, my_errno);
|
|
return (my_errno == ENOENT ? 0 : -1);
|
|
}
|
|
|
|
binlog_files->num_found= 0;
|
|
size_t num_entries= dir->number_of_files;
|
|
fileinfo *entries= dir->dir_entry;
|
|
for (size_t i= 0; i < num_entries; ++i) {
|
|
const char *name= entries[i].name;
|
|
uint64_t idx;
|
|
if (!is_binlog_name(name, &idx))
|
|
continue;
|
|
process_binlog_name(binlog_files, idx, (size_t)entries[i].mystat->st_size);
|
|
}
|
|
my_dirend(dir);
|
|
|
|
return 1; /* Success */
|
|
}
|
|
|
|
|
|
static bool
|
|
binlog_page_empty(const byte *page)
|
|
{
|
|
return page[BINLOG_PAGE_DATA] == 0;
|
|
}
|
|
|
|
|
|
/**
|
|
Find the last written position in the binlog file.
|
|
Do a binary search through the pages to find the last non-empty page, then
|
|
scan the page to find the place to start writing new binlog data.
|
|
|
|
Returns:
|
|
1 position found, output in *out_space, *out_page_no and *out_pos_in_page.
|
|
0 binlog file is empty.
|
|
-1 error.
|
|
*/
|
|
|
|
static int
|
|
find_pos_in_binlog(uint64_t file_no, size_t file_size, byte *page_buf,
|
|
uint32_t *out_page_no, uint32_t *out_pos_in_page,
|
|
binlog_header_data *out_header_data)
|
|
{
|
|
const uint32_t page_size= (uint32_t)ibb_page_size;
|
|
const uint32_t page_size_shift= (uint32_t)ibb_page_size_shift;
|
|
const uint32_t idx= file_no & 3;
|
|
char file_name[OS_FILE_MAX_PATH];
|
|
uint32_t p_0, p_1, p_2, last_nonempty;
|
|
byte *p, *page_end;
|
|
bool ret;
|
|
|
|
*out_page_no= 0;
|
|
*out_pos_in_page= BINLOG_PAGE_DATA;
|
|
out_header_data->diff_state_interval= 0;
|
|
out_header_data->is_invalid= true;
|
|
|
|
binlog_name_make(file_name, file_no);
|
|
pfs_os_file_t fh= os_file_create(innodb_data_file_key, file_name,
|
|
OS_FILE_OPEN, OS_DATA_FILE,
|
|
srv_read_only_mode, &ret);
|
|
if (!ret) {
|
|
sql_print_warning("InnoDB: Unable to open file '%s'", file_name);
|
|
return -1;
|
|
}
|
|
|
|
int res= crc32_pread_page(fh, page_buf, 0, MYF(MY_WME));
|
|
if (res <= 0) {
|
|
os_file_close(fh);
|
|
return -1;
|
|
}
|
|
fsp_binlog_extract_header_page(page_buf, out_header_data);
|
|
if (out_header_data->is_invalid)
|
|
{
|
|
sql_print_error("InnoDB: Invalid or corrupt file header in file "
|
|
"'%s'", file_name);
|
|
return -1;
|
|
}
|
|
if (out_header_data->is_empty) {
|
|
ret=
|
|
fsp_binlog_open(file_name, fh, file_no, file_size, ~(uint32_t)0, nullptr);
|
|
binlog_cur_durable_offset[idx].store(0, std::memory_order_relaxed);
|
|
binlog_cur_end_offset[idx].store(0, std::memory_order_relaxed);
|
|
return (ret ? -1 : 0);
|
|
}
|
|
if (out_header_data->file_no != file_no)
|
|
{
|
|
sql_print_error("InnoDB: Inconsistent file header in file '%s', "
|
|
"wrong file_no %" PRIu64, file_name,
|
|
out_header_data->file_no);
|
|
return -1;
|
|
}
|
|
last_nonempty= 0;
|
|
|
|
/*
|
|
During the binary search, p_0-1 is the largest page number that is know to
|
|
be non-empty. And p_2 is the first page that is known to be empty.
|
|
*/
|
|
p_0= 1;
|
|
p_2= (uint32_t)(file_size / page_size);
|
|
for (;;) {
|
|
if (p_0 == p_2)
|
|
break;
|
|
ut_ad(p_0 < p_2);
|
|
p_1= (p_0 + p_2) / 2;
|
|
res= crc32_pread_page(fh, page_buf, p_1, MYF(MY_WME));
|
|
if (res <= 0) {
|
|
os_file_close(fh);
|
|
return -1;
|
|
}
|
|
if (binlog_page_empty(page_buf)) {
|
|
p_2= p_1;
|
|
} else {
|
|
p_0= p_1 + 1;
|
|
last_nonempty= p_1;
|
|
}
|
|
}
|
|
/* At this point, p_0 == p_2 is the first empty page. */
|
|
ut_ad(p_0 >= 1);
|
|
|
|
/*
|
|
This sometimes does an extra read, but as this is only during startup it
|
|
does not matter.
|
|
*/
|
|
res= crc32_pread_page(fh, page_buf, last_nonempty, MYF(MY_WME));
|
|
if (res <= 0) {
|
|
os_file_close(fh);
|
|
return -1;
|
|
}
|
|
|
|
/* Now scan the last page to find the position in it to continue. */
|
|
p= &page_buf[BINLOG_PAGE_DATA];
|
|
page_end= &page_buf[page_size - BINLOG_PAGE_DATA_END];
|
|
while (*p && p < page_end) {
|
|
if (*p == FSP_BINLOG_TYPE_FILLER) {
|
|
p= page_end;
|
|
break;
|
|
}
|
|
p += 3 + (((uint32_t)p[2] << 8) | ((uint32_t)p[1] & 0xff));
|
|
// ToDo: How to handle page corruption?
|
|
ut_a(p <= page_end);
|
|
}
|
|
|
|
/*
|
|
Normalize the position, so that we store (page_no+1, BINLOG_PAGE_DATA)
|
|
and not (page_no, page_size - BINLOG_PAGE_DATA_END).
|
|
*/
|
|
byte *partial_page;
|
|
if (p == page_end)
|
|
{
|
|
*out_page_no= p_0;
|
|
*out_pos_in_page= BINLOG_PAGE_DATA;
|
|
partial_page= nullptr;
|
|
}
|
|
else
|
|
{
|
|
*out_page_no= p_0 - 1;
|
|
*out_pos_in_page= (uint32_t)(p - page_buf);
|
|
partial_page= page_buf;
|
|
}
|
|
|
|
ret= fsp_binlog_open(file_name, fh, file_no, file_size,
|
|
*out_page_no, partial_page);
|
|
uint64_t pos= (*out_page_no << page_size_shift) | *out_pos_in_page;
|
|
binlog_cur_durable_offset[idx].store(pos, std::memory_order_relaxed);
|
|
binlog_cur_end_offset[idx].store(pos, std::memory_order_relaxed);
|
|
return ret ? -1 : 1;
|
|
}
|
|
|
|
|
|
/**
  Set up the global binlog state after the existing binlog files have been
  examined.

  @param file_no   File number to make the active binlog file. Also used as
                   the initial earliest out-of-band reference, and to
                   initialize the pending LSN fifo.
  @param interval  Value to use for current_binlog_state_interval.
*/
static void
binlog_discover_init(uint64_t file_no, uint64_t interval)
{
  active_binlog_file_no.store(file_no, std::memory_order_release);
  ibb_file_hash.earliest_oob_ref.store(file_no, std::memory_order_relaxed);
  current_binlog_state_interval= interval;
  ibb_pending_lsn_fifo.init(file_no);
}
|
|
|
|
|
|
/**
  Look for existing binlog files in the binlog directory, and set up the
  global binlog state to continue writing from where binlogging stopped (or
  to start from scratch if there is nothing to continue from).

  Returns:
    -1     error
     0     No binlogs found
     1     Just one binlog file found
     2     Found two (or more) existing binlog files

  Note that 0 is also returned when the last binlog file could not be opened
  and a new binlog file will be started after it.
*/
static int
innodb_binlog_discover()
{
  uint64_t file_no;
  const uint32_t page_size= (uint32_t)ibb_page_size;
  const uint32_t page_size_shift= (uint32_t)ibb_page_size_shift;
  struct found_binlogs binlog_files;
  binlog_header_data header;

  int res= scan_for_binlogs(innodb_binlog_directory, &binlog_files, false);
  if (res <= 0)
  {
    /* res == 0 means that the binlog directory does not exist. */
    if (res == 0)
      ibb_pending_lsn_fifo.init(0);
    return res;
  }

  /*
    Now, if we found any binlog files, locate the point in one of them where
    binlogging stopped, and where we should continue writing new binlog data.
  */
  uint32_t page_no, prev_page_no, pos_in_page, prev_pos_in_page;
  /* Page-aligned scratch buffer, freed automatically on every return path. */
  std::unique_ptr<byte, void (*)(void *)>
    page_buf(static_cast<byte*>(aligned_malloc(page_size, page_size)),
             &aligned_free);
  if (!page_buf)
    return -1;
  if (binlog_files.num_found >= 1) {
    earliest_binlog_file_no= binlog_files.earliest_file_no;
    total_binlog_used_size= binlog_files.total_size;

    res= find_pos_in_binlog(binlog_files.last_file_no,
                            binlog_files.last_size,
                            page_buf.get(), &page_no, &pos_in_page,
                            &header);
    if (res < 0) {
      /*
        The last file is unusable. Record it, and report "no binlogs found"
        so that a new file gets created after it.
      */
      file_no= binlog_files.last_file_no;
      if (ibb_record_in_file_hash(file_no, ~(uint64_t)0, ~(uint64_t)0))
        return -1;
      binlog_discover_init(file_no, innodb_binlog_state_interval);
      /* NOTE(review): the message below has a typo, "could no be opened". */
      sql_print_warning("Binlog number %llu could no be opened. Starting a new "
                        "binlog file from number %llu",
                        binlog_files.last_file_no, (file_no + 1));
      return 0;
    }

    if (res > 0) {
      /* Found start position in the last binlog file. */
      file_no= binlog_files.last_file_no;
      if (ibb_record_in_file_hash(file_no, header.oob_ref_file_no,
                                  header.xa_ref_file_no))
        return -1;
      binlog_discover_init(file_no, header.diff_state_interval);
      binlog_cur_page_no= page_no;
      binlog_cur_page_offset= pos_in_page;
      ib::info() << "Continuing binlog number " << file_no << " from position "
                 << (((uint64_t)page_no << page_size_shift) | pos_in_page)
                 << ".";
      return binlog_files.num_found;
    }

    /* res == 0, the last binlog is empty. */
    if (binlog_files.num_found >= 2) {
      /* The last binlog is empty, try the previous one. */
      res= find_pos_in_binlog(binlog_files.prev_file_no,
                              binlog_files.prev_size,
                              page_buf.get(),
                              &prev_page_no, &prev_pos_in_page,
                              &header);
      if (res < 0) {
        /*
          The previous file is unusable; fall back to the empty last file.
          page_no and pos_in_page are still as set by the first call to
          find_pos_in_binlog() above, i.e. the start of the empty file.
        */
        file_no= binlog_files.last_file_no;
        if (ibb_record_in_file_hash(file_no, ~(uint64_t)0, ~(uint64_t)0))
          return -1;
        binlog_discover_init(file_no, innodb_binlog_state_interval);
        binlog_cur_page_no= page_no;
        binlog_cur_page_offset= pos_in_page;
        sql_print_warning("Binlog number %llu could not be opened, starting "
                          "from binlog number %llu instead",
                          binlog_files.prev_file_no, file_no);
        return 1;
      }
      /* Continue in the previous file, at the position found in it. */
      file_no= binlog_files.prev_file_no;
      if (ibb_record_in_file_hash(file_no, header.oob_ref_file_no,
                                  header.xa_ref_file_no))
        return -1;
      binlog_discover_init(file_no, header.diff_state_interval);
      binlog_cur_page_no= prev_page_no;
      binlog_cur_page_offset= prev_pos_in_page;
      ib::info() << "Continuing binlog number " << file_no << " from position "
                 << (((uint64_t)prev_page_no << page_size_shift) |
                     prev_pos_in_page)
                 << ".";
      return binlog_files.num_found;
    }

    /* Just one empty binlog file found. */
    file_no= binlog_files.last_file_no;
    if (ibb_record_in_file_hash(file_no, ~(uint64_t)0, ~(uint64_t)0))
      return -1;
    binlog_discover_init(file_no, innodb_binlog_state_interval);
    binlog_cur_page_no= page_no;
    binlog_cur_page_offset= pos_in_page;
    ib::info() << "Continuing binlog number " << file_no << " from position "
               << BINLOG_PAGE_DATA << ".";
    return binlog_files.num_found;
  }

  /* No binlog files found, start from scratch. */
  file_no= 0;
  earliest_binlog_file_no= 0;
  ibb_file_hash.earliest_oob_ref.store(0, std::memory_order_relaxed);
  total_binlog_used_size= 0;
  ibb_pending_lsn_fifo.init(0);
  current_binlog_state_interval= innodb_binlog_state_interval;
  ib::info() << "Starting a new binlog from file number " << file_no << ".";
  return 0;
}
|
|
|
|
|
|
void innodb_binlog_close(bool shutdown)
|
|
{
|
|
if (innodb_binlog_inited >= 2)
|
|
{
|
|
if (binlog_prealloc_thr_obj.joinable()) {
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
prealloc_thread_end= true;
|
|
pthread_cond_signal(&active_binlog_cond);
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
binlog_prealloc_thr_obj.join();
|
|
}
|
|
|
|
uint64_t file_no= first_open_binlog_file_no;
|
|
if (file_no != ~(uint64_t)0) {
|
|
if (file_no <= last_created_binlog_file_no) {
|
|
fsp_binlog_tablespace_close(file_no);
|
|
if (file_no + 1 <= last_created_binlog_file_no) {
|
|
fsp_binlog_tablespace_close(file_no + 1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (shutdown && innodb_binlog_inited >= 1)
|
|
{
|
|
delete ibb_xa_xid_hash;
|
|
binlog_diff_state.free();
|
|
fsp_binlog_shutdown();
|
|
mysql_mutex_destroy(&purge_binlog_mutex);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
Background thread to close old binlog tablespaces and pre-allocate new ones.
|
|
*/
|
|
static void
|
|
innodb_binlog_prealloc_thread()
|
|
{
|
|
my_thread_init();
|
|
#ifdef UNIV_PFS_THREAD
|
|
pfs_register_thread(binlog_prealloc_thread_key);
|
|
#endif
|
|
LF_PINS *lf_pins= lf_hash_get_pins(&ibb_file_hash.hash);
|
|
ut_a(lf_pins);
|
|
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
while (1)
|
|
{
|
|
uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
uint64_t first_open= first_open_binlog_file_no;
|
|
|
|
/* Pre-allocate the next tablespace (if not done already). */
|
|
uint64_t last_created= last_created_binlog_file_no;
|
|
if (last_created <= active && last_created <= first_open) {
|
|
ut_ad(last_created == active);
|
|
ut_ad(last_created == first_open || first_open == ~(uint64_t)0);
|
|
/*
|
|
Note: `last_created` is initialized to ~0, so incrementing it here
|
|
makes us start from binlog file 0.
|
|
*/
|
|
++last_created;
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
|
|
mysql_mutex_lock(&purge_binlog_mutex);
|
|
uint32_t size_in_pages= innodb_binlog_size_in_pages;
|
|
dberr_t res2= fsp_binlog_tablespace_create(last_created, size_in_pages,
|
|
lf_pins);
|
|
if (earliest_binlog_file_no == ~(uint64_t)0)
|
|
earliest_binlog_file_no= last_created;
|
|
total_binlog_used_size+= (size_in_pages << ibb_page_size_shift);
|
|
|
|
innodb_binlog_autopurge(first_open, lf_pins);
|
|
mysql_mutex_unlock(&purge_binlog_mutex);
|
|
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
/*
|
|
ToDo: Error handling.
|
|
For example, disk full, while tricky to handle well, should not crash
|
|
the server at least.
|
|
*/
|
|
ut_a(res2 == DB_SUCCESS);
|
|
last_created_binlog_file_no= last_created;
|
|
|
|
/* If we created the initial tablespace file, make it the active one. */
|
|
ut_ad(active < ~(uint64_t)0 || last_created == 0);
|
|
if (active == ~(uint64_t)0) {
|
|
binlog_cur_end_offset[last_created & 3].
|
|
store(0, std::memory_order_release);
|
|
binlog_cur_durable_offset[last_created & 3]
|
|
.store(0, std::memory_order_release);
|
|
active_binlog_file_no.store(last_created, std::memory_order_relaxed);
|
|
ibb_file_hash.earliest_oob_ref.store(last_created,
|
|
std::memory_order_relaxed);
|
|
}
|
|
if (first_open == ~(uint64_t)0)
|
|
first_open_binlog_file_no= first_open= last_created;
|
|
|
|
pthread_cond_signal(&active_binlog_cond);
|
|
continue; /* Re-start loop after releasing/reacquiring mutex. */
|
|
}
|
|
|
|
/*
|
|
Flush out to disk and close any binlog tablespace that has been
|
|
completely written.
|
|
*/
|
|
if (first_open < active) {
|
|
ut_ad(first_open == active - 1);
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
fsp_binlog_tablespace_close(active - 1);
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
first_open_binlog_file_no= first_open + 1;
|
|
continue; /* Re-start loop after releasing/reacquiring mutex. */
|
|
}
|
|
|
|
/* Exit thread at server shutdown. */
|
|
if (prealloc_thread_end)
|
|
break;
|
|
my_cond_wait(&active_binlog_cond, &active_binlog_mutex.m_mutex);
|
|
|
|
}
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
|
|
lf_hash_put_pins(lf_pins);
|
|
my_thread_end();
|
|
|
|
#ifdef UNIV_PFS_THREAD
|
|
pfs_delete_thread();
|
|
#endif
|
|
}
|
|
|
|
|
|
bool
|
|
ibb_write_header_page(mtr_t *mtr, uint64_t file_no, uint64_t file_size_in_pages,
|
|
lsn_t start_lsn, uint64_t gtid_state_interval_in_pages,
|
|
LF_PINS *pins)
|
|
{
|
|
fsp_binlog_page_entry *block;
|
|
uint32_t used_bytes;
|
|
|
|
block= binlog_page_fifo->create_page(file_no, 0);
|
|
ut_a(block /* ToDo: error handling? */);
|
|
byte *ptr= &block->page_buf()[0];
|
|
uint64_t oob_ref_file_no=
|
|
ibb_file_hash.earliest_oob_ref.load(std::memory_order_relaxed);
|
|
uint64_t xa_ref_file_no=
|
|
ibb_file_hash.earliest_xa_ref.load(std::memory_order_relaxed);
|
|
ibb_file_hash.update_refs(file_no, pins, oob_ref_file_no, xa_ref_file_no);
|
|
|
|
int4store(ptr, IBB_MAGIC);
|
|
int4store(ptr + 4, ibb_page_size_shift);
|
|
int4store(ptr + 8, IBB_FILE_VERS_MAJOR);
|
|
int4store(ptr + 12, IBB_FILE_VERS_MINOR);
|
|
int8store(ptr + 16, file_no);
|
|
int8store(ptr + 24, file_size_in_pages);
|
|
int8store(ptr + 32, start_lsn);
|
|
int8store(ptr + 40, gtid_state_interval_in_pages);
|
|
int8store(ptr + 48, oob_ref_file_no);
|
|
int8store(ptr + 56, xa_ref_file_no);
|
|
used_bytes= IBB_BINLOG_HEADER_SIZE;
|
|
ut_ad(ibb_page_size >= IBB_HEADER_PAGE_SIZE);
|
|
memset(ptr + used_bytes, 0, ibb_page_size - (used_bytes + BINLOG_PAGE_CHECKSUM));
|
|
/*
|
|
For future expansion with configurable page size:
|
|
Write a CRC32 at the end of the minimal page size. This way, the header
|
|
page can be read and checksummed without knowing the page size used in
|
|
the file, and then the actual page size can be obtained from the header
|
|
page.
|
|
*/
|
|
const uint32_t payload= IBB_HEADER_PAGE_SIZE - BINLOG_PAGE_CHECKSUM;
|
|
int4store(ptr + payload, my_crc32c(0, ptr, payload));
|
|
|
|
fsp_log_header_page(mtr, block, file_no, used_bytes);
|
|
binlog_page_fifo->release_page_mtr(block, mtr);
|
|
|
|
return false; // No error
|
|
}
|
|
|
|
|
|
__attribute__((noinline))
|
|
static ssize_t
|
|
serialize_gtid_state(rpl_binlog_state_base *state, byte *buf, size_t buf_size)
|
|
noexcept
|
|
{
|
|
unsigned char *p= (unsigned char *)buf;
|
|
/*
|
|
1 uint64_t for the number of entries in the state stored.
|
|
1 uint64_t for the XA references file_no.
|
|
2 uint32_t + 1 uint64_t for at least one GTID.
|
|
*/
|
|
ut_ad(buf_size >= 2*COMPR_INT_MAX32 + 3*COMPR_INT_MAX64);
|
|
p= compr_int_write(p, state->count_nolock());
|
|
uint64_t xa_ref_file_no=
|
|
ibb_file_hash.earliest_xa_ref.load(std::memory_order_relaxed);
|
|
/* Write 1 +file_no, so that 0 (1 + ~0) means "no reference". */
|
|
p= compr_int_write(p, xa_ref_file_no + 1);
|
|
unsigned char * const pmax=
|
|
p + (buf_size - (2*COMPR_INT_MAX32 + COMPR_INT_MAX64));
|
|
|
|
if (state->iterate(
|
|
[pmax, &p] (const rpl_gtid *gtid) {
|
|
if (UNIV_UNLIKELY(p > pmax))
|
|
return true;
|
|
p= compr_int_write(p, gtid->domain_id);
|
|
p= compr_int_write(p, gtid->server_id);
|
|
p= compr_int_write(p, gtid->seq_no);
|
|
return false;
|
|
}))
|
|
return -1;
|
|
else
|
|
return p - (unsigned char *)buf;
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_gtid_state(rpl_binlog_state_base *state, mtr_t *mtr,
|
|
fsp_binlog_page_entry * &block, uint32_t &page_no,
|
|
uint32_t &page_offset, uint64_t file_no)
|
|
{
|
|
/*
|
|
Use a small, efficient stack-allocated buffer by default, falling back to
|
|
malloc() if needed for large GTID state.
|
|
*/
|
|
byte small_buf[192];
|
|
byte *buf, *alloced_buf;
|
|
uint32_t block_page_no= ~(uint32_t)0;
|
|
block= nullptr;
|
|
|
|
ssize_t used_bytes= serialize_gtid_state(state, small_buf, sizeof(small_buf));
|
|
if (used_bytes >= 0)
|
|
{
|
|
buf= small_buf;
|
|
alloced_buf= nullptr;
|
|
}
|
|
else
|
|
{
|
|
size_t buf_size= 2*COMPR_INT_MAX64 +
|
|
state->count_nolock() * (2*COMPR_INT_MAX32 + COMPR_INT_MAX64);
|
|
alloced_buf= static_cast<byte *>(ut_malloc(buf_size, mem_key_binlog));
|
|
if (UNIV_UNLIKELY(!alloced_buf))
|
|
return true;
|
|
buf= alloced_buf;
|
|
used_bytes= serialize_gtid_state(state, buf, buf_size);
|
|
if (UNIV_UNLIKELY(used_bytes < 0))
|
|
{
|
|
ut_ad(0 /* Shouldn't happen, as we allocated maximum needed size. */);
|
|
ut_free(alloced_buf);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
const uint32_t page_size= (uint32_t)ibb_page_size;
|
|
const uint32_t page_room= page_size - (BINLOG_PAGE_DATA + BINLOG_PAGE_DATA_END);
|
|
uint32_t needed_pages= (uint32_t)((used_bytes + page_room - 1) / page_room);
|
|
|
|
/* For now, GTID state always at the start of a page. */
|
|
ut_ad(page_offset == BINLOG_PAGE_DATA);
|
|
/* Page 0 is reserved for the header page. */
|
|
ut_ad(page_no != 0);
|
|
|
|
/*
|
|
Only write the GTID state record if there is room for actual event data
|
|
afterwards. There is no point in using space to allow fast search to a
|
|
point if there is no data to search for after that point.
|
|
*/
|
|
if (page_no + needed_pages < binlog_page_fifo->size_in_pages(file_no))
|
|
{
|
|
byte cont_flag= 0;
|
|
while (used_bytes > 0)
|
|
{
|
|
ut_ad(page_no < binlog_page_fifo->size_in_pages(file_no));
|
|
if (block)
|
|
binlog_page_fifo->release_page_mtr(block, mtr);
|
|
block_page_no= page_no;
|
|
block= binlog_page_fifo->create_page(file_no, block_page_no);
|
|
ut_a(block /* ToDo: error handling? */);
|
|
page_offset= BINLOG_PAGE_DATA;
|
|
byte *ptr= page_offset + &block->page_buf()[0];
|
|
uint32_t chunk= (uint32_t)used_bytes;
|
|
byte last_flag= FSP_BINLOG_FLAG_LAST;
|
|
if (chunk > page_room - 3) {
|
|
last_flag= 0;
|
|
chunk= page_room - 3;
|
|
++page_no;
|
|
}
|
|
ptr[0]= FSP_BINLOG_TYPE_GTID_STATE | cont_flag | last_flag;
|
|
ptr[1] = (byte)chunk & 0xff;
|
|
ptr[2] = (byte)(chunk >> 8);
|
|
ut_ad(chunk <= 0xffff);
|
|
memcpy(ptr+3, buf, chunk);
|
|
fsp_log_binlog_write(mtr, block, file_no, block_page_no, page_offset,
|
|
(uint32)(chunk+3));
|
|
page_offset+= chunk + 3;
|
|
buf+= chunk;
|
|
used_bytes-= chunk;
|
|
cont_flag= FSP_BINLOG_FLAG_CONT;
|
|
}
|
|
|
|
if (page_offset == page_size - BINLOG_PAGE_DATA_END) {
|
|
if (block)
|
|
binlog_page_fifo->release_page_mtr(block, mtr);
|
|
block= nullptr;
|
|
block_page_no= ~(uint32_t)0;
|
|
page_offset= BINLOG_PAGE_DATA;
|
|
++page_no;
|
|
}
|
|
}
|
|
ut_free(alloced_buf);
|
|
|
|
/* Make sure we return a page for caller to write the main event data into. */
|
|
if (UNIV_UNLIKELY(!block)) {
|
|
block= binlog_page_fifo->create_page(file_no, page_no);
|
|
ut_a(block /* ToDo: error handling? */);
|
|
}
|
|
|
|
return false; // No error
|
|
}
|
|
|
|
|
|
/**
|
|
Read a binlog state record. The passed in STATE object is updated with the
|
|
state read.
|
|
|
|
Returns:
|
|
1 State record found
|
|
0 No state record found
|
|
-1 Error
|
|
*/
|
|
static int
|
|
read_gtid_state(binlog_chunk_reader *chunk_reader,
|
|
rpl_binlog_state_base *state,
|
|
uint64_t *out_xa_ref_file_no) noexcept
|
|
{
|
|
byte buf[256];
|
|
static_assert(sizeof(buf) >= 2*COMPR_INT_MAX64 + 6*COMPR_INT_MAX64,
|
|
"buf must hold at least 2 GTIDs");
|
|
int res= chunk_reader->read_data(buf, sizeof(buf), true);
|
|
if (UNIV_UNLIKELY(res < 0))
|
|
return -1;
|
|
if (res == 0 || chunk_reader->cur_type() != FSP_BINLOG_TYPE_GTID_STATE)
|
|
return 0;
|
|
const byte *p= buf;
|
|
const byte *p_end= buf + res;
|
|
|
|
/* Read the number of GTIDs in the gtid state record. */
|
|
std::pair<uint64_t, const unsigned char *> v_and_p= compr_int_read(buf);
|
|
p= v_and_p.second;
|
|
if (UNIV_UNLIKELY(p > p_end))
|
|
return -1;
|
|
uint64_t num_gtid= v_and_p.first;
|
|
/*
|
|
Read the earliest file_no containing pending XA if any.
|
|
Note that unsigned underflow means 0 - 1 becomes ~0, as required.
|
|
*/
|
|
v_and_p= compr_int_read(p);
|
|
p= v_and_p.second;
|
|
if (UNIV_UNLIKELY(p > p_end))
|
|
return -1;
|
|
*out_xa_ref_file_no= v_and_p.first - 1;
|
|
|
|
/* Read each GTID one by one and add into the state. */
|
|
for (uint64_t count= num_gtid; count > 0; --count)
|
|
{
|
|
ptrdiff_t remain= p_end - p;
|
|
/* Read more data as needed to ensure we have read a full GTID. */
|
|
if (UNIV_UNLIKELY(!chunk_reader->end_of_record()) &&
|
|
UNIV_UNLIKELY(remain < 3*COMPR_INT_MAX64))
|
|
{
|
|
memmove(buf, p, remain);
|
|
res= chunk_reader->read_data(buf + remain, (int)(sizeof(buf) - remain),
|
|
true);
|
|
if (UNIV_UNLIKELY(res < 0))
|
|
return -1;
|
|
p= buf;
|
|
p_end= p + remain + res;
|
|
remain+= res;
|
|
}
|
|
rpl_gtid gtid;
|
|
if (UNIV_UNLIKELY(p >= p_end))
|
|
return -1;
|
|
v_and_p= compr_int_read(p);
|
|
if (UNIV_UNLIKELY(v_and_p.first > UINT32_MAX))
|
|
return -1;
|
|
gtid.domain_id= (uint32_t)v_and_p.first;
|
|
p= v_and_p.second;
|
|
if (UNIV_UNLIKELY(p >= p_end))
|
|
return -1;
|
|
v_and_p= compr_int_read(p);
|
|
if (UNIV_UNLIKELY(v_and_p.first > UINT32_MAX))
|
|
return -1;
|
|
gtid.server_id= (uint32_t)v_and_p.first;
|
|
p= v_and_p.second;
|
|
if (UNIV_UNLIKELY(p >= p_end))
|
|
return -1;
|
|
v_and_p= compr_int_read(p);
|
|
gtid.seq_no= v_and_p.first;
|
|
p= v_and_p.second;
|
|
if (UNIV_UNLIKELY(p > p_end))
|
|
return -1;
|
|
if (state->update_nolock(>id))
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
For now, we expect no more data.
|
|
Later it could be extended, as we store (and read) the count of GTIDs.
|
|
*/
|
|
ut_ad(p == p_end);
|
|
|
|
return 1;
|
|
}
|
|
|
|
|
|
/**
|
|
Recover the GTID binlog state at startup.
|
|
Read the full binlog state at the start of the current binlog file, as well
|
|
as the last differential binlog state on top, if any. Then scan from there to
|
|
the end to obtain the exact current GTID binlog state.
|
|
|
|
Return false if ok, true if error.
|
|
*/
|
|
static bool
|
|
binlog_state_recover()
|
|
{
|
|
rpl_binlog_state_base state;
|
|
state.init();
|
|
uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
uint64_t diff_state_interval= current_binlog_state_interval;
|
|
uint32_t page_no= 1;
|
|
uint64_t xa_ref_file_no;
|
|
|
|
binlog_chunk_reader chunk_reader(binlog_cur_end_offset);
|
|
byte *page_buf=
|
|
static_cast<byte *>(ut_malloc(ibb_page_size, mem_key_binlog));
|
|
if (!page_buf)
|
|
return true;
|
|
chunk_reader.set_page_buf(page_buf);
|
|
chunk_reader.seek(active, page_no << ibb_page_size_shift);
|
|
int res= read_gtid_state(&chunk_reader, &state, &xa_ref_file_no);
|
|
if (res < 0)
|
|
{
|
|
ut_free(page_buf);
|
|
return true;
|
|
}
|
|
if (diff_state_interval == 0)
|
|
{
|
|
sql_print_warning("Invalid differential binlog state interval %llu found "
|
|
"in binlog file, ignoring", diff_state_interval);
|
|
}
|
|
else
|
|
{
|
|
page_no= (uint32_t)(binlog_cur_page_no -
|
|
(binlog_cur_page_no % diff_state_interval));
|
|
while (page_no > 1)
|
|
{
|
|
chunk_reader.seek(active, page_no << ibb_page_size_shift);
|
|
res= read_gtid_state(&chunk_reader, &state, &xa_ref_file_no);
|
|
if (res > 0)
|
|
break;
|
|
page_no-= (uint32_t)diff_state_interval;
|
|
}
|
|
}
|
|
ut_free(page_buf);
|
|
|
|
ha_innodb_binlog_reader reader(false, active,
|
|
page_no << ibb_page_size_shift);
|
|
return binlog_recover_gtid_state(&state, &reader);
|
|
}
|
|
|
|
|
|
/** Allocate a context for out-of-band binlogging. */
|
|
static binlog_oob_context *
|
|
alloc_oob_context(uint32 list_length= 10)
|
|
{
|
|
size_t needed= sizeof(binlog_oob_context) +
|
|
list_length * sizeof(binlog_oob_context::node_info);
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(ut_malloc(needed, mem_key_binlog));
|
|
if (c)
|
|
{
|
|
if (!(c->lf_pins= lf_hash_get_pins(&ibb_file_hash.hash)))
|
|
{
|
|
my_error(ER_OUT_OF_RESOURCES, MYF(0));
|
|
ut_free(c);
|
|
return nullptr;
|
|
}
|
|
c->stmt_start_point= nullptr;
|
|
c->savepoint_stack= nullptr;
|
|
c->pending_file_no= ~(uint64_t)0;
|
|
c->node_list_alloc_len= list_length;
|
|
c->node_list_len= 0;
|
|
c->secondary_ctx= nullptr;
|
|
c->pending_refcount= false;
|
|
c->is_xa_prepared= false;
|
|
}
|
|
else
|
|
my_error(ER_OUTOFMEMORY, MYF(0), needed);
|
|
|
|
return c;
|
|
}
|
|
|
|
|
|
static void
|
|
innodb_binlog_write_cache(IO_CACHE *cache,
|
|
handler_binlog_event_group_info *binlog_info, mtr_t *mtr)
|
|
{
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
|
|
if (!c)
|
|
binlog_info->engine_ptr= c= alloc_oob_context();
|
|
ut_a(c);
|
|
|
|
if (unlikely(binlog_info->xa_xid))
|
|
{
|
|
/*
|
|
Write an XID commit record just before the main commit record.
|
|
The XID commit record just contains the XID, and is used by binlog XA
|
|
crash recovery to ensure than the other storage engine(s) that are part
|
|
of the transaciton commit or rollback consistently with the binlog
|
|
engine.
|
|
*/
|
|
chunk_data_xa_complete chunk_data2(binlog_info->xa_xid, true);
|
|
fsp_binlog_write_rec(&chunk_data2, mtr, FSP_BINLOG_TYPE_XA_COMPLETE,
|
|
c->lf_pins);
|
|
}
|
|
|
|
chunk_data_cache chunk_data(cache, binlog_info);
|
|
|
|
fsp_binlog_write_rec(&chunk_data, mtr, FSP_BINLOG_TYPE_COMMIT, c->lf_pins);
|
|
uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
c->pending_file_no= file_no;
|
|
c->pending_offset=
|
|
binlog_cur_end_offset[file_no & 3].load(std::memory_order_relaxed);
|
|
}
|
|
|
|
|
|
static inline void
|
|
reset_oob_context(binlog_oob_context *c)
|
|
{
|
|
if (c->stmt_start_point)
|
|
c->stmt_start_point->node_list_len= 0;
|
|
while (c->savepoint_stack != nullptr)
|
|
{
|
|
binlog_oob_context::savepoint *next_savepoint= c->savepoint_stack->next;
|
|
ut_free(c->savepoint_stack);
|
|
c->savepoint_stack= next_savepoint;
|
|
}
|
|
c->pending_file_no= ~(uint64_t)0;
|
|
if (c->pending_refcount)
|
|
{
|
|
ibb_file_hash.oob_ref_dec(c->first_node_file_no, c->lf_pins);
|
|
c->pending_refcount= false;
|
|
}
|
|
c->node_list_len= 0;
|
|
c->secondary_ctx= nullptr;
|
|
c->is_xa_prepared= false;
|
|
}
|
|
|
|
|
|
static inline void
|
|
free_oob_context(binlog_oob_context *c)
|
|
{
|
|
ut_ad(!c->pending_refcount /* Should not have pending until free */);
|
|
reset_oob_context(c); /* Defensive programming, should be redundant */
|
|
ut_free(c->stmt_start_point);
|
|
lf_hash_put_pins(c->lf_pins);
|
|
ut_free(c);
|
|
}
|
|
|
|
|
|
/**
  Make sure the out-of-band context has room for at least `needed_len`
  entries in its node list, re-allocating it if necessary.

  When re-allocating, the list grows by at least 10 entries. The old context
  (including its lf_pins, stmt_start_point and savepoint_stack pointers) is
  copied into the new allocation, *engine_data is updated to point to it, and
  the old allocation is freed.

  @param engine_data  [in,out] Pointer to the context pointer; must point to
                      a non-null context
  @param needed_len   Required number of node list entries

  @return The (possibly re-allocated) context, or nullptr if allocation
          failed; in that case *engine_data still holds the old context.
*/
static binlog_oob_context *
ensure_oob_context(void **engine_data, uint32_t needed_len)
{
  binlog_oob_context *c= static_cast<binlog_oob_context *>(*engine_data);
  if (c->node_list_alloc_len >= needed_len)
    return c;
  if (needed_len < c->node_list_alloc_len + 10)
    needed_len= c->node_list_alloc_len + 10;
  binlog_oob_context *new_c= alloc_oob_context(needed_len);
  if (UNIV_UNLIKELY(!new_c))
    return nullptr;
  ut_ad(c->node_list_len <= c->node_list_alloc_len);
  /*
    alloc_oob_context() acquired fresh LF_PINS for new_c, but the memcpy()
    below replaces new_c->lf_pins with the pins owned by the old context
    (whose ownership moves to new_c). Release the fresh pins first so they
    are not leaked.
  */
  lf_hash_put_pins(new_c->lf_pins);
  memcpy(new_c, c, sizeof(binlog_oob_context) +
         c->node_list_len*sizeof(binlog_oob_context::node_info));
  new_c->node_list_alloc_len= needed_len;
  *engine_data= new_c;
  /* Plain free: everything c owned is now owned by new_c. */
  ut_free(c);
  return new_c;
}
|
|
|
|
|
|
/**
|
|
Binlog an out-of-band piece of event group data.
|
|
|
|
For large transactions, we binlog the data in pieces spread out over the
|
|
binlog file(s), to avoid a large stall to write large amounts of data during
|
|
transaction commit, and to avoid having to keep all of the transaction in
|
|
memory or spill it to temporary file.
|
|
|
|
The chunks of data are written out in a binary tree structure, to allow
|
|
efficiently reading the transaction back in order from start to end. Note
|
|
that the binlog is written append-only, so we cannot simply link each chunk
|
|
to the following chunk, as the following chunk is unknown when binlogging the
|
|
prior chunk. With a binary tree structure, the reader can do a post-order
|
|
traversal and only need to keep log_2(N) node pointers in-memory at any time.
|
|
|
|
A perfect binary tree of height h has 2**h - 1 nodes. At any time during a
|
|
transaction, the out-of-band data in the binary log for that transaction
|
|
consists of a forest (eg. a list) of perfect binary trees of strictly
|
|
decreasing height, except that the last two trees may have the same height.
|
|
For example, here is how it looks for a transaction where 13 nodes (0-12)
|
|
have been binlogged out-of-band so far:
|
|
|
|
6
|
|
_ / \_
|
|
2 5 9 12
|
|
/ \ / \ / \ / \
|
|
0 1 3 4 7 8 10 11
|
|
|
|
In addition to the shown binary tree parent->child pointers, each leaf has a
|
|
(single) link to the root node of the prior (at the time the leaf was added)
|
|
tree. In the example this means the following links:
|
|
11->10, 10->9, 8->7, 7->6, 4->3, 3->2, 1->0
|
|
This allows to fully traverse the forest of perfect binary trees starting
|
|
from the last node (12 in the example). In the example, only 10->9 and 7->6
|
|
will be needed, but the other links would be needed if the tree had been
|
|
completed at earlier stages.
|
|
|
|
As a new node is added, there are two different cases on how to maintain
|
|
the binary tree forest structure:
|
|
|
|
1. If the last two trees in the forest have the same height h, then those
|
|
two trees are replaced by a single tree of height (h+1) with the new
|
|
node as root and the two trees as left and right child. The number of
|
|
trees in the forest thus decrease by one.
|
|
|
|
2. Otherwise the new node is added at the end of the forest as a tree of
|
|
height 1; in this case the forest increases by one tree.
|
|
|
|
In both cases, we maintain the invariants that the forest consist of a list
|
|
of perfect binary trees, and that the heights of the trees are strictly
|
|
decreasing except that the last two trees can have the same height.
|
|
|
|
When a transaction is committed, the commit record contains a pointer to
|
|
the root node of the last tree in the forest. If the transaction is never
|
|
committed (explicitly rolled back or lost due to disconnect or server
|
|
restart or crash), then the out-of-band data is simply left in place; it
|
|
will be ignored by readers and eventually discarded as the old binlog files
|
|
are purged.
|
|
*/
|
|
bool
|
|
innodb_binlog_oob_ordered(THD *thd, const unsigned char *data, size_t data_len,
|
|
void **engine_data, void **stm_start_data,
|
|
void **savepoint_data)
|
|
{
|
|
binlog_oob_context *c= static_cast<binlog_oob_context *>(*engine_data);
|
|
if (!c)
|
|
*engine_data= c= alloc_oob_context();
|
|
if (UNIV_UNLIKELY(!c))
|
|
return true;
|
|
if (UNIV_UNLIKELY(c->is_xa_prepared))
|
|
{
|
|
my_error(ER_XAER_RMFAIL, MYF(0), "IDLE");
|
|
return true;
|
|
}
|
|
|
|
if (stm_start_data)
|
|
{
|
|
if (c->create_stmt_start_point())
|
|
return true;
|
|
*stm_start_data= nullptr; /* We do not need to store any data there. */
|
|
if (data_len == 0 && !savepoint_data)
|
|
return false;
|
|
}
|
|
if (savepoint_data)
|
|
{
|
|
binlog_oob_context::savepoint *sv= c->create_savepoint();
|
|
if (!sv)
|
|
return true;
|
|
*((binlog_oob_context::savepoint **)savepoint_data)= sv;
|
|
if (data_len == 0)
|
|
return false;
|
|
}
|
|
ut_ad(data_len > 0);
|
|
|
|
mtr_t mtr{thd_to_trx(thd)};
|
|
uint32_t i= c->node_list_len;
|
|
uint64_t new_idx= i==0 ? 0 : c->node_list[i-1].node_index + 1;
|
|
if (i >= 2 && c->node_list[i-2].height == c->node_list[i-1].height)
|
|
{
|
|
/* Case 1: Replace two trees with a tree rooted in a new node. */
|
|
binlog_oob_context::chunk_data_oob oob_data
|
|
(new_idx,
|
|
c->node_list[i-2].file_no, c->node_list[i-2].offset,
|
|
c->node_list[i-1].file_no, c->node_list[i-1].offset,
|
|
static_cast<const byte *>(data), data_len);
|
|
if (c->binlog_node(i-2, new_idx, i-2, i-1, &oob_data, c->lf_pins, &mtr))
|
|
return true;
|
|
c->node_list_len= i - 1;
|
|
}
|
|
else if (i > 0)
|
|
{
|
|
/* Case 2: Add the new node as a singleton tree. */
|
|
c= ensure_oob_context(engine_data, i+1);
|
|
if (!c)
|
|
return true;
|
|
binlog_oob_context::chunk_data_oob oob_data
|
|
(new_idx,
|
|
0, 0, /* NULL left child signifies a leaf */
|
|
c->node_list[i-1].file_no, c->node_list[i-1].offset,
|
|
static_cast<const byte *>(data), data_len);
|
|
if (c->binlog_node(i, new_idx, i-1, i-1, &oob_data, c->lf_pins, &mtr))
|
|
return true;
|
|
c->node_list_len= i + 1;
|
|
}
|
|
else
|
|
{
|
|
/* Special case i==0, like case 2 but no prior node to link to. */
|
|
binlog_oob_context::chunk_data_oob oob_data
|
|
(new_idx, 0, 0, 0, 0, static_cast<const byte *>(data), data_len);
|
|
if (c->binlog_node(i, new_idx, ~(uint32_t)0, ~(uint32_t)0, &oob_data,
|
|
c->lf_pins, &mtr))
|
|
return true;
|
|
c->first_node_file_no= c->node_list[i].file_no;
|
|
c->first_node_offset= c->node_list[i].offset;
|
|
c->node_list_len= 1;
|
|
c->pending_refcount=
|
|
!ibb_file_hash.oob_ref_inc(c->first_node_file_no, c->lf_pins);
|
|
}
|
|
|
|
uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
c->pending_file_no= file_no;
|
|
c->pending_offset=
|
|
binlog_cur_end_offset[file_no & 3].load(std::memory_order_relaxed);
|
|
innodb_binlog_post_commit(&mtr, c);
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
innodb_binlog_oob(THD *thd, const unsigned char *data, size_t data_len,
|
|
void **engine_data)
|
|
{
|
|
binlog_oob_context *c= static_cast<binlog_oob_context *>(*engine_data);
|
|
if (UNIV_LIKELY(c != nullptr))
|
|
ibb_pending_lsn_fifo.record_commit(c);
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
|
|
Binlog a new out-of-band tree node and put it at position `node` in the list
|
|
of trees. A leaf node is denoted by left and right child being identical (and
|
|
in this case they point to the root of the prior tree).
|
|
*/
|
|
bool
|
|
binlog_oob_context::binlog_node(uint32_t node, uint64_t new_idx,
|
|
uint32_t left_node, uint32_t right_node,
|
|
chunk_data_oob *oob_data, LF_PINS *pins,
|
|
mtr_t *mtr)
|
|
{
|
|
uint32_t new_height=
|
|
left_node == right_node ? 1 : 1 + node_list[left_node].height;
|
|
mtr->start();
|
|
std::pair<uint64_t, uint64_t> new_file_no_offset=
|
|
fsp_binlog_write_rec(oob_data, mtr, FSP_BINLOG_TYPE_OOB_DATA, pins);
|
|
mtr->commit();
|
|
node_list[node].file_no= new_file_no_offset.first;
|
|
node_list[node].offset= new_file_no_offset.second;
|
|
node_list[node].node_index= new_idx;
|
|
node_list[node].height= new_height;
|
|
return false; // ToDo: Error handling?
|
|
}
|
|
|
|
|
|
/**
  Set up the data for an out-of-band record: a header consisting of the node
  index followed by the (file_no, offset) of the left and right child, each
  written with compr_int_write(), followed by the main data.

  The main data is not copied; `data` must remain valid while the chunk is
  being written.
*/
binlog_oob_context::chunk_data_oob::chunk_data_oob(uint64_t idx,
        uint64_t left_file_no, uint64_t left_offset,
        uint64_t right_file_no, uint64_t right_offset,
        const byte *data, size_t data_len)
  : sofar(0), main_len(data_len), main_data(data)
{
  ut_ad(data_len > 0);
  byte *p= &header_buf[0];
  p= compr_int_write(p, idx);
  p= compr_int_write(p, left_file_no);
  p= compr_int_write(p, left_offset);
  p= compr_int_write(p, right_file_no);
  p= compr_int_write(p, right_offset);
  ut_ad((uint32_t)(p - &header_buf[0]) <= max_buffer);
  header_len= (uint32_t)(p - &header_buf[0]);
}
|
|
|
|
|
|
std::pair<uint32_t, bool>
|
|
binlog_oob_context::chunk_data_oob::copy_data(byte *p, uint32_t max_len)
|
|
{
|
|
uint32_t size= 0;
|
|
/* First write header data, if any left. */
|
|
if (sofar < header_len)
|
|
{
|
|
size= std::min(header_len - (uint32_t)sofar, max_len);
|
|
memcpy(p, header_buf + sofar, size);
|
|
p+= size;
|
|
sofar+= size;
|
|
if (UNIV_UNLIKELY(max_len == size))
|
|
return {size, sofar == header_len + main_len};
|
|
max_len-= size;
|
|
}
|
|
|
|
/* Then write the main chunk data. */
|
|
ut_ad(sofar >= header_len);
|
|
ut_ad(main_len > 0);
|
|
uint32_t size2=
|
|
(uint32_t)std::min(header_len + main_len - sofar, (uint64_t)max_len);
|
|
memcpy(p, main_data + (sofar - header_len), size2);
|
|
sofar+= size2;
|
|
return {size + size2, sofar == header_len + main_len};
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_oob_context::create_stmt_start_point()
|
|
{
|
|
if (!stmt_start_point || node_list_len > stmt_start_point->alloc_len)
|
|
{
|
|
ut_free(stmt_start_point);
|
|
size_t size= sizeof(savepoint) + node_list_len * sizeof(node_info);
|
|
stmt_start_point=
|
|
static_cast<savepoint *>(ut_malloc(size, mem_key_binlog));
|
|
if (!stmt_start_point)
|
|
{
|
|
my_error(ER_OUTOFMEMORY, MYF(0), size);
|
|
return true;
|
|
}
|
|
stmt_start_point->alloc_len= node_list_len;
|
|
}
|
|
stmt_start_point->node_list_len= node_list_len;
|
|
memcpy(stmt_start_point->node_list, node_list,
|
|
node_list_len * sizeof(node_info));
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
  Create a savepoint holding a copy of the current node list and push it on
  top of the savepoint stack.

  @return The new savepoint, or nullptr on out-of-memory (error reported with
          my_error()).
*/
binlog_oob_context::savepoint *
binlog_oob_context::create_savepoint()
{
  size_t size= sizeof(savepoint) + node_list_len * sizeof(node_info);
  savepoint *s= static_cast<savepoint *>(ut_malloc(size, mem_key_binlog));
  if (!s)
  {
    /*
      my_error() is variadic; pass an int rather than a size_t
      (presumably ER_OUTOFMEMORY formats its argument with %d -- confirm).
    */
    my_error(ER_OUTOFMEMORY, MYF(0), (int)size);
    return nullptr;
  }
  s->next= savepoint_stack;
  s->node_list_len= node_list_len;
  memcpy(s->node_list, node_list, node_list_len * sizeof(node_info));
  savepoint_stack= s;
  return s;
}
|
|
|
|
|
|
void
|
|
binlog_oob_context::rollback_to_savepoint(savepoint *savepoint)
|
|
{
|
|
ut_a(node_list_alloc_len >= savepoint->node_list_len);
|
|
node_list_len= savepoint->node_list_len;
|
|
memcpy(node_list, savepoint->node_list,
|
|
savepoint->node_list_len * sizeof(node_info));
|
|
|
|
/* Remove any later savepoints from the stack. */
|
|
for (;;)
|
|
{
|
|
struct savepoint *s= savepoint_stack;
|
|
ut_ad(s != nullptr /* Should always find the savepoint on the stack. */);
|
|
if (UNIV_UNLIKELY(!s))
|
|
break;
|
|
if (s == savepoint)
|
|
break;
|
|
savepoint_stack= s->next;
|
|
ut_free(s);
|
|
}
|
|
}
|
|
|
|
|
|
void
|
|
binlog_oob_context::rollback_to_stmt_start()
|
|
{
|
|
ut_a(node_list_alloc_len >= stmt_start_point->node_list_len);
|
|
node_list_len= stmt_start_point->node_list_len;
|
|
memcpy(node_list, stmt_start_point->node_list,
|
|
stmt_start_point->node_list_len * sizeof(node_info));
|
|
}
|
|
|
|
|
|
void
|
|
ibb_savepoint_rollback(THD *thd, void **engine_data,
|
|
void **stmt_start_data, void **savepoint_data)
|
|
{
|
|
binlog_oob_context *c= static_cast<binlog_oob_context *>(*engine_data);
|
|
ut_a(c != nullptr);
|
|
|
|
if (stmt_start_data)
|
|
{
|
|
ut_ad(savepoint_data == nullptr);
|
|
c->rollback_to_stmt_start();
|
|
}
|
|
|
|
if (savepoint_data)
|
|
{
|
|
ut_ad(stmt_start_data == nullptr);
|
|
binlog_oob_context::savepoint *savepoint=
|
|
(binlog_oob_context::savepoint *)*savepoint_data;
|
|
c->rollback_to_savepoint(savepoint);
|
|
}
|
|
}
|
|
|
|
|
|
void
|
|
innodb_reset_oob(void **engine_data)
|
|
{
|
|
binlog_oob_context *c= static_cast<binlog_oob_context *>(*engine_data);
|
|
if (c)
|
|
reset_oob_context(c);
|
|
}
|
|
|
|
|
|
void
|
|
innodb_free_oob(void *engine_data)
|
|
{
|
|
free_oob_context(static_cast<binlog_oob_context *>(engine_data));
|
|
}
|
|
|
|
|
|
/** Construct an out-of-band reader; no explicit initialization is needed. */
innodb_binlog_oob_reader::innodb_binlog_oob_reader()
{
  /* Nothing. */
}
|
|
|
|
|
|
/** Destroy an out-of-band reader; no explicit cleanup is needed. */
innodb_binlog_oob_reader::~innodb_binlog_oob_reader()
{
  /* Nothing. */
}
|
|
|
|
|
|
void
|
|
innodb_binlog_oob_reader::push_state(enum oob_states state, uint64_t file_no,
|
|
uint64_t offset, bool is_leftmost)
|
|
{
|
|
stack_entry new_entry;
|
|
new_entry.state= state;
|
|
new_entry.file_no= file_no;
|
|
new_entry.offset= offset;
|
|
new_entry.is_leftmost= is_leftmost;
|
|
stack.emplace_back(std::move(new_entry));
|
|
}
|
|
|
|
|
|
void
|
|
innodb_binlog_oob_reader::start_traversal(uint64_t file_no, uint64_t offset)
|
|
{
|
|
stack.clear();
|
|
push_state(ST_initial, file_no, offset, true);
|
|
}
|
|
|
|
|
|
/**
|
|
Read from out-of-band event group data.
|
|
|
|
Does a state-machine incremental traversal of the forest of perfect binary
|
|
trees of oob records in the event group. May read just the data available
|
|
on one page, thus returning less than the requested number of bytes (this
|
|
is to prefer to inspect each page only once, returning data page-by-page as
|
|
long as reader asks for at least a full page worth of data).
|
|
*/
|
|
int
|
|
innodb_binlog_oob_reader::read_data(binlog_chunk_reader *chunk_rd,
|
|
uchar *buf, int len)
|
|
{
|
|
stack_entry *e;
|
|
uint64_t chunk_idx;
|
|
uint64_t left_file_no;
|
|
uint64_t left_offset;
|
|
int res;
|
|
const uchar *p_end;
|
|
const uchar *p;
|
|
std::pair<uint64_t, const unsigned char *> v_and_p;
|
|
int size;
|
|
|
|
if (stack.empty())
|
|
{
|
|
ut_ad(0 /* Should not call when no more oob data to read. */);
|
|
return 0;
|
|
}
|
|
|
|
again:
|
|
e= &(stack[stack.size() - 1]);
|
|
switch (e->state)
|
|
{
|
|
case ST_initial:
|
|
chunk_rd->seek(e->file_no, e->offset);
|
|
static_assert(sizeof(e->rd_buf) == 5*COMPR_INT_MAX64,
|
|
"rd_buf size must match code using it");
|
|
res= chunk_rd->read_data(e->rd_buf, 5*COMPR_INT_MAX64, true);
|
|
if (res < 0)
|
|
return -1;
|
|
if (chunk_rd->cur_type() != FSP_BINLOG_TYPE_OOB_DATA)
|
|
return chunk_rd->read_error_corruption("Wrong chunk type");
|
|
if (res == 0)
|
|
return chunk_rd->read_error_corruption("Unexpected EOF, expected "
|
|
"oob chunk");
|
|
e->rd_buf_len= res;
|
|
p_end= e->rd_buf + res;
|
|
v_and_p= compr_int_read(e->rd_buf);
|
|
p= v_and_p.second;
|
|
if (p > p_end)
|
|
return chunk_rd->read_error_corruption("Short chunk");
|
|
chunk_idx= v_and_p.first;
|
|
(void)chunk_idx;
|
|
|
|
v_and_p= compr_int_read(p);
|
|
p= v_and_p.second;
|
|
if (p > p_end)
|
|
return chunk_rd->read_error_corruption("Short chunk");
|
|
left_file_no= v_and_p.first;
|
|
v_and_p= compr_int_read(p);
|
|
p= v_and_p.second;
|
|
if (p > p_end)
|
|
return chunk_rd->read_error_corruption("Short chunk");
|
|
left_offset= v_and_p.first;
|
|
|
|
v_and_p= compr_int_read(p);
|
|
p= v_and_p.second;
|
|
if (p > p_end)
|
|
return chunk_rd->read_error_corruption("Short chunk");
|
|
e->right_file_no= v_and_p.first;
|
|
v_and_p= compr_int_read(p);
|
|
p= v_and_p.second;
|
|
if (p > p_end)
|
|
return chunk_rd->read_error_corruption("Short chunk");
|
|
e->right_offset= v_and_p.first;
|
|
e->rd_buf_sofar= (uint32_t)(p - e->rd_buf);
|
|
if (left_file_no == 0 && left_offset == 0)
|
|
{
|
|
/* Leaf node. */
|
|
if (e->is_leftmost && !(e->right_file_no == 0 && e->right_offset == 0))
|
|
{
|
|
/* Traverse the prior tree(s) in the forst. */
|
|
e->state= ST_traversing_prior_trees;
|
|
chunk_rd->save_pos(&e->saved_pos);
|
|
push_state(ST_initial, e->right_file_no, e->right_offset, true);
|
|
}
|
|
else
|
|
e->state= ST_self;
|
|
}
|
|
else
|
|
{
|
|
e->state= ST_traversing_left_child;
|
|
chunk_rd->save_pos(&e->saved_pos);
|
|
push_state(ST_initial, left_file_no, left_offset, e->is_leftmost);
|
|
}
|
|
goto again;
|
|
|
|
case ST_traversing_prior_trees:
|
|
chunk_rd->restore_pos(&e->saved_pos);
|
|
e->state= ST_self;
|
|
goto again;
|
|
|
|
case ST_traversing_left_child:
|
|
e->state= ST_traversing_right_child;
|
|
push_state(ST_initial, e->right_file_no, e->right_offset, false);
|
|
goto again;
|
|
|
|
case ST_traversing_right_child:
|
|
chunk_rd->restore_pos(&e->saved_pos);
|
|
e->state= ST_self;
|
|
goto again;
|
|
|
|
case ST_self:
|
|
size= 0;
|
|
if (e->rd_buf_len > e->rd_buf_sofar)
|
|
{
|
|
/* Use any excess data from when the header was read. */
|
|
size= std::min((int)(e->rd_buf_len - e->rd_buf_sofar), len);
|
|
memcpy(buf, e->rd_buf + e->rd_buf_sofar, size);
|
|
e->rd_buf_sofar+= size;
|
|
len-= size;
|
|
buf+= size;
|
|
}
|
|
|
|
if (UNIV_LIKELY(len > 0) && UNIV_LIKELY(!chunk_rd->end_of_record()))
|
|
{
|
|
res= chunk_rd->read_data(buf, len, false);
|
|
if (res < 0)
|
|
return -1;
|
|
size+= res;
|
|
}
|
|
|
|
if (chunk_rd->end_of_record())
|
|
{
|
|
/* This oob record done, pop the state. */
|
|
ut_ad(!stack.empty());
|
|
stack.erase(stack.end() - 1, stack.end());
|
|
}
|
|
return size;
|
|
|
|
default:
|
|
ut_ad(0);
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
|
|
/**
  Construct a binlog reader positioned at (file_no, offset).

  @param wait_durable  When true the chunk reader is bound to
                       binlog_cur_durable_offset, otherwise to
                       binlog_cur_end_offset
  @param file_no       Binlog file number to start reading from
  @param offset        Offset to start reading from; a value below
                       ibb_page_size is raised to ibb_page_size
*/
ha_innodb_binlog_reader::ha_innodb_binlog_reader(bool wait_durable,
                                                 uint64_t file_no,
                                                 uint64_t offset)
  : chunk_rd(wait_durable ?
             binlog_cur_durable_offset : binlog_cur_end_offset),
    requested_file_no(~(uint64_t)0),
    rd_buf_len(0), rd_buf_sofar(0), state(ST_read_next_event_group)
{
  /* NOTE(review): the allocation result is not checked here -- confirm. */
  page_buf= static_cast<uchar *>(ut_malloc(ibb_page_size, mem_key_binlog));
  chunk_rd.set_page_buf(page_buf);

  /* Never start inside the first page of the file. */
  const uint64_t start_offset=
    offset < ibb_page_size ? ibb_page_size : offset;
  chunk_rd.seek(file_no, start_offset);
  chunk_rd.skip_partial(true);
}
|
|
|
|
|
|
/** Destroy the reader, releasing the page buffer set up by the constructor. */
ha_innodb_binlog_reader::~ha_innodb_binlog_reader()
{
  /* page_buf was obtained with ut_malloc(), so it goes back via ut_free(). */
  ut_free(page_buf);
}
|
|
|
|
|
|
/**
|
|
Read data from current position in binlog.
|
|
|
|
If the data is written to disk (visible at the OS level, even if not
|
|
necessarily fsync()'ed to disk), we can read directly from the file.
|
|
Otherwise, the data must still be available in the buffer pool and
|
|
we can read it from there.
|
|
|
|
First try a dirty read of current state; if this says the data is available
|
|
to read from the file, this is safe to do (data cannot become un-written).
|
|
|
|
If not, then check if the page is in the buffer pool; if not, then likewise
|
|
we know it's safe to read from the file directly.
|
|
|
|
Finally, do another check of the current state. This will catch the case
|
|
where we looked for a page in binlog file N, but its tablespace id has been
|
|
recycled, so we got a page from (N+2) instead. In this case also, we can
|
|
then read from the real file.
|
|
*/
|
|
int ha_innodb_binlog_reader::read_binlog_data(uchar *buf, uint32_t len)
|
|
{
|
|
int res= read_data(buf, len);
|
|
chunk_rd.release(res == 0);
|
|
cur_file_no= chunk_rd.current_file_no();
|
|
cur_file_pos= chunk_rd.current_pos();
|
|
return res;
|
|
}
|
|
|
|
|
|
/**
  State machine that delivers the bytes of successive event groups.

  For each commit record found, the header is decoded to learn whether the
  event group has out-of-band (oob) data. The remainder of the commit record
  is handed out first (ST_read_commit_record), then any oob data
  (ST_read_oob_data), before moving on to the next commit record
  (ST_read_next_event_group).

  @param buf  Destination buffer
  @param len  Maximum number of bytes to store in buf

  @return  Number of bytes stored in buf (0 when no further data could be
           read), or a negative value on error.
*/
int ha_innodb_binlog_reader::read_data(uchar *buf, uint32_t len)
{
  int res;
  int sofar= 0;
  const uchar *p= nullptr;
  const uchar *p_end= nullptr;
  uint64_t val= 0;

  /*
    Decode one compressed integer at p into val and step p past it.
    Returns false when that moves p beyond p_end, ie. the header is cut short.
  */
  auto next_int= [&p, &p_end, &val]() -> bool
  {
    std::pair<uint64_t, const unsigned char *> decoded= compr_int_read(p);
    p= decoded.second;
    if (p > p_end)
      return false;
    val= decoded.first;
    return true;
  };

again:
  switch (state)
  {
  case ST_read_next_event_group:
    /*
      NOTE(review): the header decoded below can hold up to ten compressed
      integers while rd_buf is sized for five maximum-length ones -- confirm
      that the encoded values are always short enough to fit.
    */
    static_assert(sizeof(rd_buf) == 5*COMPR_INT_MAX64,
                  "rd_buf size must match code using it");
    res= chunk_rd.read_data(rd_buf, 5*COMPR_INT_MAX64, true);
    if (res < 0)
      return res;
    if (res == 0)
      return sofar;
    if (chunk_rd.cur_type() != FSP_BINLOG_TYPE_COMMIT)
    {
      /* Not a commit record, step over it and look at the next chunk. */
      chunk_rd.skip_current();
      goto again;
    }
    /* Found the start of a commit record. */
    chunk_rd.skip_partial(false);

    /* Read the header of the commit record to see if there's any oob data. */
    rd_buf_len= res;
    p= rd_buf;
    p_end= rd_buf + res;
    if (!next_int())
      return chunk_rd.read_error_corruption("Short chunk");
    oob_count= val;
    oob_count2= 0;

    if (oob_count > 0)
    {
      /* Skip the pointer to first chunk (two integers). */
      if (!next_int() || !next_int())
        return chunk_rd.read_error_corruption("Short chunk");

      /* Pointer to the last chunk: (file_no, offset). */
      if (!next_int())
        return chunk_rd.read_error_corruption("Short chunk");
      oob_last_file_no= val;
      if (!next_int())
        return chunk_rd.read_error_corruption("Short chunk");
      oob_last_offset= val;

      /* Check for any secondary oob data. */
      if (!next_int())
        return chunk_rd.read_error_corruption("Short chunk");
      oob_count2= val;

      if (oob_count2 > 0)
      {
        /* Skip the pointer to first chunk (two integers). */
        if (!next_int() || !next_int())
          return chunk_rd.read_error_corruption("Short chunk");

        /* Pointer to the last secondary chunk: (file_no, offset). */
        if (!next_int())
          return chunk_rd.read_error_corruption("Short chunk");
        oob_last_file_no2= val;
        if (!next_int())
          return chunk_rd.read_error_corruption("Short chunk");
        oob_last_offset2= val;
      }
    }

    /* Whatever follows the header in rd_buf is commit record payload. */
    rd_buf_sofar= (uint32_t)(p - rd_buf);
    state= ST_read_commit_record;
    goto again;

  case ST_read_commit_record:
    if (rd_buf_len > rd_buf_sofar)
    {
      /* Use any excess data from when the header was read. */
      int chunk= std::min((int)(rd_buf_len - rd_buf_sofar), (int)len);
      memcpy(buf, rd_buf + rd_buf_sofar, chunk);
      rd_buf_sofar+= chunk;
      sofar+= chunk;
      buf+= chunk;
      len-= chunk;
    }

    if (UNIV_LIKELY(len > 0) && UNIV_LIKELY(!chunk_rd.end_of_record()))
    {
      res= chunk_rd.read_data(buf, len, false);
      if (res < 0)
        return -1;
      sofar+= res;
      buf+= res;
      len-= res;
    }

    if (UNIV_LIKELY(rd_buf_sofar == rd_buf_len) && chunk_rd.end_of_record())
    {
      /* The commit record has been delivered in full. */
      if (oob_count != 0)
      {
        /* Continue with the oob data, remembering where to resume later. */
        oob_reader.start_traversal(oob_last_file_no, oob_last_offset);
        chunk_rd.save_pos(&saved_commit_pos);
        state= ST_read_oob_data;
      }
      else
      {
        state= ST_read_next_event_group;
        if (len > 0 && !chunk_rd.is_end_of_page())
        {
          /*
            Let us try to read more data from this page. The goal is to read
            from each page only once, as long as caller passes in a buffer at
            least as big as our page size. Though commit record header that
            spans a page boundary or oob records can break this property.
          */
          goto again;
        }
      }
      /* Do not return empty-handed while there may be more to read. */
      if (sofar == 0)
        goto again;
    }

    return sofar;

  case ST_read_oob_data:
    res= oob_reader.read_data(&chunk_rd, buf, len);
    if (res < 0)
      return -1;
    if (oob_reader.oob_traversal_done())
    {
      if (UNIV_UNLIKELY(oob_count2 > 0))
      {
        /* Switch over to secondary oob data. */
        oob_count= oob_count2;
        oob_count2= 0;
        oob_last_file_no= oob_last_file_no2;
        oob_last_offset= oob_last_offset2;
        oob_reader.start_traversal(oob_last_file_no, oob_last_offset);
        state= ST_read_oob_data;
      }
      else
      {
        /* All oob data delivered, resume after the commit record. */
        chunk_rd.restore_pos(&saved_commit_pos);
        state= ST_read_next_event_group;
      }
    }
    if (UNIV_UNLIKELY(res == 0))
    {
      ut_ad(0 /* Should have had oob_traversal_done() last time then. */);
      if (sofar == 0)
        goto again;
    }
    return sofar + res;

  default:
    ut_ad(0);
    return -1;
  }
}
|
|
|
|
|
|
bool
|
|
ha_innodb_binlog_reader::data_available()
|
|
{
|
|
if (state != ST_read_next_event_group)
|
|
return true;
|
|
return chunk_rd.data_available();
|
|
}
|
|
|
|
|
|
bool
|
|
ha_innodb_binlog_reader::wait_available(THD *thd,
|
|
const struct timespec *abstime)
|
|
{
|
|
bool is_timeout= false;
|
|
lsn_t pending_sync_lsn= 0;
|
|
bool did_enter_cond= false;
|
|
PSI_stage_info old_stage;
|
|
|
|
if (data_available())
|
|
return false;
|
|
|
|
mysql_mutex_lock(&binlog_durable_mutex);
|
|
for (;;)
|
|
{
|
|
/* Process anything that has become durable since we last looked. */
|
|
lsn_t durable_lsn= log_sys.get_flushed_lsn(std::memory_order_relaxed);
|
|
ibb_pending_lsn_fifo.process_durable_lsn(durable_lsn);
|
|
|
|
/* Check if there is anything more pending to be made durable. */
|
|
if (!ibb_pending_lsn_fifo.is_empty())
|
|
{
|
|
pending_lsn_fifo::entry &e= ibb_pending_lsn_fifo.cur_head();
|
|
if (durable_lsn < e.lsn)
|
|
pending_sync_lsn= e.lsn;
|
|
}
|
|
|
|
/*
|
|
Check if there is data available for us now.
|
|
As we are holding binlog_durable_mutex, active_binlog_file_no cannot
|
|
move during this check.
|
|
*/
|
|
uint64_t cur= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
uint64_t durable_offset=
|
|
binlog_cur_durable_offset[cur & 3].load(std::memory_order_relaxed);
|
|
if (durable_offset == 0 && chunk_rd.s.file_no + 1 == cur)
|
|
{
|
|
/*
|
|
If active has durable position=0, it means the current durable
|
|
position is somewhere in active-1.
|
|
*/
|
|
cur= chunk_rd.s.file_no;
|
|
durable_offset=
|
|
binlog_cur_durable_offset[cur & 3].load(std::memory_order_relaxed);
|
|
}
|
|
if (chunk_rd.is_before_pos(cur, durable_offset))
|
|
break;
|
|
|
|
if (pending_sync_lsn != 0 && ibb_pending_lsn_fifo.flushing_lsn == 0)
|
|
{
|
|
/*
|
|
There is no data available for us now, but there is data that will be
|
|
available when the InnoDB redo log has been durably flushed to disk.
|
|
So now we will do such a sync (unless another thread is already doing
|
|
it), so we can proceed getting more data out.
|
|
*/
|
|
ibb_pending_lsn_fifo.flushing_lsn= pending_sync_lsn;
|
|
mysql_mutex_unlock(&binlog_durable_mutex);
|
|
log_write_up_to(pending_sync_lsn, true);
|
|
mysql_mutex_lock(&binlog_durable_mutex);
|
|
ibb_pending_lsn_fifo.flushing_lsn= pending_sync_lsn= 0;
|
|
/* Need to loop back to repeat all checks, after releasing the mutex. */
|
|
continue;
|
|
}
|
|
|
|
if (thd && thd_kill_level(thd))
|
|
break;
|
|
|
|
if (thd && !did_enter_cond)
|
|
{
|
|
THD_ENTER_COND(thd, &binlog_durable_cond, &binlog_durable_mutex,
|
|
&stage_master_has_sent_all_binlog_to_slave, &old_stage);
|
|
did_enter_cond= true;
|
|
}
|
|
if (abstime)
|
|
{
|
|
int res= mysql_cond_timedwait(&binlog_durable_cond,
|
|
&binlog_durable_mutex,
|
|
abstime);
|
|
if (res == ETIMEDOUT)
|
|
{
|
|
is_timeout= true;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
mysql_cond_wait(&binlog_durable_cond, &binlog_durable_mutex);
|
|
}
|
|
/*
|
|
If there is pending binlog data to durably sync to the redo log, but we
|
|
did not do this sync ourselves, then signal another thread (if any) to
|
|
wakeup and sync. This is necessary to not lose the sync wakeup signal.
|
|
|
|
(We use wake-one rather than wake-all for signalling a pending redo log
|
|
sync to avoid wakeup-storm).
|
|
*/
|
|
if (pending_sync_lsn != 0)
|
|
mysql_cond_signal(&binlog_durable_cond);
|
|
|
|
if (did_enter_cond)
|
|
THD_EXIT_COND(thd, &old_stage);
|
|
else
|
|
mysql_mutex_unlock(&binlog_durable_mutex);
|
|
|
|
return is_timeout;
|
|
}
|
|
|
|
|
|
/**
  Create a new InnoDB binlog reader.

  @param wait_durable  Passed on to the ha_innodb_binlog_reader constructor;
                       its other arguments take their default values

  @return  The newly allocated reader object.
*/
handler_binlog_reader *
innodb_get_binlog_reader(bool wait_durable)
{
  ha_innodb_binlog_reader *reader= new ha_innodb_binlog_reader(wait_durable);
  return reader;
}
|
|
|
|
|
|
/**
  Construct a GTID search object with no file open: the open file number is
  the all-ones sentinel, the length is 0 and the file handle is -1.
*/
gtid_search::gtid_search()
  : cur_open_file_no(~(uint64_t)0), cur_open_file_length(0),
    cur_open_file((File)-1)
{
  /* Everything is set up by the initializer list. */
}
|
|
|
|
|
|
/** Destroy the search object, closing the currently open file if any. */
gtid_search::~gtid_search()
{
  /* A negative handle means that no file is open. */
  if (cur_open_file < (File)0)
    return;
  my_close(cur_open_file, MYF(0));
}
|
|
|
|
|
|
/**
|
|
Search for a GTID position in the binlog.
|
|
Find a binlog file_no and an offset into the file that is guaranteed to
|
|
be before the target position. It can be a bit earlier, that only means a
|
|
bit more of the binlog needs to be scanned to find the real position.
|
|
|
|
Returns:
|
|
-1 error
|
|
0 Position not found (has been purged)
|
|
1 Position found
|
|
*/
|
|
int
|
|
gtid_search::find_gtid_pos(slave_connection_state *pos,
|
|
rpl_binlog_state_base *out_state,
|
|
uint64_t *out_file_no, uint64_t *out_offset)
|
|
{
|
|
uint64_t dummy_xa_ref;
|
|
/*
|
|
Dirty read, but getting a slightly stale value is no problem, we will just
|
|
be starting to scan the binlog file at a slightly earlier position than
|
|
necessary.
|
|
*/
|
|
uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
|
|
std::unique_ptr<byte, void (*)(byte *)>
|
|
page_buf(static_cast<byte*>(ut_malloc(ibb_page_size, mem_key_binlog)),
|
|
[](byte *p) {ut_free(p);});
|
|
if (page_buf == nullptr)
|
|
{
|
|
my_error(ER_OUTOFMEMORY, MYF(0), ibb_page_size);
|
|
return -1;
|
|
}
|
|
binlog_chunk_reader chunk_reader(binlog_cur_durable_offset);
|
|
chunk_reader.set_page_buf(page_buf.get());
|
|
|
|
/* First search backwards for the right file to start from. */
|
|
uint64_t diff_state_page_interval= 0;
|
|
rpl_binlog_state_base base_state, page0_diff_state, tmp_diff_state;
|
|
base_state.init();
|
|
for (;;)
|
|
{
|
|
/* Read the header page, needed to get the binlog diff state interval. */
|
|
binlog_header_data header;
|
|
chunk_reader.seek(file_no, 0);
|
|
int res= chunk_reader.get_file_header(&header);
|
|
if (UNIV_UNLIKELY(res < 0))
|
|
return -1;
|
|
if (UNIV_UNLIKELY(res == 0))
|
|
goto not_found_in_file;
|
|
diff_state_page_interval= header.diff_state_interval;
|
|
|
|
chunk_reader.seek(file_no, ibb_page_size);
|
|
res= read_gtid_state(&chunk_reader, &base_state, &dummy_xa_ref);
|
|
if (UNIV_UNLIKELY(res < 0))
|
|
return -1;
|
|
if (res == 0)
|
|
{
|
|
not_found_in_file:
|
|
if (file_no == 0)
|
|
{
|
|
/* Handle the special case of a completely empty binlog file. */
|
|
out_state->reset_nolock();
|
|
*out_file_no= file_no;
|
|
*out_offset= ibb_page_size;
|
|
return 1;
|
|
}
|
|
/* If GTID state is not (durably) available, try the previous file. */
|
|
}
|
|
else if (base_state.is_before_pos(pos))
|
|
break;
|
|
base_state.reset_nolock();
|
|
if (file_no <= earliest_binlog_file_no)
|
|
return 0;
|
|
--file_no;
|
|
}
|
|
|
|
/*
|
|
Then binary search for the last differential state record that is still
|
|
before the searched position.
|
|
|
|
The invariant is that page2 is known to be after the target page, and page0
|
|
is known to be a valid position to start (but possibly earlier than needed).
|
|
*/
|
|
uint32_t page0= 0;
|
|
uint32_t page2= (uint32_t) (diff_state_page_interval +
|
|
((chunk_reader.cur_end_offset - 1) >> ibb_page_size_shift));
|
|
/* Round to the next diff_state_page_interval after file end. */
|
|
page2-= page2 % (uint32_t)diff_state_page_interval;
|
|
uint32_t page1= page0 +
|
|
((page2 - page0) /
|
|
(2*(uint32_t)diff_state_page_interval) *
|
|
(uint32_t)diff_state_page_interval);
|
|
page0_diff_state.init();
|
|
page0_diff_state.load_nolock(&base_state);
|
|
tmp_diff_state.init();
|
|
while (page1 >= page0 + diff_state_page_interval && page1 > 1)
|
|
{
|
|
ut_ad((page1 - page0) % diff_state_page_interval == 0);
|
|
tmp_diff_state.reset_nolock();
|
|
tmp_diff_state.load_nolock(&base_state);
|
|
chunk_reader.seek(file_no, page1 << ibb_page_size_shift);
|
|
int res= read_gtid_state(&chunk_reader, &tmp_diff_state, &dummy_xa_ref);
|
|
if (UNIV_UNLIKELY(res < 0))
|
|
return -1;
|
|
if (res == 0)
|
|
{
|
|
/*
|
|
If the diff state record was not written here for some reason, just
|
|
try the one just before. It will be safe, even if not always optimal,
|
|
and this is an abnormal situation anyway.
|
|
*/
|
|
page1= page1 - (uint32_t)diff_state_page_interval;
|
|
continue;
|
|
}
|
|
if (tmp_diff_state.is_before_pos(pos))
|
|
{
|
|
page0= page1;
|
|
page0_diff_state.reset_nolock();
|
|
page0_diff_state.load_nolock(&tmp_diff_state);
|
|
}
|
|
else
|
|
page2= page1;
|
|
page1= page0 +
|
|
((page2 - page0) /
|
|
(2*(uint32_t)diff_state_page_interval) *
|
|
(uint32_t)diff_state_page_interval);
|
|
}
|
|
ut_ad(page1 >= page0);
|
|
out_state->load_nolock(&page0_diff_state);
|
|
*out_file_no= file_no;
|
|
if (page0 == 0)
|
|
page0= 1; /* Skip the initial file header page. */
|
|
*out_offset= (uint64_t)page0 << ibb_page_size_shift;
|
|
return 1;
|
|
}
|
|
|
|
|
|
int
|
|
ha_innodb_binlog_reader::init_gtid_pos(THD *thd, slave_connection_state *pos,
|
|
rpl_binlog_state_base *state)
|
|
{
|
|
gtid_search search_obj;
|
|
uint64_t file_no;
|
|
uint64_t offset;
|
|
|
|
/*
|
|
Wait for at least the initial GTID state record to become durable before
|
|
looking for the starting GTID position.
|
|
This is unlikely to need to wait, as it would imply that _no_ part of the
|
|
binlog is durable at this point. But it might theoretically occur perhaps
|
|
after a PURGE of all binlog files but the active; and failing to do the
|
|
wait if needed might wrongly return an error that the GTID position is
|
|
too old.
|
|
*/
|
|
chunk_rd.seek(earliest_binlog_file_no, ibb_page_size);
|
|
if (UNIV_UNLIKELY(wait_available(thd, nullptr)))
|
|
return -1;
|
|
|
|
int res= search_obj.find_gtid_pos(pos, state, &file_no, &offset);
|
|
if (res < 0)
|
|
return -1;
|
|
if (res > 0)
|
|
{
|
|
requested_file_no= file_no;
|
|
chunk_rd.seek(file_no, offset);
|
|
chunk_rd.skip_partial(true);
|
|
cur_file_no= chunk_rd.current_file_no();
|
|
cur_file_pos= chunk_rd.current_pos();
|
|
}
|
|
return res;
|
|
}
|
|
|
|
|
|
int
|
|
ha_innodb_binlog_reader::init_legacy_pos(THD *thd, const char *filename,
|
|
ulonglong offset)
|
|
{
|
|
uint64_t file_no;
|
|
if (!filename)
|
|
{
|
|
mysql_mutex_lock(&purge_binlog_mutex);
|
|
file_no= earliest_binlog_file_no;
|
|
mysql_mutex_unlock(&purge_binlog_mutex);
|
|
}
|
|
else if (!is_binlog_name(filename, &file_no))
|
|
{
|
|
my_error(ER_UNKNOWN_TARGET_BINLOG, MYF(0));
|
|
return -1;
|
|
}
|
|
if (file_no > active_binlog_file_no.load(std::memory_order_acquire))
|
|
{
|
|
my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SHOW BINLOG EVENTS",
|
|
"Could not find target log");
|
|
return -1;
|
|
}
|
|
requested_file_no= file_no;
|
|
if ((uint64_t)offset >= (uint64_t)(UINT32_MAX) << ibb_page_size_shift)
|
|
{
|
|
my_error(ER_BINLOG_POS_INVALID, MYF(0), offset);
|
|
return -1;
|
|
}
|
|
|
|
if (offset < ibb_page_size)
|
|
offset= ibb_page_size;
|
|
|
|
/*
|
|
Start at the beginning of the page containing the requested position. Then
|
|
read forwards until the requested position is reached. This way we avoid
|
|
reading garbaga data for invalid request offset.
|
|
*/
|
|
|
|
chunk_rd.seek(file_no,
|
|
(uint64_t)offset & ((uint64_t)~0 << ibb_page_size_shift));
|
|
int err=
|
|
chunk_rd.find_offset_in_page((uint32_t)(offset & (ibb_page_size - 1)));
|
|
chunk_rd.release(true);
|
|
chunk_rd.skip_partial(true);
|
|
|
|
cur_file_no= chunk_rd.current_file_no();
|
|
cur_file_pos= chunk_rd.current_pos();
|
|
return err;
|
|
}
|
|
|
|
|
|
void
|
|
ha_innodb_binlog_reader::enable_single_file()
|
|
{
|
|
chunk_rd.stop_file_no= requested_file_no != ~(uint64_t)0 ?
|
|
requested_file_no : chunk_rd.s.file_no;
|
|
}
|
|
|
|
|
|
void
|
|
ha_innodb_binlog_reader::seek_internal(uint64_t file_no, uint64_t offset)
|
|
{
|
|
chunk_rd.seek(file_no, offset);
|
|
chunk_rd.skip_partial(true);
|
|
cur_file_no= chunk_rd.current_file_no();
|
|
cur_file_pos= chunk_rd.current_pos();
|
|
}
|
|
|
|
|
|
void
|
|
ibb_wait_durable_offset(uint64_t file_no, uint64_t wait_offset)
|
|
{
|
|
uint64_t dur_offset=
|
|
binlog_cur_durable_offset[file_no & 3].load(std:: memory_order_relaxed);
|
|
ha_innodb_binlog_reader reader(true, file_no, dur_offset);
|
|
for (;;)
|
|
{
|
|
reader.wait_available(nullptr, nullptr);
|
|
dur_offset=
|
|
binlog_cur_durable_offset[file_no & 3].load(std:: memory_order_relaxed);
|
|
if (dur_offset >= wait_offset)
|
|
break;
|
|
reader.seek_internal(file_no, dur_offset);
|
|
}
|
|
}
|
|
|
|
|
|
/**
  Construct an empty fifo: LSN bookkeeping and the head/tail indexes start
  at 0, and cur_file_no is the all-ones sentinel until init() is called.
*/
pending_lsn_fifo::pending_lsn_fifo()
  : flushing_lsn(0), last_lsn_added(0), cur_file_no(~(uint64_t)0),
    head(0), tail(0)
{
  /* Everything is set up by the initializer list. */
}
|
|
|
|
|
|
void
|
|
pending_lsn_fifo::init(uint64_t start_file_no)
|
|
{
|
|
mysql_mutex_lock(&binlog_durable_mutex);
|
|
ut_ad(cur_file_no == ~(uint64_t)0);
|
|
cur_file_no= start_file_no;
|
|
mysql_mutex_unlock(&binlog_durable_mutex);
|
|
}
|
|
|
|
|
|
void
|
|
pending_lsn_fifo::reset()
|
|
{
|
|
mysql_mutex_lock(&binlog_durable_mutex);
|
|
cur_file_no= ~(uint64_t)0;
|
|
mysql_mutex_unlock(&binlog_durable_mutex);
|
|
}
|
|
|
|
|
|
bool
|
|
pending_lsn_fifo::process_durable_lsn(lsn_t lsn)
|
|
{
|
|
mysql_mutex_assert_owner(&binlog_durable_mutex);
|
|
ut_ad(cur_file_no != ~(uint64_t)0);
|
|
|
|
entry *got= nullptr;
|
|
for (;;)
|
|
{
|
|
if (is_empty())
|
|
break;
|
|
entry &e= cur_tail();
|
|
if (lsn < e.lsn)
|
|
break;
|
|
got= &e;
|
|
drop_tail();
|
|
}
|
|
if (got)
|
|
{
|
|
uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
DBUG_EXECUTE_IF("block_binlog_durable", active= got->file_no + 2;);
|
|
if (got->file_no + 1 >= active)
|
|
{
|
|
/*
|
|
We must never set the durable offset back to a prior value.
|
|
This should be assured by never adding a smaller lsn into the fifo than
|
|
any prior lsn added, and checked by this assertion.
|
|
*/
|
|
ut_ad(binlog_cur_durable_offset[got->file_no & 3].
|
|
load(std::memory_order_relaxed) <= got->offset);
|
|
binlog_cur_durable_offset[got->file_no & 3].store
|
|
(got->offset, std::memory_order_relaxed);
|
|
}
|
|
/*
|
|
If we moved the durable point to the next file_no, mark the prior
|
|
file_no as now fully durable.
|
|
Since we only ever have at most two binlog tablespaces open, and since
|
|
we make file_no=N fully durable (by calling into this function) before
|
|
pre-allocating N+2, we can only ever move ahead one file_no at a time
|
|
here.
|
|
*/
|
|
if (cur_file_no != got->file_no)
|
|
{
|
|
ut_ad(got->file_no == cur_file_no + 1);
|
|
binlog_cur_durable_offset[cur_file_no & 3].store(
|
|
binlog_cur_end_offset[cur_file_no & 3].load(std::memory_order_relaxed),
|
|
std::memory_order_relaxed);
|
|
cur_file_no= got->file_no;
|
|
}
|
|
mysql_cond_broadcast(&binlog_durable_cond);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
|
|
After a binlog commit, put the LSN and the corresponding binlog position
|
|
into the ibb_pending_lsn_fifo. We do this here (rather than immediately in
|
|
innodb_binlog_post_commit()), so that we can delay it until we are no longer
|
|
holding more critical locks that could block other writers. As we will be
|
|
contending with readers here on binlog_durable_mutex.
|
|
*/
|
|
void
|
|
pending_lsn_fifo::record_commit(binlog_oob_context *c)
|
|
{
|
|
uint64_t pending_file_no= c->pending_file_no;
|
|
if (pending_file_no == ~(uint64_t)0)
|
|
return;
|
|
c->pending_file_no= ~(uint64_t)0;
|
|
lsn_t pending_lsn= c->pending_lsn;
|
|
uint64_t pending_offset= c->pending_offset;
|
|
add_to_fifo(pending_lsn, pending_file_no, pending_offset);
|
|
}
|
|
|
|
|
|
void
|
|
pending_lsn_fifo::add_to_fifo(uint64_t lsn, uint64_t file_no, uint64_t offset)
|
|
{
|
|
mysql_mutex_lock(&binlog_durable_mutex);
|
|
/*
|
|
The record_commit() operation is done outside of critical locks for
|
|
scalabitily, so can occur out-of-order. So only insert the new entry if
|
|
it is newer than any previously inserted.
|
|
*/
|
|
ut_ad(is_empty() || cur_head().lsn == last_lsn_added);
|
|
if (lsn > last_lsn_added)
|
|
{
|
|
if (is_full())
|
|
{
|
|
/*
|
|
When the fifo is full, we just overwrite the head with a newer LSN.
|
|
This way, whenever _some_ LSN gets synced durably to disk, we will
|
|
always be able to make some progress and clear some fifo entries. And
|
|
when this latest LSN gets eventually synced, any overwritten entry
|
|
will progress as well.
|
|
*/
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
Insert a new head.
|
|
Note that we make the fifo size a power-of-two (2 <<fixed_size_log2).
|
|
So if we wrap around uint32_t here, the outcome is still valid.
|
|
*/
|
|
new_head();
|
|
}
|
|
entry &h= cur_head();
|
|
h.file_no= file_no;
|
|
h.offset= offset;
|
|
h.lsn= lsn;
|
|
last_lsn_added= lsn;
|
|
/* Make an immediate check in case the LSN is already durable. */
|
|
bool signalled=
|
|
process_durable_lsn(log_sys.get_flushed_lsn(std::memory_order_relaxed));
|
|
if (!signalled && flushing_lsn == 0)
|
|
{
|
|
/*
|
|
If process_durable_lsn() did not find any new data become durable, it
|
|
does not broadcast a wakeup signal. But since we inserted a new entry
|
|
in the fifo, we still want to signal _one_ other thread to potentially
|
|
wake up and start a redo log sync to make the new entry durable, unless
|
|
a thread is already doing such redo log sync.
|
|
*/
|
|
mysql_cond_signal(&binlog_durable_cond);
|
|
}
|
|
}
|
|
mysql_mutex_unlock(&binlog_durable_mutex);
|
|
}
|
|
|
|
|
|
/**
  Key extraction callback for the XID hash.

  @param p        Hash record, interpreted as an ibb_xid_hash::xid_elem
  @param out_len  Receives the length of the key

  @return  Pointer to the key bytes of the element's XID.
*/
static const uchar *get_xid_hash_key(const void *p, size_t *out_len, my_bool)
{
  const ibb_xid_hash::xid_elem *elem=
    reinterpret_cast<const ibb_xid_hash::xid_elem *>(p);
  const XID &xid= elem->xid;
  *out_len= xid.key_length();
  return xid.key();
}
|
|
|
|
|
|
/**
  Set up the XID hash: the mutex protecting it, and the hash itself, keyed
  through get_xid_hash_key() with unique keys and no element free callback.
*/
ibb_xid_hash::ibb_xid_hash()
{
  mysql_mutex_init(ibb_xid_hash_mutex_key, &xid_mutex, nullptr);

  /* Elements are freed explicitly by the destructor, hence no callback. */
  my_hash_init(mem_key_binlog, &xid_hash, &my_charset_bin, 32, 0,
               sizeof(XID), get_xid_hash_key, nullptr, MYF(HASH_UNIQUE));
}
|
|
|
|
|
|
/**
  Tear down the XID hash: free every element still present, then the hash
  itself and finally the mutex.
*/
ibb_xid_hash::~ibb_xid_hash()
{
  /* The hash was created without a free callback, so free elements here. */
  for (uint32 idx= 0; idx < xid_hash.records; ++idx)
  {
    my_free(my_hash_element(&xid_hash, idx));
  }
  my_hash_free(&xid_hash);
  mysql_mutex_destroy(&xid_mutex);
}
|
|
|
|
|
|
bool
|
|
ibb_xid_hash::add_xid(const XID *xid, const binlog_oob_context *c)
|
|
{
|
|
xid_elem *e=
|
|
(xid_elem *)my_malloc(mem_key_binlog, sizeof(xid_elem), MYF(MY_WME));
|
|
if (!e)
|
|
{
|
|
my_error(ER_OUTOFMEMORY, MYF(0), (int)sizeof(xid_elem));
|
|
return true;
|
|
}
|
|
e->xid.set(xid);
|
|
uint64_t refcnt_file_no;
|
|
if (UNIV_LIKELY(c->node_list_len > 0))
|
|
{
|
|
uint32_t last= c->node_list_len-1;
|
|
e->oob_num_nodes= c->node_list[last].node_index + 1;
|
|
e->oob_first_file_no= c->first_node_file_no;
|
|
e->oob_first_offset= c->first_node_offset;
|
|
e->oob_last_file_no= c->node_list[last].file_no;
|
|
e->oob_last_offset= c->node_list[last].offset;
|
|
refcnt_file_no= e->oob_first_file_no;
|
|
}
|
|
else
|
|
{
|
|
e->oob_num_nodes= 0;
|
|
e->oob_first_file_no= 0;
|
|
e->oob_first_offset= 0;
|
|
e->oob_last_file_no= 0;
|
|
e->oob_last_offset= 0;
|
|
/*
|
|
Empty XA transaction, but we still need to ensure the prepare record
|
|
is kept until the (empty) transactions gets XA COMMMIT'ted.
|
|
*/
|
|
refcnt_file_no= active_binlog_file_no.load(std::memory_order_acquire);
|
|
}
|
|
e->refcnt_file_no= refcnt_file_no;
|
|
mysql_mutex_lock(&xid_mutex);
|
|
if (my_hash_insert(&xid_hash, (uchar *)e))
|
|
{
|
|
mysql_mutex_unlock(&xid_mutex);
|
|
my_free(e);
|
|
return true;
|
|
}
|
|
mysql_mutex_unlock(&xid_mutex);
|
|
ibb_file_hash.oob_ref_inc(refcnt_file_no, c->lf_pins, true);
|
|
return false;
|
|
}
|
|
|
|
|
|
template <typename F> bool
|
|
ibb_xid_hash::run_on_xid(const XID *xid, F callback)
|
|
{
|
|
size_t key_len= 0;
|
|
const uchar *key_ptr= get_xid_hash_key(xid, &key_len, 1);
|
|
bool err;
|
|
|
|
mysql_mutex_lock(&xid_mutex);
|
|
uchar *rec= my_hash_search(&xid_hash, key_ptr, key_len);
|
|
if (UNIV_LIKELY(rec != nullptr))
|
|
{
|
|
err= callback(reinterpret_cast<xid_elem *>(rec));
|
|
}
|
|
else
|
|
err= true;
|
|
mysql_mutex_unlock(&xid_mutex);
|
|
return err;
|
|
}
|
|
|
|
|
|
/*
|
|
Look up an XID in the internal XID hash.
|
|
Remove the entry found (if any) and return it.
|
|
*/
|
|
ibb_xid_hash::xid_elem *
|
|
ibb_xid_hash::grab_xid(const XID *xid)
|
|
{
|
|
xid_elem *e= nullptr;
|
|
size_t key_len= 0;
|
|
const uchar *key_ptr= get_xid_hash_key(xid, &key_len, 1);
|
|
mysql_mutex_lock(&xid_mutex);
|
|
uchar *rec= my_hash_search(&xid_hash, key_ptr, key_len);
|
|
if (UNIV_LIKELY(rec != nullptr))
|
|
{
|
|
e= reinterpret_cast<xid_elem *>(rec);
|
|
my_hash_delete(&xid_hash, rec);
|
|
}
|
|
mysql_mutex_unlock(&xid_mutex);
|
|
return e;
|
|
}
|
|
|
|
|
|
void
|
|
ibb_get_filename(char name[FN_REFLEN], uint64_t file_no)
|
|
{
|
|
static_assert(BINLOG_NAME_MAX_LEN <= FN_REFLEN,
|
|
"FN_REFLEN too shot to hold InnoDB binlog name");
|
|
binlog_name_make_short(name, file_no);
|
|
}
|
|
|
|
|
|
extern "C" void binlog_get_cache(THD *, uint64_t, uint64_t, IO_CACHE **,
|
|
handler_binlog_event_group_info **,
|
|
const rpl_gtid **);
|
|
|
|
/**
  Write the binlog data of a committing transaction as part of its
  mini-transaction.

  @param trx  The transaction; nothing is done when it has no mysql_thd
  @param mtr  Mini-transaction the binlog data is written in

  @return  The oob context attached to the event group info when binlog data
           was written, nullptr otherwise.
*/
binlog_oob_context *
innodb_binlog_trx(trx_t *trx, mtr_t *mtr)
{
  IO_CACHE *cache= nullptr;
  handler_binlog_event_group_info *binlog_info= nullptr;
  const rpl_gtid *gtid= nullptr;
  uint64_t file_no, pos;

  if (!trx->mysql_thd)
    return nullptr;
  innodb_binlog_status(&file_no, &pos);
  binlog_get_cache(trx->mysql_thd, file_no, pos, &cache, &binlog_info, &gtid);

  /* Only write when there is event group info with a GTID in the cache. */
  if (UNIV_LIKELY(binlog_info != nullptr) &&
      UNIV_LIKELY(binlog_info->gtid_offset > 0))
  {
    binlog_diff_state.update_nolock(gtid);
    innodb_binlog_write_cache(cache, binlog_info, mtr);
    return static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
  }
  return nullptr;
}
|
|
|
|
|
|
void
|
|
innodb_binlog_post_commit(mtr_t *mtr, binlog_oob_context *c)
|
|
{
|
|
if (c)
|
|
{
|
|
c->pending_lsn= mtr->commit_lsn();
|
|
ut_ad(c->pending_lsn != 0);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
Function to record the write of a record to the binlog, when done outside
|
|
of a normal binlog commit, eg. XA PREPARE or XA ROLLBACK.
|
|
*/
|
|
static void
|
|
innodb_binlog_post_write_rec(mtr_t *mtr, binlog_oob_context *c)
|
|
{
|
|
uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
c->pending_file_no= file_no;
|
|
c->pending_offset=
|
|
binlog_cur_end_offset[file_no & 3].load(std::memory_order_relaxed);
|
|
innodb_binlog_post_commit(mtr, c);
|
|
}
|
|
|
|
|
|
bool
|
|
innobase_binlog_write_direct_ordered(IO_CACHE *cache,
|
|
handler_binlog_event_group_info *binlog_info,
|
|
const rpl_gtid *gtid)
|
|
{
|
|
mtr_t mtr{nullptr};
|
|
ut_ad(binlog_info->engine_ptr2 == nullptr);
|
|
if (gtid)
|
|
binlog_diff_state.update_nolock(gtid);
|
|
innodb_binlog_status(&binlog_info->out_file_no, &binlog_info->out_offset);
|
|
mtr.start();
|
|
innodb_binlog_write_cache(cache, binlog_info, &mtr);
|
|
mtr.commit();
|
|
innodb_binlog_post_commit(&mtr, static_cast<binlog_oob_context *>
|
|
(binlog_info->engine_ptr));
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
innobase_binlog_write_direct(IO_CACHE *cache,
|
|
handler_binlog_event_group_info *binlog_info,
|
|
const rpl_gtid *gtid)
|
|
{
|
|
ut_ad(binlog_info->engine_ptr2 == nullptr);
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
|
|
if (UNIV_LIKELY(c != nullptr))
|
|
{
|
|
if (srv_flush_log_at_trx_commit & 1)
|
|
log_write_up_to(c->pending_lsn, true);
|
|
ibb_pending_lsn_fifo.record_commit(c);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
void
|
|
ibb_group_commit(THD *thd, handler_binlog_event_group_info *binlog_info)
|
|
{
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
|
|
if (UNIV_LIKELY(c != nullptr))
|
|
{
|
|
if (srv_flush_log_at_trx_commit & 1 && c->pending_lsn)
|
|
{
|
|
/*
|
|
Sync the InnoDB redo log durably to disk here for the entire group
|
|
commit, so that it will be available for all binlog readers.
|
|
*/
|
|
log_write_up_to(c->pending_lsn, true);
|
|
}
|
|
ibb_pending_lsn_fifo.record_commit(c);
|
|
}
|
|
}
|
|
|
|
|
|
bool
|
|
ibb_write_xa_prepare_ordered(THD *thd,
|
|
handler_binlog_event_group_info *binlog_info,
|
|
uchar engine_count)
|
|
{
|
|
mtr_t mtr{nullptr};
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
|
|
// ToDo: Here need also the oob ref.
|
|
chunk_data_xa_prepare chunk_data(binlog_info->xa_xid, engine_count);
|
|
mtr.start();
|
|
fsp_binlog_write_rec(&chunk_data, &mtr, FSP_BINLOG_TYPE_XA_PREPARE,
|
|
c->lf_pins);
|
|
mtr.commit();
|
|
innodb_binlog_post_write_rec(&mtr, c);
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
ibb_write_xa_prepare(THD *thd,
|
|
handler_binlog_event_group_info *binlog_info,
|
|
uchar engine_count)
|
|
{
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(binlog_info->engine_ptr);
|
|
ut_ad(binlog_info->xa_xid != nullptr);
|
|
if (ibb_xa_xid_hash->add_xid(binlog_info->xa_xid, c))
|
|
return true;
|
|
|
|
/*
|
|
Sync the redo log to ensure that the prepare record is durably written to
|
|
disk. This is necessary before returning OK to the client, to be sure we
|
|
can recover the binlog part of the XA transaction in case of crash.
|
|
*/
|
|
if (srv_flush_log_at_trx_commit > 0)
|
|
log_write_up_to(c->pending_lsn, (srv_flush_log_at_trx_commit & 1));
|
|
ibb_pending_lsn_fifo.record_commit(c);
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
ibb_xa_rollback_ordered(THD *thd, const XID *xid, void **engine_data)
|
|
{
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(*engine_data);
|
|
if (UNIV_UNLIKELY(c == nullptr))
|
|
*engine_data= c= alloc_oob_context();
|
|
|
|
/*
|
|
Write ROLLBACK record to the binlog.
|
|
This will be used during recovery to know that the XID is no longer active,
|
|
allowing purge of the associated binlogs.
|
|
*/
|
|
chunk_data_xa_complete chunk_data(xid, false);
|
|
mtr_t mtr{nullptr};
|
|
mtr.start();
|
|
fsp_binlog_write_rec(&chunk_data, &mtr, FSP_BINLOG_TYPE_XA_COMPLETE,
|
|
c->lf_pins);
|
|
mtr.commit();
|
|
innodb_binlog_post_write_rec(&mtr, c);
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
ibb_xa_rollback(THD *thd, const XID *xid, void **engine_data)
|
|
{
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(*engine_data);
|
|
|
|
/*
|
|
Keep the reference count here, as we need the rollback record to be
|
|
available for recovery until all engines have durably rolled back.
|
|
Decrement will happen after that, in ibb_binlog_unlog().
|
|
*/
|
|
|
|
/*
|
|
Durably write the rollback record to disk. This way, when we return the
|
|
"ok" packet to the client, we are sure that crash recovery will make the
|
|
XID rollback in engines if needed.
|
|
*/
|
|
ut_ad(c->pending_lsn > 0);
|
|
if (srv_flush_log_at_trx_commit > 0)
|
|
log_write_up_to(c->pending_lsn, (srv_flush_log_at_trx_commit & 1));
|
|
|
|
ibb_pending_lsn_fifo.record_commit(c);
|
|
c->pending_lsn= 0;
|
|
return false;
|
|
}
|
|
|
|
|
|
void
|
|
ibb_binlog_unlog(const XID *xid, void **engine_data)
|
|
{
|
|
binlog_oob_context *c=
|
|
static_cast<binlog_oob_context *>(*engine_data);
|
|
if (UNIV_UNLIKELY(c == nullptr))
|
|
*engine_data= c= alloc_oob_context();
|
|
ibb_xid_hash::xid_elem *elem= ibb_xa_xid_hash->grab_xid(xid);
|
|
if (elem)
|
|
{
|
|
ibb_file_hash.oob_ref_dec(elem->refcnt_file_no, c->lf_pins, true);
|
|
my_free(elem);
|
|
}
|
|
}
|
|
|
|
|
|
bool
|
|
innodb_find_binlogs(uint64_t *out_first, uint64_t *out_last)
|
|
{
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
*out_last= last_created_binlog_file_no;
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
mysql_mutex_lock(&purge_binlog_mutex);
|
|
*out_first= earliest_binlog_file_no;
|
|
mysql_mutex_unlock(&purge_binlog_mutex);
|
|
if (*out_first == ~(uint64_t)0 || *out_last == ~(uint64_t)0)
|
|
{
|
|
ut_ad(0 /* Impossible, we wait at startup for binlog to be created. */);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
void
|
|
innodb_binlog_status(uint64_t *out_file_no, uint64_t *out_pos)
|
|
{
|
|
static_assert(BINLOG_NAME_MAX_LEN <= FN_REFLEN,
|
|
"FN_REFLEN too shot to hold InnoDB binlog name");
|
|
uint64_t file_no= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
uint32_t page_no= binlog_cur_page_no;
|
|
uint32_t in_page_offset= binlog_cur_page_offset;
|
|
*out_file_no= file_no;
|
|
*out_pos= ((uint64_t)page_no << ibb_page_size_shift) | in_page_offset;
|
|
}
|
|
|
|
|
|
bool
|
|
innodb_binlog_get_init_state(rpl_binlog_state_base *out_state)
|
|
{
|
|
binlog_chunk_reader chunk_reader(binlog_cur_end_offset);
|
|
bool err= false;
|
|
uint64_t dummy_xa_ref;
|
|
|
|
byte *page_buf= static_cast<byte *>(ut_malloc(ibb_page_size, mem_key_binlog));
|
|
if (!page_buf)
|
|
{
|
|
my_error(ER_OUTOFMEMORY, MYF(0), ibb_page_size);
|
|
return true;
|
|
}
|
|
chunk_reader.set_page_buf(page_buf);
|
|
|
|
mysql_mutex_lock(&purge_binlog_mutex);
|
|
chunk_reader.seek(earliest_binlog_file_no, ibb_page_size);
|
|
int res= read_gtid_state(&chunk_reader, out_state, &dummy_xa_ref);
|
|
mysql_mutex_unlock(&purge_binlog_mutex);
|
|
if (res != 1)
|
|
err= true;
|
|
ut_free(page_buf);
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
bool
|
|
innodb_reset_binlogs()
|
|
{
|
|
bool err= false;
|
|
LF_PINS *lf_pins= lf_hash_get_pins(&ibb_file_hash.hash);
|
|
ut_a(lf_pins);
|
|
ut_a(innodb_binlog_inited >= 2);
|
|
|
|
/* Close existing binlog tablespaces and stop the pre-alloc thread. */
|
|
innodb_binlog_close(false);
|
|
|
|
/*
|
|
Durably flush the redo log to disk. This is mostly to simplify
|
|
conceptually (RESET MASTER is not performance critical). This way, we will
|
|
never see a state where recovery stops at an LSN prior to the RESET
|
|
MASTER, so we do not have any question around truncating the binlog to a
|
|
point before the RESET MASTER.
|
|
*/
|
|
log_buffer_flush_to_disk(true);
|
|
|
|
/* Prevent any flushing activity while resetting. */
|
|
binlog_page_fifo->lock_wait_for_idle();
|
|
binlog_page_fifo->reset();
|
|
ibb_pending_lsn_fifo.reset();
|
|
|
|
ibb_file_hash.remove_up_to(last_created_binlog_file_no, lf_pins);
|
|
|
|
/* Delete all binlog files in the directory. */
|
|
MY_DIR *dir= my_dir(innodb_binlog_directory, MYF(MY_WME));
|
|
if (!dir)
|
|
{
|
|
sql_print_error("Could not read the binlog directory '%s', error code %d",
|
|
innodb_binlog_directory, my_errno);
|
|
err= true;
|
|
}
|
|
else
|
|
{
|
|
size_t num_entries= dir->number_of_files;
|
|
fileinfo *entries= dir->dir_entry;
|
|
for (size_t i= 0; i < num_entries; ++i) {
|
|
const char *name= entries[i].name;
|
|
uint64_t file_no;
|
|
if (!is_binlog_name(name, &file_no))
|
|
continue;
|
|
char full_path[OS_FILE_MAX_PATH];
|
|
binlog_name_make(full_path, file_no);
|
|
if (my_delete(full_path, MYF(MY_WME)))
|
|
err= true;
|
|
/*
|
|
Just as defensive coding, also remove any entry from the file hash
|
|
with this file_no. We would expect to have already deleted everything
|
|
in remove_up_to() above.
|
|
*/
|
|
ibb_file_hash.remove(file_no, lf_pins);
|
|
}
|
|
my_dirend(dir);
|
|
}
|
|
/*
|
|
If we get an error deleting any of the existing files, we report the error
|
|
back up. But we still try to initialize an empty binlog state, better than
|
|
leaving a non-functional binlog with corrupt internal state.
|
|
*/
|
|
|
|
/* Re-initialize empty binlog state and start the pre-alloc thread. */
|
|
innodb_binlog_init_state();
|
|
ibb_pending_lsn_fifo.init(0);
|
|
binlog_page_fifo->unlock_with_delayed_free();
|
|
start_binlog_prealloc_thread();
|
|
binlog_sync_initial();
|
|
|
|
lf_hash_put_pins(lf_pins);
|
|
return err;
|
|
}
|
|
|
|
|
|
/*
|
|
Given a limit_file_no that is still needed by a slave (dump thread).
|
|
The dump thread will need to read any oob records references from event
|
|
groups in that file_no, so it will then also need to read from any earlier
|
|
file_no referenced from limit_file_no.
|
|
|
|
This function handles this dependency, by reading the header page (or
|
|
getting from the ibb_file_hash if available) to get any earlier file_no
|
|
containing such references.
|
|
*/
|
|
static bool
|
|
purge_adjust_limit_file_no(handler_binlog_purge_info *purge_info, LF_PINS *pins)
|
|
{
|
|
uint64_t limit_file_no= purge_info->limit_file_no;
|
|
if (limit_file_no == ~(uint64_t)0)
|
|
return false;
|
|
|
|
uint64_t referenced_file_no;
|
|
if (ibb_file_hash.get_oob_ref_file_no(limit_file_no, pins,
|
|
&referenced_file_no))
|
|
{
|
|
if (referenced_file_no < limit_file_no)
|
|
purge_info->limit_file_no= referenced_file_no;
|
|
else
|
|
ut_ad(referenced_file_no == limit_file_no ||
|
|
referenced_file_no == ~(uint64_t)0);
|
|
return false;
|
|
}
|
|
|
|
byte *page_buf= static_cast<byte *>(ut_malloc(ibb_page_size, mem_key_binlog));
|
|
if (!page_buf)
|
|
{
|
|
my_error(ER_OUTOFMEMORY, MYF(0), ibb_page_size);
|
|
return true;
|
|
}
|
|
char filename[OS_FILE_MAX_PATH];
|
|
binlog_name_make(filename, limit_file_no);
|
|
File fh= my_open(filename, O_RDONLY | O_BINARY, MYF(0));
|
|
if (fh < (File)0)
|
|
{
|
|
my_error(ER_ERROR_ON_READ, MYF(0), filename, my_errno);
|
|
ut_free(page_buf);
|
|
return true;
|
|
}
|
|
int res= crc32_pread_page(fh, page_buf, 0, MYF(0));
|
|
my_close(fh, MYF(0));
|
|
if (res <= 0)
|
|
{
|
|
ut_free(page_buf);
|
|
my_error(ER_ERROR_ON_READ, MYF(0), filename, my_errno);
|
|
return true;
|
|
}
|
|
binlog_header_data header;
|
|
fsp_binlog_extract_header_page(page_buf, &header);
|
|
ut_free(page_buf);
|
|
if (header.is_invalid || header.is_empty)
|
|
{
|
|
my_error(ER_ERROR_ON_READ, MYF(0), filename, my_errno);
|
|
return true;
|
|
}
|
|
if (header.oob_ref_file_no < limit_file_no)
|
|
purge_info->limit_file_no= header.oob_ref_file_no;
|
|
else
|
|
ut_ad(header.oob_ref_file_no == limit_file_no ||
|
|
header.oob_ref_file_no == ~(uint64_t)0);
|
|
ibb_record_in_file_hash(limit_file_no, header.oob_ref_file_no,
|
|
header.xa_ref_file_no, pins);
|
|
return false;
|
|
}
|
|
|
|
|
|
/**
|
|
The low-level function handling binlog purge.
|
|
|
|
How much to purge is determined by:
|
|
|
|
1. Lowest file_no that should not be purged. This is determined as the
|
|
minimum of:
|
|
1a. active_binlog_file_no
|
|
1b. first_open_binlog_file_no
|
|
1c. Any file_no in use by an active dump thread
|
|
1d. Any file_no containing oob data referenced by file_no from (1c)
|
|
1e. Any file_no containing oob data referenced by an active transaction.
|
|
1f. User specified file_no (from PURGE BINARY LOGS TO, if any).
|
|
|
|
2. Unix timestamp specifying the minimal value that should not be purged,
|
|
optional (used by PURGE BINARY LOGS BEFORE and --binlog-expire-log-seconds).
|
|
|
|
3. Maximum total size of binlogs, optional (from --max-binlog-total-size).
|
|
|
|
Sets out_file_no to the earliest binlog file not purged.
|
|
Additionally returns:
|
|
|
|
0 Purged all files as requested.
|
|
1 Some files were not purged due to being currently in-use (by binlog
|
|
writing or active dump threads).
|
|
*/
|
|
static int
|
|
innodb_binlog_purge_low(handler_binlog_purge_info *purge_info,
|
|
uint64_t limit_name_file_no, LF_PINS *lf_pins,
|
|
uint64_t *out_file_no)
|
|
noexcept
|
|
{
|
|
uint64_t limit_file_no= purge_info->limit_file_no;
|
|
bool by_date= purge_info->purge_by_date;
|
|
bool by_size= purge_info->purge_by_size;
|
|
bool by_name= purge_info->purge_by_name;
|
|
uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
bool need_active_flush= (active <= limit_file_no + 2);
|
|
ut_ad(by_date || by_size || by_name);
|
|
ut_a(limit_file_no <= active);
|
|
ut_a(limit_file_no <= first_open_binlog_file_no);
|
|
|
|
mysql_mutex_assert_owner(&purge_binlog_mutex);
|
|
size_t loc_total_size= total_binlog_used_size;
|
|
uint64_t file_no;
|
|
bool want_purge;
|
|
|
|
for (file_no= earliest_binlog_file_no; ; ++file_no)
|
|
{
|
|
want_purge= false;
|
|
|
|
char filename[OS_FILE_MAX_PATH];
|
|
binlog_name_make(filename, file_no);
|
|
MY_STAT stat_buf;
|
|
if (!my_stat(filename, &stat_buf, MYF(0)))
|
|
{
|
|
if (my_errno == ENOENT)
|
|
sql_print_information("InnoDB: File already gone when purging binlog "
|
|
"file '%s'", filename);
|
|
else
|
|
sql_print_warning("InnoDB: Failed to stat() when trying to purge "
|
|
"binlog file '%' (errno: %d)", filename, my_errno);
|
|
continue;
|
|
}
|
|
|
|
if (by_date && stat_buf.st_mtime < purge_info->limit_date)
|
|
want_purge= true;
|
|
if (by_size && loc_total_size > purge_info->limit_size)
|
|
want_purge= true;
|
|
if (by_name && file_no < limit_name_file_no)
|
|
want_purge= true;
|
|
if (!want_purge ||
|
|
file_no >= limit_file_no ||
|
|
ibb_file_hash.get_oob_ref_in_use(file_no, lf_pins))
|
|
break;
|
|
|
|
earliest_binlog_file_no= file_no + 1;
|
|
if (loc_total_size < (size_t)stat_buf.st_size)
|
|
{
|
|
/*
|
|
Somehow we miscounted size, files changed from outside server or
|
|
possibly bug. We will handle not underflowing the total. If this
|
|
assertion becomes a problem for testing, it can just be removed.
|
|
*/
|
|
ut_ad(0);
|
|
}
|
|
else
|
|
loc_total_size-= (size_t)stat_buf.st_size;
|
|
|
|
/*
|
|
Make sure that we always leave at least one binlog file durably non-empty,
|
|
by fsync()'ing the first page of the active file before deleting file
|
|
(active-2). This way, recovery will always have at least one file header
|
|
from which to determine the LSN at which to start applying redo records.
|
|
*/
|
|
if (file_no + 2 >= active && need_active_flush)
|
|
{
|
|
binlog_page_fifo->flush_up_to(active, 0);
|
|
need_active_flush= false;
|
|
}
|
|
|
|
ibb_file_hash.remove(file_no, lf_pins);
|
|
if (my_delete(filename, MYF(0)))
|
|
{
|
|
if (my_errno == ENOENT)
|
|
{
|
|
/*
|
|
File already gone, just ignore the error.
|
|
(This should be somewhat unusual to happen as stat() succeeded).
|
|
*/
|
|
}
|
|
else
|
|
{
|
|
sql_print_warning("InnoDB: Delete failed while trying to purge binlog "
|
|
"file '%s' (errno: %d)", filename, my_error);
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
total_binlog_used_size= loc_total_size;
|
|
*out_file_no= file_no;
|
|
return (want_purge ? 1 : 0);
|
|
}
|
|
|
|
|
|
static void
|
|
innodb_binlog_autopurge(uint64_t first_open_file_no, LF_PINS *pins)
|
|
{
|
|
handler_binlog_purge_info purge_info;
|
|
#ifdef HAVE_REPLICATION
|
|
extern bool ha_binlog_purge_info(handler_binlog_purge_info *out_info);
|
|
bool can_purge= ha_binlog_purge_info(&purge_info);
|
|
#else
|
|
bool can_purge= false;
|
|
memset(&purge_info, 0, sizeof(purge_info)); /* Silence compiler warnings. */
|
|
#endif
|
|
if (!can_purge ||
|
|
!(purge_info.purge_by_size || purge_info.purge_by_date))
|
|
return;
|
|
|
|
if (purge_adjust_limit_file_no(&purge_info, pins))
|
|
return;
|
|
|
|
/* Don't purge any actively open tablespace files. */
|
|
uint64_t orig_limit_file_no= purge_info.limit_file_no;
|
|
if (purge_info.limit_file_no == ~(uint64_t)0 ||
|
|
purge_info.limit_file_no > first_open_file_no)
|
|
purge_info.limit_file_no= first_open_file_no;
|
|
uint64_t active= active_binlog_file_no.load(std::memory_order_relaxed);
|
|
if (purge_info.limit_file_no > active)
|
|
purge_info.limit_file_no= active;
|
|
purge_info.purge_by_name= false;
|
|
|
|
uint64_t file_no;
|
|
int res= innodb_binlog_purge_low(&purge_info, 0, pins, &file_no);
|
|
if (res)
|
|
{
|
|
if (!purge_warning_given)
|
|
{
|
|
char filename[BINLOG_NAME_MAX_LEN];
|
|
binlog_name_make_short(filename, file_no);
|
|
if (purge_info.nonpurge_reason)
|
|
sql_print_information("InnoDB: Binlog file %s could not be purged "
|
|
"because %s",
|
|
filename, purge_info.nonpurge_reason);
|
|
else if (orig_limit_file_no == file_no)
|
|
sql_print_information("InnoDB: Binlog file %s could not be purged "
|
|
"because it is in use by a binlog dump thread "
|
|
"(connected slave)", filename);
|
|
else if (purge_info.limit_file_no == file_no)
|
|
sql_print_information("InnoDB: Binlog file %s could not be purged "
|
|
"because it is in active use", filename);
|
|
else
|
|
sql_print_information("InnoDB: Binlog file %s could not be purged "
|
|
"because it might still be needed", filename);
|
|
purge_warning_given= true;
|
|
}
|
|
}
|
|
else
|
|
purge_warning_given= false;
|
|
}
|
|
|
|
|
|
int
|
|
innodb_binlog_purge(handler_binlog_purge_info *purge_info)
|
|
{
|
|
/*
|
|
Let us check that we do not get an attempt to purge by file, date, and/or
|
|
size at the same time.
|
|
(If we do, it is not necesarily a problem, but this cannot happen in
|
|
current server code).
|
|
*/
|
|
ut_ad(1 == (!!purge_info->purge_by_name +
|
|
!!purge_info->purge_by_date +
|
|
!!purge_info->purge_by_size));
|
|
|
|
if (!purge_info->purge_by_name && !purge_info->purge_by_date &&
|
|
!purge_info->purge_by_size)
|
|
return 0;
|
|
|
|
mysql_mutex_lock(&active_binlog_mutex);
|
|
uint64_t limit_file_no=
|
|
std::min(active_binlog_file_no.load(std::memory_order_relaxed),
|
|
first_open_binlog_file_no);
|
|
uint64_t last_created= last_created_binlog_file_no;
|
|
mysql_mutex_unlock(&active_binlog_mutex);
|
|
|
|
uint64_t to_file_no= ~(uint64_t)0;
|
|
if (purge_info->purge_by_name)
|
|
{
|
|
if (!is_binlog_name(purge_info->limit_name, &to_file_no) ||
|
|
to_file_no > last_created)
|
|
return LOG_INFO_EOF;
|
|
}
|
|
|
|
LF_PINS *lf_pins= lf_hash_get_pins(&ibb_file_hash.hash);
|
|
ut_a(lf_pins);
|
|
if (purge_adjust_limit_file_no(purge_info, lf_pins))
|
|
{
|
|
lf_hash_put_pins(lf_pins);
|
|
return LOG_INFO_IO;
|
|
}
|
|
|
|
uint64_t orig_limit_file_no= purge_info->limit_file_no;
|
|
purge_info->limit_file_no= std::min(orig_limit_file_no, limit_file_no);
|
|
|
|
mysql_mutex_lock(&purge_binlog_mutex);
|
|
uint64_t file_no;
|
|
int res= innodb_binlog_purge_low(purge_info, to_file_no, lf_pins, &file_no);
|
|
mysql_mutex_unlock(&purge_binlog_mutex);
|
|
lf_hash_put_pins(lf_pins);
|
|
|
|
if (res == 1)
|
|
{
|
|
static_assert(sizeof(purge_info->nonpurge_filename) >= BINLOG_NAME_MAX_LEN,
|
|
"No room to return filename");
|
|
binlog_name_make_short(purge_info->nonpurge_filename, file_no);
|
|
if (!purge_info->nonpurge_reason)
|
|
{
|
|
if (limit_file_no == file_no)
|
|
purge_info->nonpurge_reason= "the binlog file is in active use";
|
|
else if (orig_limit_file_no == file_no)
|
|
purge_info->nonpurge_reason= "it is in use by a binlog dump thread "
|
|
"(connected slave)";
|
|
}
|
|
res= LOG_INFO_IN_USE;
|
|
}
|
|
else
|
|
purge_warning_given= false;
|
|
|
|
return res;
|
|
}
|
|
|
|
|
|
bool
|
|
binlog_recover_write_data(bool space_id, uint32_t page_no,
|
|
uint16_t offset,
|
|
lsn_t start_lsn, lsn_t lsn,
|
|
const byte *buf, size_t size) noexcept
|
|
{
|
|
if (!recover_obj.inited)
|
|
return recover_obj.init_recovery(space_id, page_no, offset, start_lsn, lsn,
|
|
buf, size);
|
|
return recover_obj.apply_redo(space_id, page_no, offset, start_lsn, lsn,
|
|
buf, size);
|
|
}
|
|
|
|
|
|
void
|
|
binlog_recover_end(lsn_t lsn) noexcept
|
|
{
|
|
if (recover_obj.inited)
|
|
recover_obj.end_actions(true);
|
|
}
|