mirror of
https://github.com/MariaDB/server.git
synced 2025-01-25 00:04:33 +01:00
d039346a7a
Improve the performance of slave connect using B+-Tree indexes on each binlog file. The index allows fast lookup of a GTID position to the corresponding offset in the binlog file, as well as lookup of a position to find the corresponding GTID position. This eliminates a costly sequential scan of the starting binlog file to find the GTID starting position when a slave connects. This is especially costly if the binlog file is not cached in memory (IO cost), or if it is encrypted or a lot of slaves connect simultaneously (CPU cost). The size of the index files is generally less than 1% of the binlog data, so not expected to be an issue. Most of the work writing the index is done as a background task, in the binlog background thread. This minimises the performance impact on transaction commit. A simple global mutex is used to protect index reads and (background) index writes; this is fine as slave connect is a relatively infrequent operation. Here are the user-visible options and status variables. The feature is on by default and is expected to need no tuning or configuration for most users. binlog_gtid_index On by default. Can be used to disable the indexes for testing purposes. binlog_gtid_index_page_size (default 4096) Page size to use for the binlog GTID index. This is the size of the nodes in the B+-tree used internally in the index. A very small page-size (64 is the minimum) will be less efficient, but can be used to stress the BTree-code during testing. binlog_gtid_index_span_min (default 65536) Control sparseness of the binlog GTID index. If set to N, at most one index record will be added for every N bytes of binlog file written. This can be used to reduce the number of records in the index, at the cost only of having to scan a few more events in the binlog file before finding the target position Two status variables are available to monitor the use of the GTID indexes: Binlog_gtid_index_hit Binlog_gtid_index_miss The "hit" status increments for each successful lookup in a GTID index. The "miss" increments when a lookup is not possible. This indicates that the index file is missing (eg. binlog written by old server version without GTID index support), or corrupt. Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
1434 lines
37 KiB
C++
1434 lines
37 KiB
C++
/*
|
|
Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; version 2 of the License.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
|
*/
|
|
|
|
#include "gtid_index.h"
|
|
#include "sql_const.h"
|
|
#include "log.h"
|
|
|
|
|
|
static const uchar GTID_INDEX_MAGIC[4]= {
|
|
254, 254, 12, 1,
|
|
};
|
|
|
|
Gtid_index_writer *Gtid_index_writer::hot_index_list= nullptr;
|
|
/* gtid_index_mutex is inited in MYSQL_LOG::init_pthread_objects(). */
|
|
mysql_mutex_t Gtid_index_writer::gtid_index_mutex;
|
|
|
|
|
|
Gtid_index_writer::Gtid_index_writer(const char *filename, uint32 offset,
|
|
rpl_binlog_state_base *binlog_state,
|
|
uint32 opt_page_size,
|
|
my_off_t opt_span_min)
|
|
: offset_min_threshold(opt_span_min),
|
|
nodes(nullptr), previous_offset(0),
|
|
max_level(0), index_file(-1),
|
|
error_state(false), file_header_written(false), in_hot_index_list(false)
|
|
{
|
|
uint32 count;
|
|
rpl_gtid *gtid_list;
|
|
page_size= opt_page_size;
|
|
pending_state.init();
|
|
|
|
if (alloc_level_if_missing(0))
|
|
{
|
|
give_error("Out of memory allocating node list");
|
|
return;
|
|
}
|
|
|
|
/*
|
|
Lock the index mutex at this point just before we create the new index
|
|
file on disk. From this point on, and until the index is fully written,
|
|
the reader will find us in the "hot index" list and will be able to read
|
|
from the index while it's still being constructed.
|
|
*/
|
|
lock_gtid_index();
|
|
|
|
build_index_filename(filename);
|
|
int create_flags= O_WRONLY|O_TRUNC|O_BINARY|O_EXCL;
|
|
index_file= mysql_file_create(key_file_gtid_index, index_file_name,
|
|
CREATE_MODE, create_flags, MYF(0));
|
|
if (index_file < 0 && my_errno == EEXIST)
|
|
{
|
|
/*
|
|
It shouldn't happen that an old GTID index file remains, as we remove
|
|
them as part of RESET MASTER and PURGE BINARY LOGS. But if it happens
|
|
due to some external file copy of the user or something, delete any old
|
|
GTID index file first.
|
|
*/
|
|
sql_print_information("Old GTID index file found '%s', deleting",
|
|
index_file_name);
|
|
my_errno= 0;
|
|
mysql_file_delete(key_file_gtid_index, index_file_name, MYF(0));
|
|
index_file= mysql_file_create(key_file_gtid_index, index_file_name,
|
|
CREATE_MODE, create_flags, MYF(0));
|
|
}
|
|
if (index_file < 0)
|
|
{
|
|
give_error("Failed to open new index file for writing");
|
|
goto err;
|
|
}
|
|
|
|
/*
|
|
Write out an initial index record, i.e. corresponding to the GTID_LIST
|
|
event / binlog state at the start of the binlog file.
|
|
*/
|
|
count= binlog_state->count_nolock();
|
|
gtid_list= gtid_list_buffer(count);
|
|
if (count > 0)
|
|
{
|
|
if (!gtid_list)
|
|
goto err;
|
|
binlog_state->get_gtid_list_nolock(gtid_list, count);
|
|
}
|
|
write_record(offset, gtid_list, count);
|
|
|
|
insert_in_hot_index();
|
|
|
|
err:
|
|
unlock_gtid_index();
|
|
}
|
|
|
|
|
|
Gtid_index_writer::~Gtid_index_writer()
|
|
{
|
|
if (in_hot_index_list)
|
|
{
|
|
lock_gtid_index();
|
|
close();
|
|
unlock_gtid_index();
|
|
}
|
|
|
|
if (index_file > 0)
|
|
{
|
|
/*
|
|
Should have been closed by call to Gtid_index_writer::close().
|
|
We can at least avoid leaking file descriptor.
|
|
*/
|
|
mysql_file_close(index_file, MYF(0));
|
|
}
|
|
|
|
if (nodes)
|
|
{
|
|
for (uint32 i= 0; i <= max_level; ++i)
|
|
delete nodes[i];
|
|
my_free(nodes);
|
|
}
|
|
|
|
/*
|
|
state.free() is not needed here, will be called from rpl_binlog_state_base
|
|
destructor.
|
|
*/
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_writer::gtid_index_init()
|
|
{
|
|
mysql_mutex_init(key_gtid_index_lock, >id_index_mutex, MY_MUTEX_INIT_SLOW);
|
|
}
|
|
|
|
void
|
|
Gtid_index_writer::gtid_index_cleanup()
|
|
{
|
|
mysql_mutex_destroy(>id_index_mutex);
|
|
}
|
|
|
|
|
|
const Gtid_index_writer *
|
|
Gtid_index_writer::find_hot_index(const char *file_name)
|
|
{
|
|
mysql_mutex_assert_owner(>id_index_mutex);
|
|
|
|
for (const Gtid_index_writer *p= hot_index_list; p; p= p->next_hot_index)
|
|
{
|
|
if (0 == strcmp(file_name, p->index_file_name))
|
|
return p;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
void
|
|
Gtid_index_writer::insert_in_hot_index()
|
|
{
|
|
mysql_mutex_assert_owner(>id_index_mutex);
|
|
|
|
next_hot_index= hot_index_list;
|
|
hot_index_list= this;
|
|
in_hot_index_list= true;
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_writer::remove_from_hot_index()
|
|
{
|
|
mysql_mutex_assert_owner(>id_index_mutex);
|
|
|
|
Gtid_index_writer **next_ptr_ptr= &hot_index_list;
|
|
for (;;)
|
|
{
|
|
Gtid_index_writer *p= *next_ptr_ptr;
|
|
if (!p)
|
|
break;
|
|
if (p == this)
|
|
{
|
|
*next_ptr_ptr= p->next_hot_index;
|
|
break;
|
|
}
|
|
next_ptr_ptr= &p->next_hot_index;
|
|
}
|
|
next_hot_index= nullptr;
|
|
in_hot_index_list= false;
|
|
}
|
|
|
|
void
|
|
Gtid_index_writer::process_gtid(uint32 offset, const rpl_gtid *gtid)
|
|
{
|
|
rpl_gtid *gtid_list;
|
|
uint32 gtid_count;
|
|
|
|
if (process_gtid_check_batch(offset, gtid, >id_list, >id_count))
|
|
return; // Error
|
|
|
|
if (gtid_list)
|
|
async_update(offset, gtid_list, gtid_count);
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_writer::process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
|
|
rpl_gtid **out_gtid_list,
|
|
uint32 *out_gtid_count)
|
|
{
|
|
uint32 count;
|
|
rpl_gtid *gtid_list;
|
|
|
|
mysql_mutex_assert_not_owner(>id_index_mutex);
|
|
|
|
if (unlikely(pending_state.update_nolock(gtid)))
|
|
{
|
|
give_error("Out of memory processing GTID for binlog GTID index");
|
|
return 1;
|
|
}
|
|
/*
|
|
Sparse index; we record only selected GTIDs, and scan the binlog forward
|
|
from there to find the exact spot.
|
|
*/
|
|
if (offset - previous_offset < offset_min_threshold)
|
|
{
|
|
*out_gtid_list= nullptr;
|
|
*out_gtid_count= 0;
|
|
return 0;
|
|
}
|
|
|
|
count= pending_state.count_nolock();
|
|
DBUG_ASSERT(count > 0 /* Since we just updated with a GTID. */);
|
|
gtid_list= (rpl_gtid *)
|
|
my_malloc(key_memory_binlog_gtid_index, count*sizeof(*gtid_list), MYF(0));
|
|
if (unlikely(!gtid_list))
|
|
{
|
|
give_error("Out of memory allocating GTID list for binlog GTID index");
|
|
return 1;
|
|
}
|
|
if (unlikely(pending_state.get_gtid_list_nolock(gtid_list, count)))
|
|
{
|
|
/* Shouldn't happen as we allocated the list with the correct length. */
|
|
DBUG_ASSERT(false);
|
|
give_error("Internal error allocating GTID list for binlog GTID index");
|
|
my_free(gtid_list);
|
|
return 1;
|
|
}
|
|
pending_state.reset_nolock();
|
|
previous_offset= offset;
|
|
*out_gtid_list= gtid_list;
|
|
*out_gtid_count= count;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_writer::async_update(uint32 event_offset,
|
|
rpl_gtid *gtid_list,
|
|
uint32 gtid_count)
|
|
{
|
|
lock_gtid_index();
|
|
int res= write_record(event_offset, gtid_list, gtid_count);
|
|
unlock_gtid_index();
|
|
my_free(gtid_list);
|
|
return res;
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_writer::close()
|
|
{
|
|
lock_gtid_index();
|
|
if (!error_state)
|
|
{
|
|
|
|
/*
|
|
Write out the remaining pending pages, and insert the final child pointer
|
|
in interior nodes.
|
|
*/
|
|
for (uint32 level= 0; ; ++level)
|
|
{
|
|
uint32 node_ptr= write_current_node(level, level==max_level);
|
|
nodes[level]->reset();
|
|
if (!node_ptr || level >= max_level)
|
|
break;
|
|
add_child_ptr(level+1, node_ptr);
|
|
}
|
|
}
|
|
remove_from_hot_index();
|
|
unlock_gtid_index();
|
|
|
|
if (!error_state)
|
|
{
|
|
if (mysql_file_sync(index_file, MYF(0)))
|
|
give_error("Error syncing index file to disk");
|
|
}
|
|
|
|
mysql_file_close(index_file, MYF(0));
|
|
index_file= (File)-1;
|
|
}
|
|
|
|
|
|
Gtid_index_base::Index_node_base::Index_node_base()
|
|
: first_page(nullptr), current_page(nullptr), current_ptr(nullptr)
|
|
{
|
|
}
|
|
|
|
|
|
Gtid_index_base::Index_node_base::~Index_node_base()
|
|
{
|
|
free_pages();
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_base::Index_node_base::free_pages()
|
|
{
|
|
for (Node_page *p= first_page; p; )
|
|
{
|
|
Node_page *q= p->next;
|
|
my_free(p);
|
|
p= q;
|
|
}
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_base::Index_node_base::reset()
|
|
{
|
|
free_pages();
|
|
first_page= current_page= nullptr;
|
|
}
|
|
|
|
|
|
Gtid_index_base::Gtid_index_base()
|
|
: gtid_buffer(nullptr), gtid_buffer_alloc(0)
|
|
{
|
|
}
|
|
|
|
|
|
Gtid_index_base::~Gtid_index_base()
|
|
{
|
|
if (gtid_buffer_alloc > 0)
|
|
my_free(gtid_buffer);
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_base::make_gtid_index_file_name(char *out_name, size_t bufsize,
|
|
const char *base_filename)
|
|
{
|
|
char *p= strmake(out_name, base_filename, bufsize-1);
|
|
size_t remain= bufsize - (p - out_name);
|
|
strmake(p, ".idx", remain-1);
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_base::build_index_filename(const char *filename)
|
|
{
|
|
make_gtid_index_file_name(index_file_name, sizeof(index_file_name), filename);
|
|
}
|
|
|
|
|
|
rpl_gtid *
|
|
Gtid_index_base::gtid_list_buffer(uint32 count)
|
|
{
|
|
if (gtid_buffer_alloc >= count)
|
|
return gtid_buffer;
|
|
rpl_gtid *new_buffer= (rpl_gtid *)
|
|
my_malloc(key_memory_binlog_gtid_index, count*sizeof(*new_buffer), MYF(0));
|
|
if (!new_buffer)
|
|
{
|
|
give_error("Out of memory allocating buffer for GTID list");
|
|
return NULL;
|
|
}
|
|
my_free(gtid_buffer);
|
|
gtid_buffer= new_buffer;
|
|
gtid_buffer_alloc= count;
|
|
return new_buffer;
|
|
}
|
|
|
|
|
|
Gtid_index_writer::Index_node::Index_node(uint32 level_)
|
|
: num_records(0), level(level_), force_spill_page(false)
|
|
{
|
|
state.init();
|
|
}
|
|
|
|
|
|
Gtid_index_writer::Index_node::~Index_node()
|
|
{
|
|
free_pages();
|
|
}
|
|
|
|
|
|
uint32
|
|
Gtid_index_writer::write_current_node(uint32 level, bool is_root)
|
|
{
|
|
Index_node *n= nodes[level];
|
|
|
|
uint32 node_pos= (uint32)mysql_file_tell(index_file, MYF(0));
|
|
|
|
for (Node_page *p= n->first_page; p ; p= p->next)
|
|
{
|
|
if (unlikely(is_root))
|
|
*(p->flag_ptr) |= PAGE_FLAG_ROOT;
|
|
if (likely(!p->next))
|
|
*(p->flag_ptr) |= PAGE_FLAG_LAST;
|
|
int4store(p->page + page_size - CHECKSUM_LEN,
|
|
my_checksum(0, p->page, page_size - CHECKSUM_LEN));
|
|
if (mysql_file_write(index_file, p->page, page_size, MYF(MY_NABP)))
|
|
{
|
|
give_error("Error writing index page");
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
DBUG_ASSERT(node_pos % page_size == 0);
|
|
/* Page numbers are +1 just so that zero can denote invalid page pointer. */
|
|
return 1 + (node_pos / (uint32)page_size);
|
|
}
|
|
|
|
|
|
void
|
|
Gtid_index_writer::Index_node::reset()
|
|
{
|
|
Index_node_base::reset();
|
|
state.reset_nolock();
|
|
num_records= 0;
|
|
force_spill_page= false;
|
|
}
|
|
|
|
|
|
/*
|
|
Make sure there is requested space in the current page, by allocating a
|
|
new spill page if necessary.
|
|
*/
|
|
int
|
|
Gtid_index_writer::reserve_space(Index_node *n, size_t bytes)
|
|
{
|
|
DBUG_ASSERT(bytes <= page_size);
|
|
if (likely(n->current_page) &&
|
|
likely(n->current_ptr - n->current_page->page + bytes <=
|
|
(page_size - CHECKSUM_LEN)))
|
|
return 0;
|
|
/* Not enough room, allocate a spill page. */
|
|
Node_page *page= alloc_page();
|
|
n->force_spill_page= false;
|
|
if (!page)
|
|
return 1;
|
|
n->current_ptr=
|
|
init_header(page, n->level==0, !n->current_page);
|
|
if (n->current_page)
|
|
n->current_page->next= page;
|
|
else
|
|
n->first_page= page;
|
|
n->current_page= page;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_writer::do_write_record(uint32 level,
|
|
uint32 event_offset,
|
|
const rpl_gtid *gtid_list,
|
|
uint32 gtid_count)
|
|
{
|
|
DBUG_ASSERT(level <= max_level);
|
|
Index_node *n= nodes[level];
|
|
if (reserve_space(n, 8))
|
|
return 1;
|
|
/* Store the count as +1, so that 0 can mean "no more records". */
|
|
int4store(n->current_ptr, gtid_count+1);
|
|
int4store(n->current_ptr+4, event_offset);
|
|
n->current_ptr+= 8;
|
|
for (uint32 i= 0; i < gtid_count; ++i)
|
|
{
|
|
if (reserve_space(n, 16))
|
|
return 1;
|
|
int4store(n->current_ptr, gtid_list[i].domain_id);
|
|
int4store(n->current_ptr+4, gtid_list[i].server_id);
|
|
int8store(n->current_ptr+8, gtid_list[i].seq_no);
|
|
n->current_ptr+= 16;
|
|
}
|
|
|
|
++n->num_records;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
Add a child pointer to the current node on LEVEL.
|
|
The first page has node_ptr=1 just so that a zero node_ptr can be used as
|
|
a no/invalid value (effectively node_ptr points to the end of the target
|
|
page, in unit of pages).
|
|
|
|
Adding a child pointer shouldn't spill to a new page, code must make sure that
|
|
there is always room for the final child pointer in current non-leaf node.
|
|
*/
|
|
int
|
|
Gtid_index_writer::add_child_ptr(uint32 level, my_off_t node_offset)
|
|
{
|
|
DBUG_ASSERT(level <= max_level);
|
|
DBUG_ASSERT(node_offset > 0);
|
|
Index_node *n= nodes[level];
|
|
if (reserve_space(n, 4))
|
|
return 1;
|
|
DBUG_ASSERT(n->current_page);
|
|
DBUG_ASSERT((size_t)(n->current_ptr - n->current_page->page + 4) <=
|
|
page_size - CHECKSUM_LEN);
|
|
|
|
int4store(n->current_ptr, node_offset);
|
|
n->current_ptr+= 4;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
Write one index record to the GTID index, flushing nodes and allocating
|
|
new nodes as necessary.
|
|
*/
|
|
int
|
|
Gtid_index_writer::write_record(uint32 event_offset,
|
|
const rpl_gtid *gtid_list,
|
|
uint32 gtid_count)
|
|
{
|
|
if (error_state)
|
|
return 1; /* Avoid continuing on a possibly corrupt state. */
|
|
|
|
uint32 level= 0;
|
|
/*
|
|
The most frequent case is when there is room in the current page for the
|
|
current position to be written, in which case we exit early in the first
|
|
iteration of the following loop.
|
|
|
|
In the general case, we move up through the path to the root, writing
|
|
lower-level node page to disk and adding child pointers in higher-level
|
|
nodes, until we reach a node that has room. This final node may be a
|
|
freshly allocated new root node in the few times when the height of the
|
|
tree increases.
|
|
*/
|
|
for (;;)
|
|
{
|
|
Index_node *n= nodes[level];
|
|
if (update_gtid_state(&n->state, gtid_list, gtid_count))
|
|
return give_error("Out of memory updating the local GTID state");
|
|
|
|
if (check_room(level, gtid_count))
|
|
{
|
|
/* There is room in the node, just add the index record. */
|
|
return do_write_record(level, event_offset, gtid_list, gtid_count);
|
|
}
|
|
|
|
/*
|
|
This node is full:
|
|
- First, write out this node to disk.
|
|
- Add a child pointer in the parent node (allocating one if needed).
|
|
- On level 0, allocate a new leaf node and add the index record there.
|
|
- On levels >0, skip the last index record when the node gets full
|
|
(B+-Tree has (k-1) keys for k child pointers).
|
|
- Loop to the parent node to add an index record there.
|
|
*/
|
|
uint32 node_ptr= write_current_node(level, false);
|
|
if (!node_ptr)
|
|
return 1;
|
|
if (alloc_level_if_missing(level+1) ||
|
|
add_child_ptr(level+1, node_ptr))
|
|
return 1;
|
|
uint32 new_count= n->state.count_nolock();
|
|
rpl_gtid *new_gtid_list= gtid_list_buffer(new_count);
|
|
if (new_count > 0 && !new_gtid_list)
|
|
return 1;
|
|
if (n->state.get_gtid_list_nolock(new_gtid_list, new_count))
|
|
return give_error("Internal error processing GTID state");
|
|
n->reset();
|
|
if (level == 0)
|
|
{
|
|
if (do_write_record(level, event_offset, new_gtid_list, new_count))
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
Allocate a page for the node. This is mostly to help the reader of hot
|
|
index to not see NULL pointers, and we will need the page later anyway
|
|
to put at least one child pointer to the level below.
|
|
*/
|
|
if (reserve_space(n, 4))
|
|
return 1;
|
|
}
|
|
gtid_list= new_gtid_list;
|
|
gtid_count= new_count;
|
|
++level;
|
|
}
|
|
// NotReached.
|
|
}
|
|
|
|
|
|
bool
|
|
Gtid_index_writer::check_room(uint32 level, uint32 gtid_count)
|
|
{
|
|
Index_node *n= nodes[level];
|
|
/* There's always room in an empty (to-be-allocated) page. */
|
|
if (!n->current_page || n->num_records == 0)
|
|
return true;
|
|
/*
|
|
Make sure we use at least 1/2 a page of room after the initial record,
|
|
setting a flag to allocate a spill page later if needed.
|
|
*/
|
|
size_t avail= page_size - CHECKSUM_LEN - (n->current_ptr - n->current_page->page);
|
|
if (n->num_records==1 && avail < page_size/2)
|
|
{
|
|
n->force_spill_page= true;
|
|
return true;
|
|
}
|
|
if (n->force_spill_page)
|
|
return true;
|
|
size_t needed= 8 + 16*gtid_count;
|
|
/* Non-leaf pages need extra 4 bytes for a child pointer. */
|
|
if (level > 0)
|
|
needed+= 4;
|
|
return needed <= avail;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_writer::alloc_level_if_missing(uint32 level)
|
|
{
|
|
if (likely(nodes))
|
|
{
|
|
if (likely(max_level >= level))
|
|
return 0;
|
|
DBUG_ASSERT(level == max_level+1); // Alloc one at a time
|
|
}
|
|
|
|
Index_node *node= new Index_node(level);
|
|
if (!node)
|
|
return give_error("Out of memory allocating new node");
|
|
Index_node **new_nodes= (Index_node **)
|
|
my_realloc(key_memory_binlog_gtid_index, nodes, (level+1)*sizeof(*nodes),
|
|
MYF(MY_ALLOW_ZERO_PTR|MY_ZEROFILL));
|
|
if (!new_nodes)
|
|
{
|
|
delete node;
|
|
return give_error("Out of memory allocating larger node list");
|
|
}
|
|
new_nodes[level]= node;
|
|
nodes= new_nodes;
|
|
max_level= level;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
Initialize the start of a data page.
|
|
This is at the start of a page, except for the very first page where it
|
|
comes after the global file header.
|
|
Format:
|
|
0 flags.
|
|
1-3 unused padding/reserved.
|
|
|
|
The argument FIRST denotes if this is the first page (if false it is a
|
|
continuation page).
|
|
*/
|
|
uchar *
|
|
Gtid_index_writer::init_header(Node_page *page, bool is_leaf, bool is_first)
|
|
{
|
|
uchar *p= page->page;
|
|
bool is_file_header= !file_header_written;
|
|
|
|
if (unlikely(is_file_header))
|
|
{
|
|
memcpy(p, GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC));
|
|
p+= sizeof(GTID_INDEX_MAGIC);
|
|
*p++= GTID_INDEX_VERSION_MAJOR;
|
|
*p++= GTID_INDEX_VERSION_MINOR;
|
|
/* Flags/padding currently unused. */
|
|
*p++= 0;
|
|
*p++= 0;
|
|
int4store(p, page_size);
|
|
p+= 4;
|
|
DBUG_ASSERT(p == page->page + GTID_INDEX_FILE_HEADER_SIZE);
|
|
file_header_written= true;
|
|
}
|
|
|
|
uchar flags= 0;
|
|
if (is_leaf)
|
|
flags|= PAGE_FLAG_IS_LEAF;
|
|
if (unlikely(!is_first))
|
|
flags|= PAGE_FLAG_IS_CONT;
|
|
page->flag_ptr= p;
|
|
*p++= flags;
|
|
/* Padding/reserved. */
|
|
p+= 3;
|
|
DBUG_ASSERT(p == page->page +
|
|
(is_file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0) +
|
|
GTID_INDEX_PAGE_HEADER_SIZE);
|
|
DBUG_ASSERT((size_t)(p - page->page) < page_size - CHECKSUM_LEN);
|
|
return p;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_base::update_gtid_state(rpl_binlog_state_base *state,
|
|
const rpl_gtid *gtid_list, uint32 gtid_count)
|
|
{
|
|
for (uint32 i= 0; i < gtid_count; ++i)
|
|
if (state->update_nolock(>id_list[i]))
|
|
return 1;
|
|
return 0;
|
|
}
|
|
|
|
|
|
Gtid_index_base::Node_page *Gtid_index_base::alloc_page()
|
|
{
|
|
Node_page *new_node= (Node_page *)
|
|
my_malloc(key_memory_binlog_gtid_index,
|
|
sizeof(Node_page) + page_size,
|
|
MYF(MY_ZEROFILL));
|
|
if (!new_node)
|
|
give_error("Out of memory for allocating index page");
|
|
return new_node;
|
|
}
|
|
|
|
|
|
int Gtid_index_writer::give_error(const char *msg)
|
|
{
|
|
if (!error_state)
|
|
{
|
|
sql_print_information("Error during binlog GTID index creation, will "
|
|
"fallback to slower sequential binlog scan. "
|
|
"Error is: %s", msg);
|
|
error_state= true;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
|
|
Gtid_index_reader::Gtid_index_reader()
|
|
: n(nullptr), index_file(-1),
|
|
file_open(false), index_valid(false), has_root_node(false),
|
|
version_major(0), version_minor(0)
|
|
{
|
|
current_state.init();
|
|
compare_state.init();
|
|
}
|
|
|
|
|
|
Gtid_index_reader::~Gtid_index_reader()
|
|
{
|
|
if (file_open)
|
|
mysql_file_close(index_file, MYF(0));
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::search_offset(uint32 in_offset,
|
|
uint32 *out_offset, uint32 *out_gtid_count)
|
|
{
|
|
in_search_offset= in_offset;
|
|
search_cmp_function= &Gtid_index_reader::search_cmp_offset;
|
|
|
|
return do_index_search(out_offset, out_gtid_count);
|
|
}
|
|
|
|
int
|
|
Gtid_index_reader::search_gtid_pos(slave_connection_state *in_gtid_pos,
|
|
uint32 *out_offset, uint32 *out_gtid_count)
|
|
{
|
|
in_search_gtid_pos= in_gtid_pos;
|
|
search_cmp_function= &Gtid_index_reader::search_cmp_gtid_pos;
|
|
|
|
int res= do_index_search(out_offset, out_gtid_count);
|
|
/* Let's not leave a dangling pointer to the caller's memory. */
|
|
in_search_gtid_pos= nullptr;
|
|
|
|
return res;
|
|
}
|
|
|
|
rpl_gtid *
|
|
Gtid_index_reader::search_gtid_list()
|
|
{
|
|
return gtid_buffer;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::search_cmp_offset(uint32 offset,
|
|
rpl_binlog_state_base *state)
|
|
{
|
|
if (offset <= in_search_offset)
|
|
return 0;
|
|
else
|
|
return -1;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::search_cmp_gtid_pos(uint32 offset,
|
|
rpl_binlog_state_base *state)
|
|
{
|
|
if (state->is_before_pos(in_search_gtid_pos))
|
|
return 0;
|
|
else
|
|
return -1;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::next_page()
|
|
{
|
|
if (!read_page->next)
|
|
return 1;
|
|
read_page= read_page->next;
|
|
read_ptr= read_page->flag_ptr + 4;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::find_bytes(uint32 num_bytes)
|
|
{
|
|
if ((my_ptrdiff_t)(read_ptr - read_page->page + num_bytes) <=
|
|
(my_ptrdiff_t)(page_size - CHECKSUM_LEN))
|
|
return 0;
|
|
return next_page();
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::get_child_ptr(uint32 *out_child_ptr)
|
|
{
|
|
if (find_bytes(4))
|
|
return give_error("Corrupt index, short index node");
|
|
*out_child_ptr= (uint32)uint4korr(read_ptr);
|
|
read_ptr+= 4;
|
|
return 0;
|
|
}
|
|
|
|
|
|
/*
|
|
Read the start of an index record (count of GTIDs in the differential state
|
|
and offset).
|
|
Returns:
|
|
0 ok
|
|
1 EOF, no more data in this node
|
|
*/
|
|
int
|
|
Gtid_index_reader::get_offset_count(uint32 *out_offset, uint32 *out_gtid_count)
|
|
{
|
|
if (find_bytes(8))
|
|
return 1;
|
|
uint32 gtid_count= uint4korr(read_ptr);
|
|
if (gtid_count == 0)
|
|
{
|
|
/* 0 means invalid/no record (we store N+1 for N GTIDs in record). */
|
|
return 1;
|
|
}
|
|
*out_gtid_count= gtid_count - 1;
|
|
*out_offset= uint4korr(read_ptr + 4);
|
|
read_ptr+= 8;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::get_gtid_list(rpl_gtid *out_gtid_list, uint32 count)
|
|
{
|
|
for (uint32 i= 0; i < count; ++i)
|
|
{
|
|
if (find_bytes(16))
|
|
return give_error("Corrupt index, short index node");
|
|
out_gtid_list[i].domain_id= uint4korr(read_ptr);
|
|
out_gtid_list[i].server_id= uint4korr(read_ptr + 4);
|
|
out_gtid_list[i].seq_no= uint8korr(read_ptr + 8);
|
|
read_ptr+= 16;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::open_index_file(const char *binlog_filename)
|
|
{
|
|
close_index_file();
|
|
build_index_filename(binlog_filename);
|
|
if ((index_file= mysql_file_open(key_file_gtid_index, index_file_name,
|
|
O_RDONLY|O_BINARY, MYF(0))) < 0)
|
|
return 1; // No error for missing index (eg. upgrade)
|
|
|
|
file_open= true;
|
|
if (read_file_header())
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
Gtid_index_reader::close_index_file()
|
|
{
|
|
if (!file_open)
|
|
return;
|
|
mysql_file_close(index_file, MYF(0));
|
|
file_open= false;
|
|
index_valid= false;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::do_index_search(uint32 *out_offset, uint32 *out_gtid_count)
|
|
{
|
|
/* In cold index, we require a complete index with a valid root node. */
|
|
if (!has_root_node)
|
|
return -1;
|
|
|
|
return do_index_search_root(out_offset, out_gtid_count);
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::do_index_search_root(uint32 *out_offset,
|
|
uint32 *out_gtid_count)
|
|
{
|
|
current_state.reset_nolock();
|
|
compare_state.reset_nolock();
|
|
/*
|
|
These states will be initialized to the full state stored at the start of
|
|
the root node and then incrementally updated.
|
|
*/
|
|
bool current_state_updated= false;
|
|
|
|
if (read_root_node())
|
|
return -1;
|
|
for (;;)
|
|
{
|
|
if (*n->first_page->flag_ptr & PAGE_FLAG_IS_LEAF)
|
|
break;
|
|
|
|
if (compare_state.load_nolock(¤t_state))
|
|
{
|
|
give_error("Out of memory allocating GTID list");
|
|
return -1;
|
|
}
|
|
uint32 child_ptr;
|
|
if (get_child_ptr(&child_ptr))
|
|
return -1;
|
|
|
|
/* Scan over the keys in the node to find the child pointer to follow */
|
|
for (;;)
|
|
{
|
|
uint32 offset, gtid_count;
|
|
int res= get_offset_count(&offset, >id_count);
|
|
if (res == 1) // EOF?
|
|
{
|
|
/* Follow the right-most child pointer. */
|
|
if (read_node(child_ptr))
|
|
return -1;
|
|
break;
|
|
}
|
|
rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
|
|
uint32 child2_ptr;
|
|
if ((gtid_count > 0 && !gtid_list) ||
|
|
get_gtid_list(gtid_list, gtid_count) ||
|
|
get_child_ptr(&child2_ptr))
|
|
return -1;
|
|
if (update_gtid_state(&compare_state, gtid_list, gtid_count))
|
|
return -1;
|
|
int cmp= (this->*search_cmp_function)(offset, &compare_state);
|
|
if (cmp < 0)
|
|
{
|
|
/* Follow the left child of this key. */
|
|
if (read_node(child_ptr))
|
|
return -1;
|
|
break;
|
|
}
|
|
/* Continue to scan the next key. */
|
|
update_gtid_state(¤t_state, gtid_list, gtid_count);
|
|
current_state_updated= true;
|
|
current_offset= offset;
|
|
child_ptr= child2_ptr;
|
|
}
|
|
}
|
|
return do_index_search_leaf(current_state_updated,
|
|
out_offset, out_gtid_count);
|
|
}
|
|
|
|
int Gtid_index_reader::do_index_search_leaf(bool current_state_updated,
|
|
uint32 *out_offset,
|
|
uint32 *out_gtid_count)
|
|
{
|
|
uint32 offset, gtid_count;
|
|
int res= get_offset_count(&offset, >id_count);
|
|
if (res == 1)
|
|
{
|
|
DBUG_ASSERT(0);
|
|
give_error("Corrupt index; empty leaf node");
|
|
return -1;
|
|
}
|
|
rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
|
|
if ((gtid_count > 0 && !gtid_list) ||
|
|
get_gtid_list(gtid_list, gtid_count))
|
|
return -1;
|
|
/*
|
|
The first key is ignored (already included in the current state), unless
|
|
it is the very first state in the index.
|
|
*/
|
|
if (!current_state_updated)
|
|
update_gtid_state(¤t_state, gtid_list, gtid_count);
|
|
current_offset= offset;
|
|
if (compare_state.load_nolock(¤t_state))
|
|
{
|
|
give_error("Out of memory allocating GTID state");
|
|
return -1;
|
|
}
|
|
int cmp= (this->*search_cmp_function)(offset, &compare_state);
|
|
if (cmp < 0)
|
|
return 0; // Search position is before start of index.
|
|
|
|
/* Scan over the keys in the leaf node. */
|
|
for (;;)
|
|
{
|
|
uint32 offset, gtid_count;
|
|
int res= get_offset_count(&offset, >id_count);
|
|
if (res == 1) // EOF?
|
|
{
|
|
/* Reached end of leaf, last key is the one searched for. */
|
|
break;
|
|
}
|
|
gtid_list= gtid_list_buffer(gtid_count);
|
|
if ((gtid_count > 0 && !gtid_list) ||
|
|
get_gtid_list(gtid_list, gtid_count))
|
|
return -1;
|
|
if (update_gtid_state(&compare_state, gtid_list, gtid_count))
|
|
return -1;
|
|
cmp= (this->*search_cmp_function)(offset, &compare_state);
|
|
if (cmp < 0)
|
|
{
|
|
/* Next key is larger, so current state is the one searched for. */
|
|
break;
|
|
}
|
|
update_gtid_state(¤t_state, gtid_list, gtid_count);
|
|
current_offset= offset;
|
|
}
|
|
|
|
*out_offset= current_offset;
|
|
*out_gtid_count= current_state.count_nolock();
|
|
/* Save the result in the shared gtid list buffer. */
|
|
if ((!(gtid_list= gtid_list_buffer(*out_gtid_count)) && *out_gtid_count > 0) ||
|
|
current_state.get_gtid_list_nolock(gtid_list, *out_gtid_count))
|
|
return -1;
|
|
|
|
return 1;
|
|
}
|
|
|
|
|
|
/*
|
|
Read the file header and check that it's valid and that the format is not
|
|
too new a version for us to be able to read it.
|
|
*/
|
|
int
|
|
Gtid_index_reader::read_file_header()
|
|
{
|
|
if (!file_open)
|
|
return 1;
|
|
|
|
uchar buf[GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE];
|
|
|
|
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, 0, MY_SEEK_SET, MYF(0)) ||
|
|
mysql_file_read(index_file, buf,
|
|
GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE,
|
|
MYF(MY_NABP)))
|
|
return give_error("Error reading page from index file");
|
|
if (memcmp(&buf[0], GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC)))
|
|
return give_error("Corrupt index file, magic not found in header");
|
|
version_major= buf[4];
|
|
version_minor= buf[5];
|
|
/* We cannot safely read a major version we don't know about. */
|
|
if (version_major > GTID_INDEX_VERSION_MAJOR)
|
|
return give_error("Incompatible index file, version too high");
|
|
page_size= uint4korr(&buf[8]);
|
|
|
|
/* Verify checksum integrity of page_size and major/minor version. */
|
|
uint32 crc= my_checksum(0, buf, sizeof(buf));
|
|
uchar *buf3= (uchar *)
|
|
my_malloc(key_memory_binlog_gtid_index, page_size - sizeof(buf), MYF(0));
|
|
if (!buf3)
|
|
return give_error("Error allocating memory for index page");
|
|
int res= 0;
|
|
if (mysql_file_read(index_file, buf3, page_size - sizeof(buf), MYF(MY_NABP)))
|
|
res= give_error("Error reading page from index file");
|
|
else
|
|
{
|
|
crc= my_checksum(crc, buf3, page_size - sizeof(buf) - CHECKSUM_LEN);
|
|
if (crc != uint4korr(buf3 + page_size - sizeof(buf) - CHECKSUM_LEN))
|
|
res= give_error("Corrupt page, invalid checksum");
|
|
}
|
|
my_free(buf3);
|
|
if (res)
|
|
return res;
|
|
|
|
/*
|
|
Check that there is a valid root node at the end of the file.
|
|
If there is not, the index may be a "hot index" that is currently being
|
|
constructed. Or it was only partially written before server crash and not
|
|
recovered for some reason.
|
|
*/
|
|
uchar flags= buf[GTID_INDEX_PAGE_HEADER_SIZE];
|
|
constexpr uchar needed_flags= PAGE_FLAG_ROOT|PAGE_FLAG_LAST;
|
|
if ((flags & needed_flags) == needed_flags)
|
|
{
|
|
/* Special case: the index is a single page, which is the root node. */
|
|
has_root_node= true;
|
|
}
|
|
else
|
|
{
|
|
uchar buf2[GTID_INDEX_PAGE_HEADER_SIZE];
|
|
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
|
|
MY_SEEK_END, MYF(0)) ||
|
|
mysql_file_read(index_file, buf2, GTID_INDEX_PAGE_HEADER_SIZE,
|
|
MYF(MY_NABP)))
|
|
return give_error("Error reading root page from index file");
|
|
flags= buf2[0];
|
|
has_root_node= ((flags & needed_flags) == needed_flags);
|
|
/* No need to verify checksum here, will be done by read_root_node(). */
|
|
}
|
|
index_valid= true;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::verify_checksum(Gtid_index_base::Node_page *page)
|
|
{
|
|
uint32 calc_checksum= my_checksum(0, page->page, page_size - CHECKSUM_LEN);
|
|
uint32 read_checksum= uint4korr(page->page + page_size - CHECKSUM_LEN);
|
|
if (calc_checksum != read_checksum)
|
|
return give_error("Corrupt page, invalid checksum");
|
|
return 0;
|
|
}
|
|
|
|
|
|
Gtid_index_base::Node_page *
|
|
Gtid_index_reader::alloc_and_read_page()
|
|
{
|
|
Node_page *page= alloc_page();
|
|
if (!page)
|
|
{
|
|
give_error("Error allocating memory for index page");
|
|
return nullptr;
|
|
}
|
|
if (mysql_file_read(index_file, page->page, page_size, MYF(MY_NABP)))
|
|
{
|
|
my_free(page);
|
|
give_error("Error reading page from index file");
|
|
return nullptr;
|
|
}
|
|
if (verify_checksum(page))
|
|
{
|
|
my_free(page);
|
|
return nullptr;
|
|
}
|
|
return page;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::read_root_node()
|
|
{
|
|
if (!index_valid || !has_root_node)
|
|
return 1;
|
|
|
|
cold_node.reset();
|
|
n= &cold_node;
|
|
/*
|
|
Read pages one by one from the back of the file until we have a complete
|
|
root node.
|
|
*/
|
|
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
|
|
MY_SEEK_END, MYF(0)))
|
|
return give_error("Error seeking index file");
|
|
|
|
for (;;)
|
|
{
|
|
Node_page *page= alloc_and_read_page();
|
|
if (!page)
|
|
return 1;
|
|
if (mysql_file_tell(index_file, MYF(0)) == page_size)
|
|
page->flag_ptr= &page->page[GTID_INDEX_FILE_HEADER_SIZE];
|
|
else
|
|
page->flag_ptr= &page->page[0];
|
|
page->next= n->first_page;
|
|
n->first_page= page;
|
|
uchar flags= *page->flag_ptr;
|
|
if (unlikely(!(flags & PAGE_FLAG_ROOT)))
|
|
return give_error("Corrupt or truncated index, no root node found");
|
|
if (!(flags & PAGE_FLAG_IS_CONT))
|
|
break; // Found start of root node
|
|
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)(2*page_size),
|
|
MY_SEEK_CUR, MYF(0)))
|
|
return give_error("Error seeking index file for multi-page root node");
|
|
}
|
|
|
|
read_page= n->first_page;
|
|
read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::read_node(uint32 page_ptr)
|
|
{
|
|
DBUG_ASSERT(page_ptr != 0 /* No zero child pointers in on-disk pages. */);
|
|
if (!index_valid || !page_ptr)
|
|
return 1;
|
|
return read_node_cold(page_ptr);
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader::read_node_cold(uint32 page_ptr)
|
|
{
|
|
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, (page_ptr-1)*page_size,
|
|
MY_SEEK_SET, MYF(0)))
|
|
return give_error("Error seeking index file");
|
|
|
|
bool file_header= (page_ptr == 1);
|
|
cold_node.reset();
|
|
n= &cold_node;
|
|
Node_page **next_ptr_ptr= &n->first_page;
|
|
for (;;)
|
|
{
|
|
Node_page *page= alloc_and_read_page();
|
|
if (!page)
|
|
return 1;
|
|
page->flag_ptr= &page->page[file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0];
|
|
file_header= false;
|
|
/* Insert the page at the end of the list. */
|
|
page->next= nullptr;
|
|
*next_ptr_ptr= page;
|
|
next_ptr_ptr= &page->next;
|
|
|
|
uchar flags= *(page->flag_ptr);
|
|
if (flags & PAGE_FLAG_LAST)
|
|
break;
|
|
}
|
|
|
|
read_page= n->first_page;
|
|
read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int Gtid_index_reader::give_error(const char *msg)
|
|
{
|
|
sql_print_information("Error reading binlog GTID index, will "
|
|
"fallback to slower sequential binlog scan. "
|
|
"Error is: %s", msg);
|
|
return 1;
|
|
}
|
|
|
|
|
|
Gtid_index_reader_hot::Gtid_index_reader_hot()
|
|
: hot_writer(nullptr)
|
|
{
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader_hot::get_child_ptr(uint32 *out_child_ptr)
|
|
{
|
|
if (find_bytes(4))
|
|
{
|
|
/*
|
|
If reading hot index, EOF or zero child ptr means the child pointer has
|
|
not yet been written. A zero out_child_ptr makes read_node() read the
|
|
hot node for the child.
|
|
*/
|
|
if (hot_writer)
|
|
{
|
|
*out_child_ptr= 0;
|
|
return 0;
|
|
}
|
|
return give_error("Corrupt index, short index node");
|
|
}
|
|
*out_child_ptr= (uint32)uint4korr(read_ptr);
|
|
read_ptr+= 4;
|
|
return 0;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader_hot::do_index_search(uint32 *out_offset,
|
|
uint32 *out_gtid_count)
|
|
{
|
|
/* Check for a "hot" index. */
|
|
Gtid_index_writer::lock_gtid_index();
|
|
hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
|
|
if (!hot_writer)
|
|
{
|
|
Gtid_index_writer::unlock_gtid_index();
|
|
/*
|
|
Check the index file header (and index end) again, in case it was
|
|
hot when open_index_file() was called, but became cold in the meantime.
|
|
*/
|
|
if (!has_root_node && Gtid_index_reader::read_file_header())
|
|
return -1;
|
|
}
|
|
|
|
int res= do_index_search_root(out_offset, out_gtid_count);
|
|
|
|
if (hot_writer)
|
|
{
|
|
hot_writer= nullptr;
|
|
Gtid_index_writer::unlock_gtid_index();
|
|
}
|
|
return res;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader_hot::read_file_header()
|
|
{
|
|
if (!file_open)
|
|
return 1;
|
|
|
|
Gtid_index_writer::lock_gtid_index();
|
|
hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
|
|
if (!hot_writer)
|
|
Gtid_index_writer::unlock_gtid_index();
|
|
|
|
int res;
|
|
if (hot_writer && hot_writer->max_level == 0)
|
|
{
|
|
/*
|
|
No pages from the hot index have been written to disk, there's just a
|
|
single incomplete node at level 0.
|
|
We have to read the file header from the in-memory page.
|
|
*/
|
|
uchar *p= hot_writer->nodes[0]->first_page->page;
|
|
page_size= uint4korr(p + 8);
|
|
has_root_node= false;
|
|
index_valid= true;
|
|
res= 0;
|
|
}
|
|
else
|
|
res= Gtid_index_reader::read_file_header();
|
|
|
|
if (hot_writer)
|
|
{
|
|
hot_writer= nullptr;
|
|
Gtid_index_writer::unlock_gtid_index();
|
|
}
|
|
return res;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader_hot::read_root_node()
|
|
{
|
|
if (!index_valid)
|
|
return 1;
|
|
|
|
if (hot_writer)
|
|
{
|
|
hot_level= hot_writer->max_level;
|
|
return read_node_hot();
|
|
}
|
|
if (has_root_node)
|
|
{
|
|
return Gtid_index_reader::read_root_node();
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader_hot::read_node(uint32 page_ptr)
|
|
{
|
|
if (!index_valid || (!page_ptr && !hot_writer))
|
|
return 1;
|
|
|
|
if (hot_writer)
|
|
{
|
|
if (!page_ptr)
|
|
{
|
|
/*
|
|
The "hot" index is only partially written. Not yet written child pages
|
|
are indicated by zero child pointers. Such child pages are found from
|
|
the list of active nodes in the writer.
|
|
*/
|
|
if (hot_level <= 0)
|
|
{
|
|
DBUG_ASSERT(0 /* Should be no child pointer to follow on leaf page. */);
|
|
return give_error("Corrupt hot index (child pointer on leaf page");
|
|
}
|
|
DBUG_ASSERT(n == hot_writer->nodes[hot_level]);
|
|
--hot_level;
|
|
return read_node_hot();
|
|
}
|
|
|
|
/*
|
|
We started searching the "hot" index, but now we've reached a "cold"
|
|
part of the index that's already fully written. So leave the "hot index"
|
|
mode and continue reading pages from the on-disk index from here.
|
|
*/
|
|
hot_writer= nullptr;
|
|
Gtid_index_writer::unlock_gtid_index();
|
|
}
|
|
|
|
return read_node_cold(page_ptr);
|
|
}
|
|
|
|
|
|
int
|
|
Gtid_index_reader_hot::read_node_hot()
|
|
{
|
|
if (hot_writer->error_state)
|
|
return give_error("Cannot access hot index");
|
|
n= hot_writer->nodes[hot_level];
|
|
read_page= n->first_page;
|
|
/* The writer should allocate pages for all nodes. */
|
|
DBUG_ASSERT(read_page != nullptr);
|
|
if (!read_page)
|
|
return give_error("Page not available in hot index");
|
|
read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
|
|
return 0;
|
|
}
|