mirror of
https://github.com/MariaDB/server.git
synced 2025-01-16 03:52:35 +01:00
72cc58bb71
for large transaction Description =========== When a transaction commits, it copies the binlog events from the binlog cache to the binlog file. Very large transactions (e.g. gigabytes) can stall other transactions for a long time because the data is copied while holding LOCK_log, which blocks other commits from binlogging. The solution in this patch is to rename the binlog cache file to a binlog file instead of copying it, if the committing transaction has a large binlog cache. Rename is a very fast operation, so it doesn't block other transactions for a long time. Design ====== * binlog_large_commit_threshold type: ulonglong scope: global dynamic: yes default: 128MB Only binlog cache temporary files larger than 128MB are renamed to binlog files. * #binlog_cache_files directory To support rename, all binlog cache temporary files are now managed as normal files. The `#binlog_cache_files` directory is in the same directory as the binlog files. It is created at server startup if it doesn't exist. Otherwise, all files in the directory are deleted at startup. The temporary files are named with the ML_ prefix and the memory address of the binlog_cache_data object, which guarantees that the name is unique. * Reserve space To support the rename feature, enough space must be reserved at the beginning of the binlog cache file. The space is required for the Format description, Gtid list, checkpoint and Gtid events when renaming it to a binlog file. Since binlog_cache_data's cache_log is directly accessed by binlog log, online alter and wsrep, it is not easy to update all the code. Thus the binlog cache will not reserve space if it is not a session binlog cache or if a wsrep session is enabled. - m_file_reserved_bytes Stores the bytes reserved at the beginning of the cache file. It is initialized in write_prepare() and cleared by reset(). The reserved file header is hidden from callers. Thus there is no change for callers. E.g. - get_byte_position() still gets the length of binlog data written to the cache, not the file length. 
- truncate(0) will truncate the file to m_file_reserved_bytes, not to 0. - write_prepare() write_prepare() is called every time anything is written into the cache. It will call init_file_reserved_bytes() to create the cache file (if it doesn't exist) and reserve suitable space if the data written exceeds the buffer's size. * Binlog_commit_by_rotate It is used to encapsulate the code for renaming a binlog cache temporary file to a binlog file. - should_commit_by_rotate() It is called by write_transaction_to_binlog_events() to check if a binlog cache should be renamed to a binlog file. - commit() That is the entry point to rename a binlog cache and commit the transaction. Both rename and commit are protected by LOCK_log, thus no other transaction can write anything into the renamed binlog before it. Rename happens in a rotation. After the new binlog file is generated, replace_binlog_file() is called to: - copy data from the new binlog file to its binlog cache file. - write the gtid event. - rename the binlog cache file to the binlog file. After that the rotation will continue to succeed. Then the transaction is committed in a separate group by itself. Its cache file will be detached and the cache log will be reset before calling trx_group_commit_with_engines(). Thus only the Xid event will be written.
381 lines
11 KiB
C++
381 lines
11 KiB
C++
/*
|
|
Copyright (c) 2023, MariaDB plc
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; version 2 of
|
|
the License.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
|
|
*/
|
|
|
|
#include "log_event.h"
|
|
|
|
static constexpr my_off_t MY_OFF_T_UNDEF= ~0ULL;
|
|
/** Truncate cache log files bigger than this */
|
|
static constexpr my_off_t CACHE_FILE_TRUNC_SIZE = 65536;
|
|
|
|
/**
|
|
Create binlog cache directory if it doesn't exist, otherwise delete all
|
|
files existing in the directory.
|
|
|
|
@retval false Succeeds to initialize the directory.
|
|
@retval true Failed to initialize the directory.
|
|
*/
|
|
bool init_binlog_cache_dir();
|
|
|
|
extern char binlog_cache_dir[FN_REFLEN];
|
|
|
|
/*
|
|
Helper classes to store non-transactional and transactional data
|
|
before copying it to the binary log.
|
|
*/
|
|
|
|
class binlog_cache_data
|
|
{
|
|
public:
|
|
binlog_cache_data(bool precompute_checksums):
|
|
before_stmt_pos(MY_OFF_T_UNDEF), m_pending(0), status(0),
|
|
incident(FALSE), precompute_checksums(precompute_checksums),
|
|
saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
|
|
ptr_binlog_cache_disk_use(0), m_file_reserved_bytes(0)
|
|
{
|
|
/*
|
|
Read the current checksum setting. We will use this setting to decide
|
|
whether to pre-compute checksums in the cache. Then when writing the cache
|
|
to the actual binlog, another check will be made and checksums recomputed
|
|
in the unlikely case that the setting changed meanwhile.
|
|
*/
|
|
checksum_opt= !precompute_checksums ? BINLOG_CHECKSUM_ALG_OFF :
|
|
(enum_binlog_checksum_alg)binlog_checksum_options;
|
|
}
|
|
|
|
~binlog_cache_data()
|
|
{
|
|
DBUG_ASSERT(empty());
|
|
|
|
if (cache_log.file != -1 && !encrypt_tmp_files)
|
|
unlink(my_filename(cache_log.file));
|
|
|
|
close_cached_file(&cache_log);
|
|
}
|
|
|
|
/*
|
|
Return 1 if there is no relevant entries in the cache
|
|
|
|
This is:
|
|
- Cache is empty
|
|
- There are row or critical (DDL?) events in the cache
|
|
|
|
The status test is needed to avoid writing entries with only
|
|
a table map entry, which would crash in do_apply_event() on the slave
|
|
as it assumes that there is always a row entry after a table map.
|
|
*/
|
|
bool empty() const
|
|
{
|
|
return (pending() == NULL &&
|
|
(my_b_write_tell(&cache_log) - m_file_reserved_bytes == 0 ||
|
|
((status & (LOGGED_ROW_EVENT | LOGGED_CRITICAL)) == 0)));
|
|
}
|
|
|
|
Rows_log_event *pending() const
|
|
{
|
|
return m_pending;
|
|
}
|
|
|
|
void set_pending(Rows_log_event *const pending_arg)
|
|
{
|
|
m_pending= pending_arg;
|
|
}
|
|
|
|
void set_incident(void)
|
|
{
|
|
incident= TRUE;
|
|
}
|
|
|
|
bool has_incident(void) const
|
|
{
|
|
return(incident);
|
|
}
|
|
|
|
void reset()
|
|
{
|
|
bool cache_was_empty= empty();
|
|
bool truncate_file= (cache_log.file != -1 &&
|
|
my_b_write_tell(&cache_log) >
|
|
MY_MIN(CACHE_FILE_TRUNC_SIZE, binlog_stmt_cache_size));
|
|
// m_file_reserved_bytes must be reset to 0, before truncate.
|
|
m_file_reserved_bytes= 0;
|
|
truncate(0,1); // Forget what's in cache
|
|
checksum_opt= !precompute_checksums ? BINLOG_CHECKSUM_ALG_OFF :
|
|
(enum_binlog_checksum_alg)binlog_checksum_options;
|
|
if (!cache_was_empty)
|
|
compute_statistics();
|
|
if (truncate_file)
|
|
truncate_io_cache(&cache_log);
|
|
status= 0;
|
|
incident= FALSE;
|
|
before_stmt_pos= MY_OFF_T_UNDEF;
|
|
DBUG_ASSERT(empty());
|
|
}
|
|
|
|
my_off_t get_byte_position() const
|
|
{
|
|
DBUG_ASSERT(cache_log.type == WRITE_CACHE);
|
|
return my_b_tell(&cache_log) - m_file_reserved_bytes;
|
|
}
|
|
|
|
my_off_t get_prev_position() const
|
|
{
|
|
return(before_stmt_pos);
|
|
}
|
|
|
|
void set_prev_position(my_off_t pos)
|
|
{
|
|
before_stmt_pos= pos;
|
|
}
|
|
|
|
void restore_prev_position()
|
|
{
|
|
truncate(before_stmt_pos);
|
|
}
|
|
|
|
void restore_savepoint(my_off_t pos)
|
|
{
|
|
truncate(pos);
|
|
if (pos < before_stmt_pos)
|
|
before_stmt_pos= MY_OFF_T_UNDEF;
|
|
}
|
|
|
|
void set_binlog_cache_info(my_off_t param_max_binlog_cache_size,
|
|
ulong *param_ptr_binlog_cache_use,
|
|
ulong *param_ptr_binlog_cache_disk_use)
|
|
{
|
|
/*
|
|
The assertions guarantee that the set_binlog_cache_info is
|
|
called just once and information passed as parameters are
|
|
never zero.
|
|
|
|
This is done while calling the constructor binlog_cache_mngr.
|
|
We cannot set information in the constructor binlog_cache_data
|
|
because the space for binlog_cache_mngr is allocated through
|
|
a placement new.
|
|
|
|
In the future, we can refactor this and change it to avoid
|
|
the set_binlog_info.
|
|
*/
|
|
DBUG_ASSERT(saved_max_binlog_cache_size == 0);
|
|
DBUG_ASSERT(param_max_binlog_cache_size != 0);
|
|
DBUG_ASSERT(ptr_binlog_cache_use == 0);
|
|
DBUG_ASSERT(param_ptr_binlog_cache_use != 0);
|
|
DBUG_ASSERT(ptr_binlog_cache_disk_use == 0);
|
|
DBUG_ASSERT(param_ptr_binlog_cache_disk_use != 0);
|
|
|
|
saved_max_binlog_cache_size= param_max_binlog_cache_size;
|
|
ptr_binlog_cache_use= param_ptr_binlog_cache_use;
|
|
ptr_binlog_cache_disk_use= param_ptr_binlog_cache_disk_use;
|
|
cache_log.end_of_file= saved_max_binlog_cache_size;
|
|
}
|
|
|
|
void add_status(enum_logged_status status_arg)
|
|
{
|
|
status|= status_arg;
|
|
}
|
|
|
|
/**
|
|
This function is called everytime when anything is being written into the
|
|
cache_log. To support rename binlog cache to binlog file, the cache_log
|
|
should be initialized with reserved space.
|
|
*/
|
|
bool write_prepare(size_t write_length)
|
|
{
|
|
/* Data will exceed the buffer size in this write */
|
|
if (unlikely(cache_log.write_pos + write_length > cache_log.write_end &&
|
|
cache_log.pos_in_file == 0))
|
|
{
|
|
/* Only session's binlog cache need to reserve space. */
|
|
if (cache_log.dir == binlog_cache_dir && !encrypt_tmp_files)
|
|
return init_file_reserved_bytes();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
For session's binlog cache, it have to call this function to skip the
|
|
reserved before reading the cache file.
|
|
*/
|
|
bool init_for_read()
|
|
{
|
|
return reinit_io_cache(&cache_log, READ_CACHE, m_file_reserved_bytes, 0, 0);
|
|
}
|
|
|
|
/**
|
|
For session's binlog cache, it have to call this function to get the
|
|
actual data length.
|
|
*/
|
|
my_off_t length_for_read() const
|
|
{
|
|
DBUG_ASSERT(cache_log.type == READ_CACHE);
|
|
return cache_log.end_of_file - m_file_reserved_bytes;
|
|
}
|
|
|
|
/**
|
|
It function returns the cache file's actual length which includes the
|
|
reserved space.
|
|
*/
|
|
my_off_t temp_file_length()
|
|
{
|
|
return my_b_tell(&cache_log);
|
|
}
|
|
|
|
uint32 file_reserved_bytes() { return m_file_reserved_bytes; }
|
|
|
|
/**
|
|
Flush and sync the data of the file into storage.
|
|
|
|
@retval true Error happens
|
|
@retval false Succeeds
|
|
*/
|
|
bool sync_temp_file()
|
|
{
|
|
DBUG_ASSERT(cache_log.file != -1);
|
|
|
|
if (my_b_flush_io_cache(&cache_log, 1) ||
|
|
mysql_file_sync(cache_log.file, MYF(0)))
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
Copy the name of the cache file to the argument name.
|
|
*/
|
|
const char *temp_file_name() { return my_filename(cache_log.file); }
|
|
|
|
/**
|
|
It is called after renaming the cache file to a binlog file. The file
|
|
now is a binlog file, so detach it from the binlog cache.
|
|
*/
|
|
void detach_temp_file();
|
|
|
|
/*
|
|
Cache to store data before copying it to the binary log.
|
|
*/
|
|
IO_CACHE cache_log;
|
|
|
|
protected:
|
|
/*
|
|
Binlog position before the start of the current statement.
|
|
*/
|
|
my_off_t before_stmt_pos;
|
|
|
|
private:
|
|
/*
|
|
Pending binrows event. This event is the event where the rows are currently
|
|
written.
|
|
*/
|
|
Rows_log_event *m_pending;
|
|
|
|
/*
|
|
Bit flags for what has been writing to cache. Used to
|
|
discard logs without any data changes.
|
|
see enum_logged_status;
|
|
*/
|
|
uint32 status;
|
|
|
|
public:
|
|
/*
|
|
The algorithm (if any) used to pre-compute checksums in the cache.
|
|
Initialized from binlog_checksum_options when the cache is reset.
|
|
*/
|
|
enum_binlog_checksum_alg checksum_opt;
|
|
|
|
private:
|
|
/*
|
|
This indicates that some events did not get into the cache and most likely
|
|
it is corrupted.
|
|
*/
|
|
bool incident;
|
|
|
|
/* Whether the caller requested precomputing checksums. */
|
|
bool precompute_checksums;
|
|
|
|
/**
|
|
This function computes binlog cache and disk usage.
|
|
*/
|
|
void compute_statistics()
|
|
{
|
|
statistic_increment(*ptr_binlog_cache_use, &LOCK_status);
|
|
if (cache_log.disk_writes != 0)
|
|
{
|
|
#ifdef REAL_STATISTICS
|
|
statistic_add(*ptr_binlog_cache_disk_use,
|
|
cache_log.disk_writes, &LOCK_status);
|
|
#else
|
|
statistic_increment(*ptr_binlog_cache_disk_use, &LOCK_status);
|
|
#endif
|
|
cache_log.disk_writes= 0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
Stores the values of maximum size of the cache allowed when this cache
|
|
is configured. This corresponds to either
|
|
. max_binlog_cache_size or max_binlog_stmt_cache_size.
|
|
*/
|
|
my_off_t saved_max_binlog_cache_size;
|
|
|
|
/*
|
|
Stores a pointer to the status variable that keeps track of the in-memory
|
|
cache usage. This corresponds to either
|
|
. binlog_cache_use or binlog_stmt_cache_use.
|
|
*/
|
|
ulong *ptr_binlog_cache_use;
|
|
|
|
/*
|
|
Stores a pointer to the status variable that keeps track of the disk
|
|
cache usage. This corresponds to either
|
|
. binlog_cache_disk_use or binlog_stmt_cache_disk_use.
|
|
*/
|
|
ulong *ptr_binlog_cache_disk_use;
|
|
|
|
/*
|
|
Stores the bytes reserved at the begin of the cache file. It could be
|
|
0 for cases that reserved space are not supported. see write_prepare().
|
|
*/
|
|
uint32 m_file_reserved_bytes {0};
|
|
|
|
/*
|
|
It truncates the cache to a certain position. This includes deleting the
|
|
pending event.
|
|
*/
|
|
void truncate(my_off_t pos, bool reset_cache=0)
|
|
{
|
|
DBUG_PRINT("info", ("truncating to position %lu", (ulong) pos));
|
|
cache_log.error=0;
|
|
if (pending())
|
|
{
|
|
delete pending();
|
|
set_pending(0);
|
|
}
|
|
my_bool res __attribute__((unused))= reinit_io_cache(
|
|
&cache_log, WRITE_CACHE, pos + m_file_reserved_bytes, 0, reset_cache);
|
|
DBUG_ASSERT(res == 0);
|
|
cache_log.end_of_file= saved_max_binlog_cache_size;
|
|
}
|
|
|
|
/**
|
|
Reserve required space at the begin of the tempoary file. It will create
|
|
the temporary file if it doesn't exist.
|
|
*/
|
|
bool init_file_reserved_bytes();
|
|
|
|
binlog_cache_data& operator=(const binlog_cache_data& info);
|
|
binlog_cache_data(const binlog_cache_data& info);
|
|
};
|