mariadb/sql/log_cache.h
Libing Song 72cc58bb71 MDEV-32014 Rename binlog cache temporary file to binlog file
for large transaction

Description
===========
When a transaction commits, it copies the binlog events from
binlog cache to binlog file. Very large transactions
(eg. gigabytes) can stall other transactions for a long time
because the data is copied while holding LOCK_log, which blocks
other commits from binlogging.

The solution in this patch is to rename the binlog cache file to
a binlog file instead of copying it, if the committing transaction has
a large binlog cache. Rename is a very fast operation, so it doesn't
block other transactions for a long time.

Design
======
* binlog_large_commit_threshold
  type: ulonglong
  scope: global
  dynamic: yes
  default: 128MB

  Only binlog cache temporary files larger than this threshold
  (128MB by default) are renamed to a binlog file.

* #binlog_cache_files directory
  To support rename, all binlog cache temporary files are now managed
  as normal files. The `#binlog_cache_files` directory is in the same
  directory as the binlog files. It is created at server startup if it doesn't
  exist. Otherwise, all files in the directory are deleted at startup.

  The temporary files are named with the ML_ prefix followed by the memory
  address of the binlog_cache_data object, which guarantees the name is unique.

* Reserve space
  To support the rename feature, enough space must be reserved at the
  beginning of the binlog cache file. The space is required for the
  Format description, Gtid list, checkpoint and Gtid events when
  renaming it to a binlog file.

  Since binlog_cache_data's cache_log is directly accessed by binlog log,
  online alter and wsrep, it is not easy to update all the code. Thus the
  binlog cache will not reserve space if it is not a session binlog cache or
  if a wsrep session is enabled.

  - m_file_reserved_bytes
    Stores the number of bytes reserved at the beginning of the cache file.
    It is initialized in write_prepare() and cleared by reset().

    The reserved file header is hidden from callers, so nothing
    changes for them. E.g.
    - get_byte_position() still returns the length of the binlog data
      written to the cache, not the file length.
    - truncate(0) truncates the file to m_file_reserved_bytes, not to 0.

  - write_prepare()
    write_prepare() is called every time anything is about to be written
    into the cache. It calls init_file_reserved_bytes() to create
    the cache file (if it doesn't exist) and reserve suitable space if
    the data written exceeds the buffer's size.

* Binlog_commit_by_rotate
  It is used to encapsulate the code for renaming a binlog cache
  temporary file to a binlog file.
  - should_commit_by_rotate()
    It is called by write_transaction_to_binlog_events() to check if
    a binlog cache should be renamed to a binlog file.
  - commit()
    This is the entry point to rename a binlog cache and commit the
    transaction. Both rename and commit are protected by LOCK_log,
    thus no other transaction can write anything into the renamed
    binlog before it.

    Rename happens in a rotation. After the new binlog file is generated,
    replace_binlog_file() is called to:
    - copy data from the new binlog file to its binlog cache file.
    - write gtid event.
    - rename the binlog cache file to binlog file.

    After that the rotation continues and succeeds. Then the transaction
    is committed in a separate group by itself. Its cache file is
    detached and the cache log is reset before calling
    trx_group_commit_with_engines(). Thus only the Xid event is written.
2024-10-17 07:53:59 -06:00

381 lines
11 KiB
C++

/*
Copyright (c) 2023, MariaDB plc
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
*/
#include "log_event.h"
/**
  Sentinel position (all bits set) meaning "no position recorded".
  binlog_cache_data uses it as the initial and reset value of its
  before-statement position.
*/
static constexpr my_off_t MY_OFF_T_UNDEF= ~0ULL;
/**
  Truncate cache log files bigger than this.
  binlog_cache_data::reset() compares the cache write position against the
  smaller of this value and binlog_stmt_cache_size.
*/
static constexpr my_off_t CACHE_FILE_TRUNC_SIZE = 65536;
/**
  Create binlog cache directory if it doesn't exist, otherwise delete all
  files existing in the directory.

  @retval false Succeeds to initialize the directory.
  @retval true  Failed to initialize the directory.
*/
bool init_binlog_cache_dir();
/*
  Buffer holding the binlog cache directory; it is only declared here and
  defined elsewhere. binlog_cache_data::write_prepare() compares
  cache_log.dir against this array's address to decide whether space must
  be reserved in the cache file.
*/
extern char binlog_cache_dir[FN_REFLEN];
/*
  Helper class to store non-transactional or transactional data
  before copying it to the binary log.

  The data is buffered in cache_log. When m_file_reserved_bytes is non-zero,
  that many bytes at the beginning of the cache file are reserved and hidden
  from callers: get_byte_position(), truncate() and length_for_read() all
  work with positions that exclude the reserved space.
*/
class binlog_cache_data
{
public:
  /**
    @param precompute_checksums  Whether the caller requests checksums to be
                                 pre-computed in the cache.
  */
  binlog_cache_data(bool precompute_checksums):
    before_stmt_pos(MY_OFF_T_UNDEF), m_pending(0), status(0),
    incident(FALSE), precompute_checksums(precompute_checksums),
    saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
    ptr_binlog_cache_disk_use(0), m_file_reserved_bytes(0)
  {
    /*
      Read the current checksum setting. We will use this setting to decide
      whether to pre-compute checksums in the cache. Then when writing the
      cache to the actual binlog, another check will be made and checksums
      recomputed in the unlikely case that the setting changed meanwhile.
    */
    checksum_opt= !precompute_checksums ? BINLOG_CHECKSUM_ALG_OFF :
      (enum_binlog_checksum_alg)binlog_checksum_options;
  }

  /**
    The cache is expected to be empty at destruction (asserted in debug
    builds). If a cache file is open and temporary files are not encrypted,
    the file is unlinked by name before the cache is closed.
  */
  ~binlog_cache_data()
  {
    DBUG_ASSERT(empty());
    if (cache_log.file != -1 && !encrypt_tmp_files)
      unlink(my_filename(cache_log.file));
    close_cached_file(&cache_log);
  }

  /*
    Return true if there are no relevant entries in the cache.

    This is the case when there is no pending rows event and either:
    - nothing has been written after the reserved space, or
    - no row or critical (DDL?) event has been logged, i.e. neither
      LOGGED_ROW_EVENT nor LOGGED_CRITICAL is set in status.

    The status test is needed to avoid writing entries with only
    a table map entry, which would crash in do_apply_event() on the slave
    as it assumes that there is always a row entry after a table map.
  */
  bool empty() const
  {
    return (pending() == NULL &&
            (my_b_write_tell(&cache_log) - m_file_reserved_bytes == 0 ||
             ((status & (LOGGED_ROW_EVENT | LOGGED_CRITICAL)) == 0)));
  }

  /** Return the pending rows event, or NULL if there is none. */
  Rows_log_event *pending() const
  {
    return m_pending;
  }

  /** Set the pending rows event. The previous one is not deleted here. */
  void set_pending(Rows_log_event *const pending_arg)
  {
    m_pending= pending_arg;
  }

  /** Mark the cache as having had an incident. */
  void set_incident(void)
  {
    incident= TRUE;
  }

  /** Return true if set_incident() was called since the last reset(). */
  bool has_incident(void) const
  {
    return(incident);
  }

  /**
    Forget everything in the cache and bring it back to its initial state.

    The "was empty" and "should truncate file" decisions are taken before
    anything is modified, because they depend on the current write position
    and on m_file_reserved_bytes.
  */
  void reset()
  {
    bool cache_was_empty= empty();
    bool truncate_file= (cache_log.file != -1 &&
                         my_b_write_tell(&cache_log) >
                         MY_MIN(CACHE_FILE_TRUNC_SIZE, binlog_stmt_cache_size));

    // m_file_reserved_bytes must be reset to 0, before truncate.
    m_file_reserved_bytes= 0;
    truncate(0,1);                              // Forget what's in cache

    /* Re-read the checksum setting, as in the constructor. */
    checksum_opt= !precompute_checksums ? BINLOG_CHECKSUM_ALG_OFF :
      (enum_binlog_checksum_alg)binlog_checksum_options;

    if (!cache_was_empty)
      compute_statistics();
    if (truncate_file)
      truncate_io_cache(&cache_log);

    status= 0;
    incident= FALSE;
    before_stmt_pos= MY_OFF_T_UNDEF;
    DBUG_ASSERT(empty());
  }

  /**
    Return the current position in the cache, not counting the reserved
    space. The cache must be in WRITE_CACHE mode (asserted in debug builds).
  */
  my_off_t get_byte_position() const
  {
    DBUG_ASSERT(cache_log.type == WRITE_CACHE);
    return my_b_tell(&cache_log) - m_file_reserved_bytes;
  }

  /** Return the saved before-statement position (may be MY_OFF_T_UNDEF). */
  my_off_t get_prev_position() const
  {
    return(before_stmt_pos);
  }

  /** Save the before-statement position. */
  void set_prev_position(my_off_t pos)
  {
    before_stmt_pos= pos;
  }

  /** Truncate the cache back to the saved before-statement position. */
  void restore_prev_position()
  {
    truncate(before_stmt_pos);
  }

  /**
    Truncate the cache to a savepoint position. If the savepoint lies before
    the saved before-statement position, that position is invalidated.
  */
  void restore_savepoint(my_off_t pos)
  {
    truncate(pos);
    if (pos < before_stmt_pos)
      before_stmt_pos= MY_OFF_T_UNDEF;
  }

  void set_binlog_cache_info(my_off_t param_max_binlog_cache_size,
                             ulong *param_ptr_binlog_cache_use,
                             ulong *param_ptr_binlog_cache_disk_use)
  {
    /*
      The assertions guarantee that the set_binlog_cache_info is
      called just once and information passed as parameters are
      never zero.

      This is done while calling the constructor binlog_cache_mngr.
      We cannot set information in the constructor binlog_cache_data
      because the space for binlog_cache_mngr is allocated through
      a placement new.

      In the future, we can refactor this and change it to avoid
      the set_binlog_info.
    */
    DBUG_ASSERT(saved_max_binlog_cache_size == 0);
    DBUG_ASSERT(param_max_binlog_cache_size != 0);
    DBUG_ASSERT(ptr_binlog_cache_use == 0);
    DBUG_ASSERT(param_ptr_binlog_cache_use != 0);
    DBUG_ASSERT(ptr_binlog_cache_disk_use == 0);
    DBUG_ASSERT(param_ptr_binlog_cache_disk_use != 0);

    saved_max_binlog_cache_size= param_max_binlog_cache_size;
    ptr_binlog_cache_use= param_ptr_binlog_cache_use;
    ptr_binlog_cache_disk_use= param_ptr_binlog_cache_disk_use;
    cache_log.end_of_file= saved_max_binlog_cache_size;
  }

  /** Add the given flag(s) to the set of things logged to this cache. */
  void add_status(enum_logged_status status_arg)
  {
    status|= status_arg;
  }

  /**
    This function is called every time anything is being written into the
    cache_log. To support renaming the binlog cache to a binlog file, the
    cache_log should be initialized with reserved space.

    @param write_length  Number of bytes about to be written.

    @return The result of init_file_reserved_bytes() when space has to be
            reserved now, false otherwise.
  */
  bool write_prepare(size_t write_length)
  {
    /*
      Data will exceed the buffer size in this write, and nothing has been
      written to the file yet (pos_in_file is still 0).
    */
    if (unlikely(cache_log.write_pos + write_length > cache_log.write_end &&
                 cache_log.pos_in_file == 0))
    {
      /* Only session's binlog cache need to reserve space. */
      if (cache_log.dir == binlog_cache_dir && !encrypt_tmp_files)
        return init_file_reserved_bytes();
    }
    return false;
  }

  /**
    For a session's binlog cache, this function has to be called before
    reading the cache file: it switches the cache to READ_CACHE mode
    positioned just after the reserved space.

    @return The result of reinit_io_cache().
  */
  bool init_for_read()
  {
    return reinit_io_cache(&cache_log, READ_CACHE, m_file_reserved_bytes, 0, 0);
  }

  /**
    For a session's binlog cache, this function has to be called to get the
    actual data length, i.e. excluding the reserved space. The cache must be
    in READ_CACHE mode (asserted in debug builds).
  */
  my_off_t length_for_read() const
  {
    DBUG_ASSERT(cache_log.type == READ_CACHE);
    return cache_log.end_of_file - m_file_reserved_bytes;
  }

  /**
    Return the cache file's actual length, which includes the
    reserved space.
  */
  my_off_t temp_file_length() const
  {
    return my_b_tell(&cache_log);
  }

  /** Return the number of bytes reserved at the beginning of the file. */
  uint32 file_reserved_bytes() const { return m_file_reserved_bytes; }

  /**
    Flush and sync the data of the file into storage.

    @retval true    Error happens
    @retval false   Succeeds
  */
  bool sync_temp_file()
  {
    DBUG_ASSERT(cache_log.file != -1);
    /* The sync is only attempted when the flush succeeded. */
    return my_b_flush_io_cache(&cache_log, 1) ||
           mysql_file_sync(cache_log.file, MYF(0));
  }

  /**
    Return the name of the cache file, as reported by my_filename() for
    cache_log.file.
  */
  const char *temp_file_name() const { return my_filename(cache_log.file); }

  /**
    It is called after renaming the cache file to a binlog file. The file
    now is a binlog file, so detach it from the binlog cache.
  */
  void detach_temp_file();

  /*
    Cache to store data before copying it to the binary log.
  */
  IO_CACHE cache_log;

protected:
  /*
    Binlog position before the start of the current statement.
  */
  my_off_t before_stmt_pos;

private:
  /*
    Pending binrows event. This event is the event where the rows are currently
    written.
  */
  Rows_log_event *m_pending;

  /*
    Bit flags for what has been writing to cache. Used to
    discard logs without any data changes.
    see enum_logged_status;
  */
  uint32 status;

public:
  /*
    The algorithm (if any) used to pre-compute checksums in the cache.
    Initialized from binlog_checksum_options when the cache is reset.
  */
  enum_binlog_checksum_alg checksum_opt;

private:
  /*
    This indicates that some events did not get into the cache and most likely
    it is corrupted.
  */
  bool incident;

  /* Whether the caller requested precomputing checksums. */
  bool precompute_checksums;

  /**
    This function computes binlog cache and disk usage.

    The cache-use counter is always incremented. The disk-use counter is
    only touched when the cache did some disk writes, after which the
    cache's disk write counter is cleared.
  */
  void compute_statistics()
  {
    statistic_increment(*ptr_binlog_cache_use, &LOCK_status);
    if (cache_log.disk_writes != 0)
    {
#ifdef REAL_STATISTICS
      statistic_add(*ptr_binlog_cache_disk_use,
                    cache_log.disk_writes, &LOCK_status);
#else
      statistic_increment(*ptr_binlog_cache_disk_use, &LOCK_status);
#endif
      cache_log.disk_writes= 0;
    }
  }

  /*
    Stores the values of maximum size of the cache allowed when this cache
    is configured. This corresponds to either
      . max_binlog_cache_size or max_binlog_stmt_cache_size.
  */
  my_off_t saved_max_binlog_cache_size;

  /*
    Stores a pointer to the status variable that keeps track of the in-memory
    cache usage. This corresponds to either
      . binlog_cache_use or binlog_stmt_cache_use.
  */
  ulong *ptr_binlog_cache_use;

  /*
    Stores a pointer to the status variable that keeps track of the disk
    cache usage. This corresponds to either
      . binlog_cache_disk_use or binlog_stmt_cache_disk_use.
  */
  ulong *ptr_binlog_cache_disk_use;

  /*
    Stores the bytes reserved at the beginning of the cache file. It could be
    0 for cases where reserved space is not supported. see write_prepare().
  */
  uint32 m_file_reserved_bytes {0};

  /*
    It truncates the cache to a certain position. This includes deleting the
    pending event.

    pos is relative to the end of the reserved space; m_file_reserved_bytes
    is added before the cache is repositioned.
  */
  void truncate(my_off_t pos, bool reset_cache=0)
  {
    /* Print the full 64-bit position; ulong may be only 32 bits wide. */
    DBUG_PRINT("info", ("truncating to position %llu",
                        (unsigned long long) pos));
    cache_log.error=0;
    if (pending())
    {
      delete pending();
      set_pending(0);
    }
    my_bool res __attribute__((unused))= reinit_io_cache(
        &cache_log, WRITE_CACHE, pos + m_file_reserved_bytes, 0, reset_cache);
    DBUG_ASSERT(res == 0);
    /* Re-apply the configured size limit after repositioning the cache. */
    cache_log.end_of_file= saved_max_binlog_cache_size;
  }

  /**
    Reserve required space at the beginning of the temporary file. It will
    create the temporary file if it doesn't exist.
  */
  bool init_file_reserved_bytes();

  /* Objects of this class are not copyable. */
  binlog_cache_data& operator=(const binlog_cache_data& info) = delete;
  binlog_cache_data(const binlog_cache_data& info) = delete;
};