MDEV-33894: Resurrect innodb_log_write_ahead_size

As part of commit 685d958e38 (MDEV-14425)
the parameter innodb_log_write_ahead_size was removed, because it was
thought that determining the physical block size would be a sufficient
replacement.

However, we can only determine the physical block size on Linux or
Microsoft Windows. On some file systems, the physical block size
is not relevant. For example, XFS uses a block size of 4096 bytes
even if the underlying block size may be smaller.

On Linux, we failed to determine the physical block size unless
innodb_log_file_buffering=OFF was both requested and possible.
This commit fixes that.

log_sys.write_size: The value of the reintroduced parameter
innodb_log_write_ahead_size. To keep it simple, this is read-only
and a power of two between 512 and 4096 bytes, so that the previous
alignment guarantees are fulfilled. This will replace the previous
log_sys.get_block_size().

log_sys.block_size, log_t::get_block_size(): Remove.

log_t::set_block_size(): Ensure that write_size will not be less
than the physical block size. There is no point in invoking this
function with a size of 512 or less, because 512 is the minimum
value of write_size.

innodb_params_adjust(): Add some disabled code for adjusting
the minimum value and default value of innodb_log_write_ahead_size
to reflect the log_sys.write_size.

log_t::set_recovered(): Mark the recovery completed. This is the
place to adjust some things if we want to allow write_size>4096.

log_t::resize_write_buf(): Refer to write_size.

log_t::resize_start(): Refer to write_size instead of get_block_size().

log_write_buf(): Simplify some arithmetic and remove a goto.

log_t::write_buf(): Refer to write_size. If we are writing less than
that, do not switch buffers, but keep writing to the same buffer.
Move some code to improve the locality of reference.

recv_scan_log(): Refer to write_size instead of get_block_size().

os_file_create_func(): For type==OS_LOG_FILE on Linux, always invoke
os_file_log_maybe_unbuffered(), so that log_sys.set_block_size() will
be invoked even if we are not attempting to use O_DIRECT.

recv_sys_t::find_checkpoint(): Read the entire log header
in a single 12 KiB request into log_sys.buf.

Tested with:
./mtr --loose-innodb-log-write-ahead-size=4096
./mtr --loose-innodb-log-write-ahead-size=2048
This commit is contained in:
Marko Mäkelä 2024-06-27 16:38:08 +03:00
parent 27a3366663
commit 4ca355d863
13 changed files with 232 additions and 141 deletions

View file

@ -1333,7 +1333,7 @@ enum options_xtrabackup
OPT_INNODB_LOG_FILE_BUFFERING,
#endif
OPT_INNODB_LOG_FILE_SIZE,
OPT_INNODB_LOG_FILES_IN_GROUP,
OPT_INNODB_LOG_WRITE_AHEAD_SIZE,
OPT_INNODB_OPEN_FILES,
OPT_XTRA_DEBUG_SYNC,
OPT_INNODB_CHECKSUM_ALGORITHM,
@ -1905,6 +1905,10 @@ struct my_option xb_server_options[] =
{"innodb_log_group_home_dir", OPT_INNODB_LOG_GROUP_HOME_DIR,
"Path to InnoDB log files.", &srv_log_group_home_dir,
&srv_log_group_home_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
{"innodb_log_write_ahead_size", OPT_INNODB_LOG_WRITE_AHEAD_SIZE,
"ib_logfile0 write size",
(G_PTR*) &log_sys.write_size, (G_PTR*) &srv_log_file_size, 0,
GET_UINT, REQUIRED_ARG, 512, 512, 4096, 0, 1, 0},
{"innodb_max_dirty_pages_pct", OPT_INNODB_MAX_DIRTY_PAGES_PCT,
"Percentage of dirty pages allowed in bufferpool.",
(G_PTR*) &srv_max_buf_pool_modified_pct,
@ -2233,7 +2237,6 @@ xb_get_one_option(const struct my_option *opt,
ADD_PRINT_PARAM_OPT(srv_log_group_home_dir);
break;
case OPT_INNODB_LOG_FILES_IN_GROUP:
case OPT_INNODB_LOG_FILE_SIZE:
break;
@ -2374,6 +2377,11 @@ xb_get_one_option(const struct my_option *opt,
static bool innodb_init_param()
{
if (!ut_is_2pow(log_sys.write_size)) {
msg("InnoDB: innodb_log_write_ahead_size=%u"
" is not a power of two", log_sys.write_size);
return true;
}
srv_is_being_started = TRUE;
/* === some variables from mysqld === */
memset((G_PTR) &mysql_tmpdir_list, 0, sizeof(mysql_tmpdir_list));
@ -3370,7 +3378,7 @@ static bool xtrabackup_copy_logfile()
ut_a(dst_log_file);
ut_ad(recv_sys.is_initialised());
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.get_block_size() - 1};
const size_t block_size_1{log_sys.write_size - 1};
ut_ad(!log_sys.is_pmem());
@ -3445,7 +3453,7 @@ static bool xtrabackup_copy_logfile()
if (r == recv_sys_t::GOT_EOF)
break;
if (recv_sys.offset < log_sys.get_block_size())
if (recv_sys.offset < log_sys.write_size)
break;
if (xtrabackup_throttle && io_ticket-- < 0)

View file

@ -287,7 +287,20 @@ WHERE engine='innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
1
1
# restart
# restart: --innodb-log-write-ahead-size=513
SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
# restart: --innodb-log-write-ahead-size=4095
SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
# restart: --innodb-log-write-ahead-size=10000
SELECT @@innodb_log_write_ahead_size;
@@innodb_log_write_ahead_size
4096
# Cleanup
bak_ib_logfile0
bak_ibdata1

View file

@ -210,8 +210,20 @@ eval $check_no_innodb;
eval $check_yes_innodb;
--source include/shutdown_mysqld.inc
--let $restart_parameters=
--let $restart_parameters=--innodb-log-write-ahead-size=513
--source include/start_mysqld.inc
eval $check_no_innodb;
--source include/shutdown_mysqld.inc
--let $restart_parameters=--innodb-log-write-ahead-size=4095
--source include/start_mysqld.inc
eval $check_no_innodb;
--source include/shutdown_mysqld.inc
# this will be silently truncated to the maximum
--let $restart_parameters=--innodb-log-write-ahead-size=10000
--source include/start_mysqld.inc
SELECT @@innodb_log_write_ahead_size;
--echo # Cleanup
--list_files $bugdir

View file

@ -7,7 +7,9 @@ let $targetdir=$MYSQLTEST_VARDIR/tmp/backup;
--let $backup_log=$MYSQLTEST_VARDIR/tmp/backup.log
--disable_result_log
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --parallel=10 > $backup_log 2>&1;
--error 1
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --parallel=10 --innodb-log-write-ahead-size=4095 > $backup_log 2>&1;
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --parallel=10 --innodb-log-write-ahead-size=10000 > $backup_log 2>&1;
--enable_result_log
# The following warning must not appear after MDEV-27343 fix

View file

@ -1039,6 +1039,18 @@ NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_LOG_WRITE_AHEAD_SIZE
SESSION_VALUE NULL
DEFAULT_VALUE 512
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Redo log write size to avoid read-on-write; must be a power of two
NUMERIC_MIN_VALUE 512
NUMERIC_MAX_VALUE 4096
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_LRU_FLUSH_SIZE
SESSION_VALUE NULL
DEFAULT_VALUE 32

View file

@ -5355,7 +5355,6 @@ static int init_server_components()
MARIADB_REMOVED_OPTION("innodb-log-compressed-pages"),
MARIADB_REMOVED_OPTION("innodb-log-files-in-group"),
MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"),
MARIADB_REMOVED_OPTION("innodb-log-write-ahead-size"),
MARIADB_REMOVED_OPTION("innodb-page-cleaners"),
MARIADB_REMOVED_OPTION("innodb-replication-delay"),
MARIADB_REMOVED_OPTION("innodb-scrub-log"),

View file

@ -1796,15 +1796,18 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
log_write_and_flush_prepare();
resizing= resize_lsn.load(std::memory_order_relaxed);
/* FIXME: issue an asynchronous write */
log.write(offset, {c, get_block_size()});
ut_ad(ut_is_2pow(write_size));
ut_ad(write_size >= 512);
ut_ad(write_size <= 4096);
log.write(offset, {c, write_size});
if (resizing > 1 && resizing <= next_checkpoint_lsn)
{
resize_log.write(CHECKPOINT_1, {c, write_size});
byte *buf= static_cast<byte*>(aligned_malloc(4096, 4096));
memset_aligned<4096>(buf, 0, 4096);
header_write(buf, resizing, is_encrypted());
resize_log.write(0, {buf, 4096});
aligned_free(buf);
resize_log.write(CHECKPOINT_1, {c, get_block_size()});
}
if (srv_file_flush_method != SRV_O_DSYNC)

View file

@ -1214,11 +1214,8 @@ struct
}
log_requests;
/** @brief Adjust some InnoDB startup parameters based on file contents
or innodb_page_size. */
static
void
innodb_params_adjust();
/** Adjust some InnoDB startup parameters based on the data directory */
static void innodb_params_adjust();
/*******************************************************************//**
This function is used to prepare an X/Open XA distributed transaction.
@ -3688,6 +3685,11 @@ static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
2ULL << 20,
LLONG_MAX, 1024*1024L);
static MYSQL_SYSVAR_UINT(log_write_ahead_size, log_sys.write_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log write size to avoid read-on-write; must be a power of two",
nullptr, nullptr, 512, 512, 4096, 1);
/****************************************************************//**
Gives the file extension of an InnoDB single-table tablespace. */
static const char* ha_innobase_exts[] = {
@ -3809,6 +3811,13 @@ static int innodb_init_params()
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
if (!ut_is_2pow(log_sys.write_size)) {
sql_print_error("InnoDB: innodb_log_write_ahead_size=%u"
" is not a power of two",
log_sys.write_size);
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG))
DBUG_RETURN(HA_ERR_INITIALIZATION);
@ -19850,6 +19859,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_file_buffering),
#endif
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_write_ahead_size),
MYSQL_SYSVAR(log_spin_wait_delay),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
@ -20010,20 +20020,32 @@ i_s_innodb_sys_virtual,
i_s_innodb_tablespaces_encryption
maria_declare_plugin_end;
/** @brief Adjust some InnoDB startup parameters based on file contents
or innodb_page_size. */
static
void
innodb_params_adjust()
/** Adjust some InnoDB startup parameters based on the data directory */
static void innodb_params_adjust()
{
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val
= 1ULL << (32U + srv_page_size_shift);
MYSQL_SYSVAR_NAME(max_undo_log_size).min_val
= MYSQL_SYSVAR_NAME(max_undo_log_size).def_val
= ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
<< srv_page_size_shift;
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val
= 1ULL << (32U + srv_page_size_shift);
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val=
1ULL << (32U + srv_page_size_shift);
MYSQL_SYSVAR_NAME(max_undo_log_size).min_val=
MYSQL_SYSVAR_NAME(max_undo_log_size).def_val=
ulonglong{SRV_UNDO_TABLESPACE_SIZE_IN_PAGES} << srv_page_size_shift;
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val=
1ULL << (32U + srv_page_size_shift);
#if 0 /* FIXME: INFORMATION_SCHEMA.SYSTEM_VARIABLES won't reflect this. */
/* plugin_opt_set_limits() would have copied all MYSQL_SYSVAR
before innodb_init() was invoked. Therefore, changing the
min_val, def_val, max_val will have no observable effect. */
# if defined __linux__ || defined _WIN32
uint &min_val= MYSQL_SYSVAR_NAME(log_write_ahead_size).min_val;
if (min_val < log_sys.write_size)
{
min_val= log_sys.write_size;
MYSQL_SYSVAR_NAME(log_write_ahead_size).def_val= log_sys.write_size;
}
# endif
ut_ad(MYSQL_SYSVAR_NAME(log_write_ahead_size).min_val <=
log_sys.write_size);
#endif
ut_ad(MYSQL_SYSVAR_NAME(log_write_ahead_size).max_val == 4096);
}
/****************************************************************************

View file

@ -274,11 +274,9 @@ private:
std::atomic<lsn_t> resize_lsn;
/** the log sequence number at the start of the log file */
lsn_t first_lsn;
#if defined __linux__ || defined _WIN32
/** The physical block size of the storage */
uint32_t block_size;
#endif
public:
/** current innodb_log_write_ahead_size */
uint write_size;
/** format of the redo log: e.g., FORMAT_10_8 */
uint32_t format;
#if defined __linux__ || defined _WIN32
@ -328,6 +326,8 @@ public:
max_buf_free;
}
inline void set_recovered() noexcept;
void set_buf_free(size_t f) noexcept
{ ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); }
@ -368,9 +368,12 @@ public:
inline void resize_write(lsn_t lsn, const byte *end,
size_t len, size_t seq) noexcept;
private:
/** Write resize_buf to resize_log.
@param length the used length of resize_buf */
ATTRIBUTE_COLD void resize_write_buf(size_t length) noexcept;
ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
void resize_write_buf(size_t length) noexcept;
public:
/** Rename a log file after resizing.
@return whether an error occurred */
@ -467,14 +470,12 @@ public:
void close();
#if defined __linux__ || defined _WIN32
/** @return the physical block size of the storage */
size_t get_block_size() const noexcept
{ ut_ad(block_size); return block_size; }
/** Set the log block size for file I/O. */
void set_block_size(uint32_t size) noexcept { block_size= size; }
#else
/** @return the physical block size of the storage */
static size_t get_block_size() { return 512; }
void set_block_size(uint32 size) noexcept
{
if (write_size < size)
write_size= size;
}
#endif
private:

View file

@ -236,9 +236,6 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
mprotect(ptr, size_t(size), PROT_READ);
buf= static_cast<byte*>(ptr);
max_buf_free= 1;
# if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
# endif
log_maybe_unbuffered= true;
log_buffered= false;
mtr_t::finisher_update();
@ -273,13 +270,16 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
log_buffered
? "Buffered log writes"
: "File system buffers for log disabled",
block_size);
write_size);
#endif
mtr_t::finisher_update();
#ifdef HAVE_PMEM
checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size));
memset_aligned<64>(checkpoint_buf, 0, block_size);
ut_ad(ut_is_2pow(write_size));
ut_ad(write_size >= 512);
ut_ad(write_size <= 4096);
checkpoint_buf= static_cast<byte*>(aligned_malloc(write_size, write_size));
memset_aligned<512>(checkpoint_buf, 0, write_size);
return true;
#endif
}
@ -430,7 +430,7 @@ void log_t::set_buffered(bool buffered)
log_buffered
? "Buffered log writes"
: "File system buffers for log disabled",
block_size);
write_size);
}
log_resize_release();
}
@ -467,6 +467,8 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
if (success)
{
ut_ad(!(size_t(file_size) & (write_size - 1)));
ut_ad(!(size_t(size) & (write_size - 1)));
log_resize_release();
void *ptr= nullptr, *ptr2= nullptr;
@ -522,7 +524,7 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
{
memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15);
start_lsn= first_lsn +
(~lsn_t{get_block_size() - 1} & (write_lsn - first_lsn));
(~lsn_t{write_size - 1} & (write_lsn - first_lsn));
}
}
resize_lsn.store(start_lsn, std::memory_order_relaxed);
@ -578,32 +580,30 @@ void log_t::resize_abort() noexcept
/** Write an aligned buffer to ib_logfile0.
@param buf buffer to be written
@param len length of data to be written
@param length length of data to be written
@param offset log file offset */
static void log_write_buf(const byte *buf, size_t len, lsn_t offset)
static void log_write_buf(const byte *buf, size_t length, lsn_t offset)
{
ut_ad(write_lock.is_owner());
ut_ad(!recv_no_log_write);
ut_d(const size_t block_size_1= log_sys.get_block_size() - 1);
ut_d(const size_t block_size_1= log_sys.write_size - 1);
ut_ad(!(offset & block_size_1));
ut_ad(!(len & block_size_1));
ut_ad(!(length & block_size_1));
ut_ad(!(size_t(buf) & block_size_1));
ut_ad(len);
ut_ad(length);
if (UNIV_LIKELY(offset + len <= log_sys.file_size))
const lsn_t maximum_write_length{log_sys.file_size - offset};
ut_ad(maximum_write_length <= log_sys.file_size - log_sys.START_OFFSET);
if (UNIV_UNLIKELY(length > maximum_write_length))
{
write:
log_sys.log.write(offset, {buf, len});
return;
log_sys.log.write(offset, {buf, size_t(maximum_write_length)});
length-= size_t(maximum_write_length);
buf+= size_t(maximum_write_length);
ut_ad(log_sys.START_OFFSET + length < offset);
offset= log_sys.START_OFFSET;
}
const size_t write_len= size_t(log_sys.file_size - offset);
log_sys.log.write(offset, {buf, write_len});
len-= write_len;
buf+= write_len;
ut_ad(log_sys.START_OFFSET + len < offset);
offset= log_sys.START_OFFSET;
goto write;
log_sys.log.write(offset, {buf, length});
}
/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
@ -778,11 +778,12 @@ inline void log_t::persist(lsn_t lsn) noexcept
}
#endif
ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
/** Write resize_buf to resize_log.
@param length the used length of resize_buf */
ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
void log_t::resize_write_buf(size_t length) noexcept
{
const size_t block_size_1= get_block_size() - 1;
const size_t block_size_1= write_size - 1;
ut_ad(!(resize_target & block_size_1));
ut_ad(!(length & block_size_1));
ut_ad(length > block_size_1);
@ -802,7 +803,7 @@ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
}
ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file,
resize_flush_buf, offset, length) == DB_SUCCESS);
buf, offset, length) == DB_SUCCESS);
}
/** Write buf to ib_logfile0.
@ -824,64 +825,88 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
}
else
{
ut_ad(write_lock.is_owner());
ut_ad(!recv_no_log_write);
write_lock.set_pending(lsn);
ut_ad(write_lsn >= get_flushed_lsn());
const size_t block_size_1{get_block_size() - 1};
lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}};
const size_t write_size_1{write_size - 1};
ut_ad(ut_is_2pow(write_size));
size_t length{buf_free.load(std::memory_order_relaxed)};
lsn_t offset{calc_lsn_offset(write_lsn)};
ut_ad(length >= (offset & write_size_1));
ut_ad(write_size_1 >= 511);
DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF,
write_lsn, lsn, offset));
const byte *write_buf{buf};
size_t length{buf_free};
ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1));
const size_t new_buf_free{length & block_size_1};
buf_free= new_buf_free;
ut_ad(new_buf_free == ((lsn - first_lsn) & block_size_1));
const byte *const write_buf{buf};
offset&= ~lsn_t{write_size_1};
if (new_buf_free)
if (length <= write_size_1)
{
ut_ad(!((length ^ (size_t(lsn) - size_t(first_lsn))) & write_size_1));
/* Keep filling the same buffer until we have more than one block. */
#if 0 /* TODO: Pad the last log block with dummy records. */
buf_free= log_pad(lsn, get_block_size() - new_buf_free,
buf + new_buf_free, flush_buf);
buf_free= log_pad(lsn, (write_size_1 + 1) - length,
buf + length, flush_buf);
... /* TODO: Update the LSN and adjust other code. */
#else
/* The rest of the block will be written as garbage.
(We want to avoid memset() while holding exclusive log_sys.latch)
This block will be overwritten later, once records beyond
the current LSN are generated. */
# ifdef HAVE_valgrind
MEM_MAKE_DEFINED(buf + length, get_block_size() - new_buf_free);
if (UNIV_LIKELY_NULL(resize_flush_buf))
MEM_MAKE_DEFINED(resize_buf + length, get_block_size() - new_buf_free);
MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - length);
if (UNIV_LIKELY_NULL(resize_buf))
MEM_MAKE_DEFINED(resize_buf + length, (write_size_1 + 1) - length);
# endif
buf[length]= 0; /* allow recovery to catch EOF faster */
length&= ~block_size_1;
memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
if (UNIV_LIKELY_NULL(resize_flush_buf))
memcpy_aligned<16>(resize_flush_buf, resize_buf + length,
(new_buf_free + 15) & ~15);
length+= get_block_size();
#endif
length= write_size_1 + 1;
}
else
{
const size_t new_buf_free{length & write_size_1};
ut_ad(new_buf_free == ((lsn - first_lsn) & write_size_1));
buf_free.store(new_buf_free, std::memory_order_relaxed);
if (new_buf_free)
{
/* The rest of the block will be written as garbage.
(We want to avoid memset() while holding exclusive log_sys.latch)
This block will be overwritten later, once records beyond
the current LSN are generated. */
#ifdef HAVE_valgrind
MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - new_buf_free);
if (UNIV_LIKELY_NULL(resize_buf))
MEM_MAKE_DEFINED(resize_buf + length, (write_size_1 + 1) -
new_buf_free);
#endif
buf[length]= 0; /* allow recovery to catch EOF faster */
length&= ~write_size_1;
memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
if (UNIV_LIKELY_NULL(resize_buf))
memcpy_aligned<16>(resize_flush_buf, resize_buf + length,
(new_buf_free + 15) & ~15);
length+= write_size_1 + 1;
}
std::swap(buf, flush_buf);
std::swap(resize_buf, resize_flush_buf);
}
std::swap(buf, flush_buf);
std::swap(resize_buf, resize_flush_buf);
write_to_log++;
if (release_latch)
latch.wr_unlock();
DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF,
write_lsn, lsn, offset));
/* Do the write to the log file */
log_write_buf(write_buf, length, offset);
if (UNIV_LIKELY_NULL(resize_buf))
resize_write_buf(length);
write_lsn= lsn;
if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
{
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"InnoDB log write: " LSN_PF, write_lsn);
}
/* Do the write to the log file */
log_write_buf(write_buf, length, offset);
if (UNIV_LIKELY_NULL(resize_buf))
resize_write_buf(length);
write_lsn= lsn;
}
set_check_for_checkpoint(false);

View file

@ -1773,7 +1773,7 @@ dberr_t recv_sys_t::find_checkpoint()
lsn= 0;
buf= my_assume_aligned<4096>(log_sys.buf);
if (!log_sys.is_pmem())
if (dberr_t err= log_sys.log.read(0, {buf, 4096}))
if (dberr_t err= log_sys.log.read(0, {buf, log_sys.START_OFFSET}))
return err;
/* Check the header page checksum. There was no
checksum in the first redo log format (version 0). */
@ -1842,12 +1842,7 @@ dberr_t recv_sys_t::find_checkpoint()
for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2;
field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1)
{
if (log_sys.is_pmem())
buf= log_sys.buf + field;
else
if (dberr_t err= log_sys.log.read(field,
{buf, log_sys.get_block_size()}))
return err;
buf= log_sys.buf + field;
const lsn_t checkpoint_lsn{mach_read_from_8(buf)};
const lsn_t end_lsn{mach_read_from_8(buf + 8)};
if (checkpoint_lsn < first_lsn || end_lsn < checkpoint_lsn ||
@ -4019,7 +4014,7 @@ static bool recv_scan_log(bool last_phase)
DBUG_ENTER("recv_scan_log");
ut_ad(log_sys.is_latest());
const size_t block_size_1{log_sys.get_block_size() - 1};
const size_t block_size_1{log_sys.write_size - 1};
mysql_mutex_lock(&recv_sys.mutex);
if (!last_phase)
@ -4201,7 +4196,7 @@ static bool recv_scan_log(bool last_phase)
if (recv_sys.is_corrupt_log())
break;
if (recv_sys.offset < log_sys.get_block_size() &&
if (recv_sys.offset < log_sys.write_size &&
recv_sys.lsn == recv_sys.scanned_lsn)
goto got_eof;
@ -4537,6 +4532,24 @@ dberr_t recv_recovery_read_checkpoint()
return err;
}
/** Mark the recovery completed: prepare the log buffer for new writes,
starting at the position where recovery stopped (recv_sys.offset).
Debug builds assert that the flushed LSN, the current LSN and
recv_sys.lsn all agree when this is called. */
inline void log_t::set_recovered() noexcept
{
ut_ad(get_flushed_lsn() == get_lsn());
ut_ad(recv_sys.lsn == get_lsn());
size_t offset{recv_sys.offset};
if (!is_pmem())
{
/* Move the write_size-aligned block that contains recv_sys.offset
to the start of buf, and keep only the offset within that block.
memmove is used because the source and destination are both in buf. */
const size_t bs{log_sys.write_size}, bs_1{bs - 1};
memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs);
offset&= bs_1;
}
#ifdef HAVE_PMEM
else
/* Make the whole file_size bytes starting at buf readable and writable.
NOTE(review): presumably this undoes the PROT_READ protection applied
in log_t::attach_low() -- confirm. */
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
#endif
/* New log records will be appended at this position in buf. */
set_buf_free(offset);
}
/** Start recovering from a redo log checkpoint.
of first system tablespace page
@return error code or DB_SUCCESS */
@ -4710,22 +4723,7 @@ err_exit:
}
if (!srv_read_only_mode && log_sys.is_latest()) {
ut_ad(log_sys.get_flushed_lsn() == log_sys.get_lsn());
ut_ad(recv_sys.lsn == log_sys.get_lsn());
if (!log_sys.is_pmem()) {
const size_t bs_1{log_sys.get_block_size() - 1};
const size_t ro{recv_sys.offset};
recv_sys.offset &= bs_1;
memmove_aligned<64>(log_sys.buf,
log_sys.buf + (ro & ~bs_1),
log_sys.get_block_size());
#ifdef HAVE_PMEM
} else {
mprotect(log_sys.buf, size_t(log_sys.file_size),
PROT_READ | PROT_WRITE);
#endif
}
log_sys.set_buf_free(recv_sys.offset);
log_sys.set_recovered();
if (recv_needed_recovery
&& srv_operation <= SRV_OPERATION_EXPORT_RESTORED
&& recv_sys.lsn - log_sys.next_checkpoint_lsn

View file

@ -1094,7 +1094,6 @@ static ATTRIBUTE_COLD void os_file_log_buffered()
{
log_sys.log_maybe_unbuffered= false;
log_sys.log_buffered= true;
log_sys.set_block_size(512);
}
# endif
@ -1209,11 +1208,7 @@ os_file_create_func(
break;
}
# ifdef __linux__
} else if (type != OS_LOG_FILE) {
} else if (log_sys.log_buffered) {
skip_o_direct:
os_file_log_buffered();
} else if (create_mode != OS_FILE_CREATE
} else if (type == OS_LOG_FILE && create_mode != OS_FILE_CREATE
&& create_mode != OS_FILE_CREATE_SILENT
&& !log_sys.is_opened()) {
if (stat(name, &st)) {
@ -1225,15 +1220,16 @@ os_file_create_func(
"InnoDB: File %s was not found", name);
goto not_found;
}
log_sys.set_block_size(512);
goto skip_o_direct;
} else if (!os_file_log_maybe_unbuffered(st)
|| log_sys.log_buffered) {
skip_o_direct:
os_file_log_buffered();
} else {
direct_flag = O_DIRECT;
log_sys.log_maybe_unbuffered = true;
}
if (!os_file_log_maybe_unbuffered(st)) {
goto skip_o_direct;
}
direct_flag = O_DIRECT;
log_sys.log_maybe_unbuffered= true;
# endif
}
#else

View file

@ -175,7 +175,7 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn)
/* We will retain ib_logfile0 until we have written a new logically
empty log as ib_logfile101 and atomically renamed it to
ib_logfile0 in log_t::rename_resized(). */
ib_logfile0 in log_t::resize_rename(). */
delete_log_files();
ut_ad(!os_aio_pending_reads());