MDEV-33894: Resurrect innodb_log_write_ahead_size

As part of commit 685d958e38 (MDEV-14425)
the parameter innodb_log_write_ahead_size was removed, because it was
thought that determining the physical block size would be a sufficient
replacement.

However, we can only determine the physical block size on Linux or
Microsoft Windows. On some file systems, the physical block size
is not relevant. For example, XFS uses a block size of 4096 bytes
even if the underlying block size may be smaller.

On Linux, we failed to determine the physical block size if
innodb_log_file_buffering=OFF was not requested or possible.
This will be fixed.

log_sys.write_size: The value of the reintroduced parameter
innodb_log_write_ahead_size. To keep it simple, this is read-only
and a power of two between 512 and 4096 bytes, so that the previous
alignment guarantees are fulfilled. This will replace the previous
log_sys.get_block_size().

log_sys.block_size, log_t::get_block_size(): Remove.

log_t::set_block_size(): Ensure that write_size will not be less
than the physical block size. There is no point in invoking this
function with 512 or less, because that is the minimum value of
write_size.

innodb_params_adjust(): Add some disabled code for adjusting
the minimum value and default value of innodb_log_write_ahead_size
to reflect the log_sys.write_size.

log_t::set_recovered(): Mark the recovery completed. This is the
place to adjust some things if we want to allow write_size>4096.

log_t::resize_write_buf(): Refer to write_size.

log_t::resize_start(): Refer to write_size instead of get_block_size().

log_write_buf(): Simplify some arithmetic and remove a goto.

log_t::write_buf(): Refer to write_size. If we are writing less than
that, do not switch buffers, but keep writing to the same buffer.
Move some code to improve the locality of reference.

recv_scan_log(): Refer to write_size instead of get_block_size().

os_file_create_func(): For type==OS_LOG_FILE on Linux, always invoke
os_file_log_maybe_unbuffered(), so that log_sys.set_block_size() will
be invoked even if we are not attempting to use O_DIRECT.

recv_sys_t::find_checkpoint(): Read the entire log header
in a single 12 KiB request into log_sys.buf.

Tested with:
./mtr --loose-innodb-log-write-ahead-size=4096
./mtr --loose-innodb-log-write-ahead-size=2048
This commit is contained in:
Marko Mäkelä 2024-06-27 16:38:08 +03:00
parent 27a3366663
commit 4ca355d863
13 changed files with 232 additions and 141 deletions

View file

@ -1333,7 +1333,7 @@ enum options_xtrabackup
OPT_INNODB_LOG_FILE_BUFFERING, OPT_INNODB_LOG_FILE_BUFFERING,
#endif #endif
OPT_INNODB_LOG_FILE_SIZE, OPT_INNODB_LOG_FILE_SIZE,
OPT_INNODB_LOG_FILES_IN_GROUP, OPT_INNODB_LOG_WRITE_AHEAD_SIZE,
OPT_INNODB_OPEN_FILES, OPT_INNODB_OPEN_FILES,
OPT_XTRA_DEBUG_SYNC, OPT_XTRA_DEBUG_SYNC,
OPT_INNODB_CHECKSUM_ALGORITHM, OPT_INNODB_CHECKSUM_ALGORITHM,
@ -1905,6 +1905,10 @@ struct my_option xb_server_options[] =
{"innodb_log_group_home_dir", OPT_INNODB_LOG_GROUP_HOME_DIR, {"innodb_log_group_home_dir", OPT_INNODB_LOG_GROUP_HOME_DIR,
"Path to InnoDB log files.", &srv_log_group_home_dir, "Path to InnoDB log files.", &srv_log_group_home_dir,
&srv_log_group_home_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, &srv_log_group_home_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
{"innodb_log_write_ahead_size", OPT_INNODB_LOG_WRITE_AHEAD_SIZE,
"ib_logfile0 write size",
(G_PTR*) &log_sys.write_size, (G_PTR*) &srv_log_file_size, 0,
GET_UINT, REQUIRED_ARG, 512, 512, 4096, 0, 1, 0},
{"innodb_max_dirty_pages_pct", OPT_INNODB_MAX_DIRTY_PAGES_PCT, {"innodb_max_dirty_pages_pct", OPT_INNODB_MAX_DIRTY_PAGES_PCT,
"Percentage of dirty pages allowed in bufferpool.", "Percentage of dirty pages allowed in bufferpool.",
(G_PTR*) &srv_max_buf_pool_modified_pct, (G_PTR*) &srv_max_buf_pool_modified_pct,
@ -2233,7 +2237,6 @@ xb_get_one_option(const struct my_option *opt,
ADD_PRINT_PARAM_OPT(srv_log_group_home_dir); ADD_PRINT_PARAM_OPT(srv_log_group_home_dir);
break; break;
case OPT_INNODB_LOG_FILES_IN_GROUP:
case OPT_INNODB_LOG_FILE_SIZE: case OPT_INNODB_LOG_FILE_SIZE:
break; break;
@ -2374,6 +2377,11 @@ xb_get_one_option(const struct my_option *opt,
static bool innodb_init_param() static bool innodb_init_param()
{ {
if (!ut_is_2pow(log_sys.write_size)) {
msg("InnoDB: innodb_log_write_ahead_size=%u"
" is not a power of two", log_sys.write_size);
return true;
}
srv_is_being_started = TRUE; srv_is_being_started = TRUE;
/* === some variables from mysqld === */ /* === some variables from mysqld === */
memset((G_PTR) &mysql_tmpdir_list, 0, sizeof(mysql_tmpdir_list)); memset((G_PTR) &mysql_tmpdir_list, 0, sizeof(mysql_tmpdir_list));
@ -3370,7 +3378,7 @@ static bool xtrabackup_copy_logfile()
ut_a(dst_log_file); ut_a(dst_log_file);
ut_ad(recv_sys.is_initialised()); ut_ad(recv_sys.is_initialised());
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U}; const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.get_block_size() - 1}; const size_t block_size_1{log_sys.write_size - 1};
ut_ad(!log_sys.is_pmem()); ut_ad(!log_sys.is_pmem());
@ -3445,7 +3453,7 @@ static bool xtrabackup_copy_logfile()
if (r == recv_sys_t::GOT_EOF) if (r == recv_sys_t::GOT_EOF)
break; break;
if (recv_sys.offset < log_sys.get_block_size()) if (recv_sys.offset < log_sys.write_size)
break; break;
if (xtrabackup_throttle && io_ticket-- < 0) if (xtrabackup_throttle && io_ticket-- < 0)

View file

@ -287,7 +287,20 @@ WHERE engine='innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED'); AND support IN ('YES', 'DEFAULT', 'ENABLED');
1 1
1 1
# restart # restart: --innodb-log-write-ahead-size=513
SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
# restart: --innodb-log-write-ahead-size=4095
SELECT * FROM INFORMATION_SCHEMA.ENGINES
WHERE engine = 'innodb'
AND support IN ('YES', 'DEFAULT', 'ENABLED');
ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS
# restart: --innodb-log-write-ahead-size=10000
SELECT @@innodb_log_write_ahead_size;
@@innodb_log_write_ahead_size
4096
# Cleanup # Cleanup
bak_ib_logfile0 bak_ib_logfile0
bak_ibdata1 bak_ibdata1

View file

@ -210,8 +210,20 @@ eval $check_no_innodb;
eval $check_yes_innodb; eval $check_yes_innodb;
--source include/shutdown_mysqld.inc --source include/shutdown_mysqld.inc
--let $restart_parameters= --let $restart_parameters=--innodb-log-write-ahead-size=513
--source include/start_mysqld.inc --source include/start_mysqld.inc
eval $check_no_innodb;
--source include/shutdown_mysqld.inc
--let $restart_parameters=--innodb-log-write-ahead-size=4095
--source include/start_mysqld.inc
eval $check_no_innodb;
--source include/shutdown_mysqld.inc
# this will be silently truncated to the maximum
--let $restart_parameters=--innodb-log-write-ahead-size=10000
--source include/start_mysqld.inc
SELECT @@innodb_log_write_ahead_size;
--echo # Cleanup --echo # Cleanup
--list_files $bugdir --list_files $bugdir

View file

@ -7,7 +7,9 @@ let $targetdir=$MYSQLTEST_VARDIR/tmp/backup;
--let $backup_log=$MYSQLTEST_VARDIR/tmp/backup.log --let $backup_log=$MYSQLTEST_VARDIR/tmp/backup.log
--disable_result_log --disable_result_log
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --parallel=10 > $backup_log 2>&1; --error 1
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --parallel=10 --innodb-log-write-ahead-size=4095 > $backup_log 2>&1;
exec $XTRABACKUP --defaults-file=$MYSQLTEST_VARDIR/my.cnf --backup --target-dir=$targetdir --parallel=10 --innodb-log-write-ahead-size=10000 > $backup_log 2>&1;
--enable_result_log --enable_result_log
# The following warning must not appear after MDEV-27343 fix # The following warning must not appear after MDEV-27343 fix

View file

@ -1039,6 +1039,18 @@ NUMERIC_BLOCK_SIZE 0
ENUM_VALUE_LIST NULL ENUM_VALUE_LIST NULL
READ_ONLY NO READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME INNODB_LOG_WRITE_AHEAD_SIZE
SESSION_VALUE NULL
DEFAULT_VALUE 512
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Redo log write size to avoid read-on-write; must be a power of two
NUMERIC_MIN_VALUE 512
NUMERIC_MAX_VALUE 4096
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_LRU_FLUSH_SIZE VARIABLE_NAME INNODB_LRU_FLUSH_SIZE
SESSION_VALUE NULL SESSION_VALUE NULL
DEFAULT_VALUE 32 DEFAULT_VALUE 32

View file

@ -5355,7 +5355,6 @@ static int init_server_components()
MARIADB_REMOVED_OPTION("innodb-log-compressed-pages"), MARIADB_REMOVED_OPTION("innodb-log-compressed-pages"),
MARIADB_REMOVED_OPTION("innodb-log-files-in-group"), MARIADB_REMOVED_OPTION("innodb-log-files-in-group"),
MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"), MARIADB_REMOVED_OPTION("innodb-log-optimize-ddl"),
MARIADB_REMOVED_OPTION("innodb-log-write-ahead-size"),
MARIADB_REMOVED_OPTION("innodb-page-cleaners"), MARIADB_REMOVED_OPTION("innodb-page-cleaners"),
MARIADB_REMOVED_OPTION("innodb-replication-delay"), MARIADB_REMOVED_OPTION("innodb-replication-delay"),
MARIADB_REMOVED_OPTION("innodb-scrub-log"), MARIADB_REMOVED_OPTION("innodb-scrub-log"),

View file

@ -1796,15 +1796,18 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept
log_write_and_flush_prepare(); log_write_and_flush_prepare();
resizing= resize_lsn.load(std::memory_order_relaxed); resizing= resize_lsn.load(std::memory_order_relaxed);
/* FIXME: issue an asynchronous write */ /* FIXME: issue an asynchronous write */
log.write(offset, {c, get_block_size()}); ut_ad(ut_is_2pow(write_size));
ut_ad(write_size >= 512);
ut_ad(write_size <= 4096);
log.write(offset, {c, write_size});
if (resizing > 1 && resizing <= next_checkpoint_lsn) if (resizing > 1 && resizing <= next_checkpoint_lsn)
{ {
resize_log.write(CHECKPOINT_1, {c, write_size});
byte *buf= static_cast<byte*>(aligned_malloc(4096, 4096)); byte *buf= static_cast<byte*>(aligned_malloc(4096, 4096));
memset_aligned<4096>(buf, 0, 4096); memset_aligned<4096>(buf, 0, 4096);
header_write(buf, resizing, is_encrypted()); header_write(buf, resizing, is_encrypted());
resize_log.write(0, {buf, 4096}); resize_log.write(0, {buf, 4096});
aligned_free(buf); aligned_free(buf);
resize_log.write(CHECKPOINT_1, {c, get_block_size()});
} }
if (srv_file_flush_method != SRV_O_DSYNC) if (srv_file_flush_method != SRV_O_DSYNC)

View file

@ -1214,11 +1214,8 @@ struct
} }
log_requests; log_requests;
/** @brief Adjust some InnoDB startup parameters based on file contents /** Adjust some InnoDB startup parameters based on the data directory */
or innodb_page_size. */ static void innodb_params_adjust();
static
void
innodb_params_adjust();
/*******************************************************************//** /*******************************************************************//**
This function is used to prepare an X/Open XA distributed transaction. This function is used to prepare an X/Open XA distributed transaction.
@ -3688,6 +3685,11 @@ static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
2ULL << 20, 2ULL << 20,
LLONG_MAX, 1024*1024L); LLONG_MAX, 1024*1024L);
static MYSQL_SYSVAR_UINT(log_write_ahead_size, log_sys.write_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log write size to avoid read-on-write; must be a power of two",
nullptr, nullptr, 512, 512, 4096, 1);
/****************************************************************//** /****************************************************************//**
Gives the file extension of an InnoDB single-table tablespace. */ Gives the file extension of an InnoDB single-table tablespace. */
static const char* ha_innobase_exts[] = { static const char* ha_innobase_exts[] = {
@ -3809,6 +3811,13 @@ static int innodb_init_params()
DBUG_RETURN(HA_ERR_INITIALIZATION); DBUG_RETURN(HA_ERR_INITIALIZATION);
} }
if (!ut_is_2pow(log_sys.write_size)) {
sql_print_error("InnoDB: innodb_log_write_ahead_size=%u"
" is not a power of two",
log_sys.write_size);
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG)) if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG))
DBUG_RETURN(HA_ERR_INITIALIZATION); DBUG_RETURN(HA_ERR_INITIALIZATION);
@ -19850,6 +19859,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_file_buffering), MYSQL_SYSVAR(log_file_buffering),
#endif #endif
MYSQL_SYSVAR(log_file_size), MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_write_ahead_size),
MYSQL_SYSVAR(log_spin_wait_delay), MYSQL_SYSVAR(log_spin_wait_delay),
MYSQL_SYSVAR(log_group_home_dir), MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct), MYSQL_SYSVAR(max_dirty_pages_pct),
@ -20010,20 +20020,32 @@ i_s_innodb_sys_virtual,
i_s_innodb_tablespaces_encryption i_s_innodb_tablespaces_encryption
maria_declare_plugin_end; maria_declare_plugin_end;
/** @brief Adjust some InnoDB startup parameters based on file contents /** Adjust some InnoDB startup parameters based on the data directory */
or innodb_page_size. */ static void innodb_params_adjust()
static
void
innodb_params_adjust()
{ {
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val MYSQL_SYSVAR_NAME(max_undo_log_size).max_val=
= 1ULL << (32U + srv_page_size_shift); 1ULL << (32U + srv_page_size_shift);
MYSQL_SYSVAR_NAME(max_undo_log_size).min_val MYSQL_SYSVAR_NAME(max_undo_log_size).min_val=
= MYSQL_SYSVAR_NAME(max_undo_log_size).def_val MYSQL_SYSVAR_NAME(max_undo_log_size).def_val=
= ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) ulonglong{SRV_UNDO_TABLESPACE_SIZE_IN_PAGES} << srv_page_size_shift;
<< srv_page_size_shift; MYSQL_SYSVAR_NAME(max_undo_log_size).max_val=
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val 1ULL << (32U + srv_page_size_shift);
= 1ULL << (32U + srv_page_size_shift); #if 0 /* FIXME: INFORMATION_SCHEMA.SYSTEM_VARIABLES won't reflect this. */
/* plugin_opt_set_limits() would have copied all MYSQL_SYSVAR
before innodb_init() was invoked. Therefore, changing the
min_val, def_val, max_val will have no observable effect. */
# if defined __linux__ || defined _WIN32
uint &min_val= MYSQL_SYSVAR_NAME(log_write_ahead_size).min_val;
if (min_val < log_sys.write_size)
{
min_val= log_sys.write_size;
MYSQL_SYSVAR_NAME(log_write_ahead_size).def_val= log_sys.write_size;
}
# endif
ut_ad(MYSQL_SYSVAR_NAME(log_write_ahead_size).min_val <=
log_sys.write_size);
#endif
ut_ad(MYSQL_SYSVAR_NAME(log_write_ahead_size).max_val == 4096);
} }
/**************************************************************************** /****************************************************************************

View file

@ -274,11 +274,9 @@ private:
std::atomic<lsn_t> resize_lsn; std::atomic<lsn_t> resize_lsn;
/** the log sequence number at the start of the log file */ /** the log sequence number at the start of the log file */
lsn_t first_lsn; lsn_t first_lsn;
#if defined __linux__ || defined _WIN32
/** The physical block size of the storage */
uint32_t block_size;
#endif
public: public:
/** current innodb_log_write_ahead_size */
uint write_size;
/** format of the redo log: e.g., FORMAT_10_8 */ /** format of the redo log: e.g., FORMAT_10_8 */
uint32_t format; uint32_t format;
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
@ -328,6 +326,8 @@ public:
max_buf_free; max_buf_free;
} }
inline void set_recovered() noexcept;
void set_buf_free(size_t f) noexcept void set_buf_free(size_t f) noexcept
{ ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); }
@ -368,9 +368,12 @@ public:
inline void resize_write(lsn_t lsn, const byte *end, inline void resize_write(lsn_t lsn, const byte *end,
size_t len, size_t seq) noexcept; size_t len, size_t seq) noexcept;
private:
/** Write resize_buf to resize_log. /** Write resize_buf to resize_log.
@param length the used length of resize_buf */ @param length the used length of resize_buf */
ATTRIBUTE_COLD void resize_write_buf(size_t length) noexcept; ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
void resize_write_buf(size_t length) noexcept;
public:
/** Rename a log file after resizing. /** Rename a log file after resizing.
@return whether an error occurred */ @return whether an error occurred */
@ -467,14 +470,12 @@ public:
void close(); void close();
#if defined __linux__ || defined _WIN32 #if defined __linux__ || defined _WIN32
/** @return the physical block size of the storage */
size_t get_block_size() const noexcept
{ ut_ad(block_size); return block_size; }
/** Set the log block size for file I/O. */ /** Set the log block size for file I/O. */
void set_block_size(uint32_t size) noexcept { block_size= size; } void set_block_size(uint32 size) noexcept
#else {
/** @return the physical block size of the storage */ if (write_size < size)
static size_t get_block_size() { return 512; } write_size= size;
}
#endif #endif
private: private:

View file

@ -236,9 +236,6 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
mprotect(ptr, size_t(size), PROT_READ); mprotect(ptr, size_t(size), PROT_READ);
buf= static_cast<byte*>(ptr); buf= static_cast<byte*>(ptr);
max_buf_free= 1; max_buf_free= 1;
# if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
# endif
log_maybe_unbuffered= true; log_maybe_unbuffered= true;
log_buffered= false; log_buffered= false;
mtr_t::finisher_update(); mtr_t::finisher_update();
@ -273,13 +270,16 @@ void log_t::attach_low(log_file_t file, os_offset_t size)
log_buffered log_buffered
? "Buffered log writes" ? "Buffered log writes"
: "File system buffers for log disabled", : "File system buffers for log disabled",
block_size); write_size);
#endif #endif
mtr_t::finisher_update(); mtr_t::finisher_update();
#ifdef HAVE_PMEM #ifdef HAVE_PMEM
checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size)); ut_ad(ut_is_2pow(write_size));
memset_aligned<64>(checkpoint_buf, 0, block_size); ut_ad(write_size >= 512);
ut_ad(write_size <= 4096);
checkpoint_buf= static_cast<byte*>(aligned_malloc(write_size, write_size));
memset_aligned<512>(checkpoint_buf, 0, write_size);
return true; return true;
#endif #endif
} }
@ -430,7 +430,7 @@ void log_t::set_buffered(bool buffered)
log_buffered log_buffered
? "Buffered log writes" ? "Buffered log writes"
: "File system buffers for log disabled", : "File system buffers for log disabled",
block_size); write_size);
} }
log_resize_release(); log_resize_release();
} }
@ -467,6 +467,8 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
OS_FILE_NORMAL, OS_LOG_FILE, false, &success); OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
if (success) if (success)
{ {
ut_ad(!(size_t(file_size) & (write_size - 1)));
ut_ad(!(size_t(size) & (write_size - 1)));
log_resize_release(); log_resize_release();
void *ptr= nullptr, *ptr2= nullptr; void *ptr= nullptr, *ptr2= nullptr;
@ -522,7 +524,7 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
{ {
memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15); memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15);
start_lsn= first_lsn + start_lsn= first_lsn +
(~lsn_t{get_block_size() - 1} & (write_lsn - first_lsn)); (~lsn_t{write_size - 1} & (write_lsn - first_lsn));
} }
} }
resize_lsn.store(start_lsn, std::memory_order_relaxed); resize_lsn.store(start_lsn, std::memory_order_relaxed);
@ -578,32 +580,30 @@ void log_t::resize_abort() noexcept
/** Write an aligned buffer to ib_logfile0. /** Write an aligned buffer to ib_logfile0.
@param buf buffer to be written @param buf buffer to be written
@param len length of data to be written @param length length of data to be written
@param offset log file offset */ @param offset log file offset */
static void log_write_buf(const byte *buf, size_t len, lsn_t offset) static void log_write_buf(const byte *buf, size_t length, lsn_t offset)
{ {
ut_ad(write_lock.is_owner()); ut_ad(write_lock.is_owner());
ut_ad(!recv_no_log_write); ut_ad(!recv_no_log_write);
ut_d(const size_t block_size_1= log_sys.get_block_size() - 1); ut_d(const size_t block_size_1= log_sys.write_size - 1);
ut_ad(!(offset & block_size_1)); ut_ad(!(offset & block_size_1));
ut_ad(!(len & block_size_1)); ut_ad(!(length & block_size_1));
ut_ad(!(size_t(buf) & block_size_1)); ut_ad(!(size_t(buf) & block_size_1));
ut_ad(len); ut_ad(length);
if (UNIV_LIKELY(offset + len <= log_sys.file_size)) const lsn_t maximum_write_length{log_sys.file_size - offset};
ut_ad(maximum_write_length <= log_sys.file_size - log_sys.START_OFFSET);
if (UNIV_UNLIKELY(length > maximum_write_length))
{ {
write: log_sys.log.write(offset, {buf, size_t(maximum_write_length)});
log_sys.log.write(offset, {buf, len}); length-= size_t(maximum_write_length);
return; buf+= size_t(maximum_write_length);
} ut_ad(log_sys.START_OFFSET + length < offset);
const size_t write_len= size_t(log_sys.file_size - offset);
log_sys.log.write(offset, {buf, write_len});
len-= write_len;
buf+= write_len;
ut_ad(log_sys.START_OFFSET + len < offset);
offset= log_sys.START_OFFSET; offset= log_sys.START_OFFSET;
goto write; }
log_sys.log.write(offset, {buf, length});
} }
/** Invoke commit_checkpoint_notify_ha() to notify that outstanding /** Invoke commit_checkpoint_notify_ha() to notify that outstanding
@ -778,11 +778,12 @@ inline void log_t::persist(lsn_t lsn) noexcept
} }
#endif #endif
ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
/** Write resize_buf to resize_log. /** Write resize_buf to resize_log.
@param length the used length of resize_buf */ @param length the used length of resize_buf */
ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept void log_t::resize_write_buf(size_t length) noexcept
{ {
const size_t block_size_1= get_block_size() - 1; const size_t block_size_1= write_size - 1;
ut_ad(!(resize_target & block_size_1)); ut_ad(!(resize_target & block_size_1));
ut_ad(!(length & block_size_1)); ut_ad(!(length & block_size_1));
ut_ad(length > block_size_1); ut_ad(length > block_size_1);
@ -802,7 +803,7 @@ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
} }
ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file, ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file,
resize_flush_buf, offset, length) == DB_SUCCESS); buf, offset, length) == DB_SUCCESS);
} }
/** Write buf to ib_logfile0. /** Write buf to ib_logfile0.
@ -824,64 +825,88 @@ template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
} }
else else
{ {
ut_ad(write_lock.is_owner());
ut_ad(!recv_no_log_write); ut_ad(!recv_no_log_write);
write_lock.set_pending(lsn); write_lock.set_pending(lsn);
ut_ad(write_lsn >= get_flushed_lsn()); ut_ad(write_lsn >= get_flushed_lsn());
const size_t block_size_1{get_block_size() - 1}; const size_t write_size_1{write_size - 1};
lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}}; ut_ad(ut_is_2pow(write_size));
size_t length{buf_free.load(std::memory_order_relaxed)};
lsn_t offset{calc_lsn_offset(write_lsn)};
ut_ad(length >= (offset & write_size_1));
ut_ad(write_size_1 >= 511);
DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, const byte *const write_buf{buf};
write_lsn, lsn, offset)); offset&= ~lsn_t{write_size_1};
const byte *write_buf{buf};
size_t length{buf_free}; if (length <= write_size_1)
ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1)); {
const size_t new_buf_free{length & block_size_1}; ut_ad(!((length ^ (size_t(lsn) - size_t(first_lsn))) & write_size_1));
buf_free= new_buf_free; /* Keep filling the same buffer until we have more than one block. */
ut_ad(new_buf_free == ((lsn - first_lsn) & block_size_1)); #if 0 /* TODO: Pad the last log block with dummy records. */
buf_free= log_pad(lsn, (write_size_1 + 1) - length,
buf + length, flush_buf);
... /* TODO: Update the LSN and adjust other code. */
#else
# ifdef HAVE_valgrind
MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - length);
if (UNIV_LIKELY_NULL(resize_buf))
MEM_MAKE_DEFINED(resize_buf + length, (write_size_1 + 1) - length);
# endif
buf[length]= 0; /* allow recovery to catch EOF faster */
#endif
length= write_size_1 + 1;
}
else
{
const size_t new_buf_free{length & write_size_1};
ut_ad(new_buf_free == ((lsn - first_lsn) & write_size_1));
buf_free.store(new_buf_free, std::memory_order_relaxed);
if (new_buf_free) if (new_buf_free)
{ {
#if 0 /* TODO: Pad the last log block with dummy records. */
buf_free= log_pad(lsn, get_block_size() - new_buf_free,
buf + new_buf_free, flush_buf);
... /* TODO: Update the LSN and adjust other code. */
#else
/* The rest of the block will be written as garbage. /* The rest of the block will be written as garbage.
(We want to avoid memset() while holding exclusive log_sys.latch) (We want to avoid memset() while holding exclusive log_sys.latch)
This block will be overwritten later, once records beyond This block will be overwritten later, once records beyond
the current LSN are generated. */ the current LSN are generated. */
# ifdef HAVE_valgrind #ifdef HAVE_valgrind
MEM_MAKE_DEFINED(buf + length, get_block_size() - new_buf_free); MEM_MAKE_DEFINED(buf + length, (write_size_1 + 1) - new_buf_free);
if (UNIV_LIKELY_NULL(resize_flush_buf)) if (UNIV_LIKELY_NULL(resize_buf))
MEM_MAKE_DEFINED(resize_buf + length, get_block_size() - new_buf_free); MEM_MAKE_DEFINED(resize_buf + length, (write_size_1 + 1) -
# endif new_buf_free);
#endif
buf[length]= 0; /* allow recovery to catch EOF faster */ buf[length]= 0; /* allow recovery to catch EOF faster */
length&= ~block_size_1; length&= ~write_size_1;
memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15); memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
if (UNIV_LIKELY_NULL(resize_flush_buf)) if (UNIV_LIKELY_NULL(resize_buf))
memcpy_aligned<16>(resize_flush_buf, resize_buf + length, memcpy_aligned<16>(resize_flush_buf, resize_buf + length,
(new_buf_free + 15) & ~15); (new_buf_free + 15) & ~15);
length+= get_block_size(); length+= write_size_1 + 1;
#endif
} }
std::swap(buf, flush_buf); std::swap(buf, flush_buf);
std::swap(resize_buf, resize_flush_buf); std::swap(resize_buf, resize_flush_buf);
}
write_to_log++; write_to_log++;
if (release_latch) if (release_latch)
latch.wr_unlock(); latch.wr_unlock();
DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF,
write_lsn, lsn, offset));
/* Do the write to the log file */
log_write_buf(write_buf, length, offset);
if (UNIV_LIKELY_NULL(resize_buf))
resize_write_buf(length);
write_lsn= lsn;
if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
{ {
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"InnoDB log write: " LSN_PF, write_lsn); "InnoDB log write: " LSN_PF, write_lsn);
} }
/* Do the write to the log file */
log_write_buf(write_buf, length, offset);
if (UNIV_LIKELY_NULL(resize_buf))
resize_write_buf(length);
write_lsn= lsn;
} }
set_check_for_checkpoint(false); set_check_for_checkpoint(false);

View file

@ -1773,7 +1773,7 @@ dberr_t recv_sys_t::find_checkpoint()
lsn= 0; lsn= 0;
buf= my_assume_aligned<4096>(log_sys.buf); buf= my_assume_aligned<4096>(log_sys.buf);
if (!log_sys.is_pmem()) if (!log_sys.is_pmem())
if (dberr_t err= log_sys.log.read(0, {buf, 4096})) if (dberr_t err= log_sys.log.read(0, {buf, log_sys.START_OFFSET}))
return err; return err;
/* Check the header page checksum. There was no /* Check the header page checksum. There was no
checksum in the first redo log format (version 0). */ checksum in the first redo log format (version 0). */
@ -1842,12 +1842,7 @@ dberr_t recv_sys_t::find_checkpoint()
for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2; for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2;
field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1) field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1)
{ {
if (log_sys.is_pmem())
buf= log_sys.buf + field; buf= log_sys.buf + field;
else
if (dberr_t err= log_sys.log.read(field,
{buf, log_sys.get_block_size()}))
return err;
const lsn_t checkpoint_lsn{mach_read_from_8(buf)}; const lsn_t checkpoint_lsn{mach_read_from_8(buf)};
const lsn_t end_lsn{mach_read_from_8(buf + 8)}; const lsn_t end_lsn{mach_read_from_8(buf + 8)};
if (checkpoint_lsn < first_lsn || end_lsn < checkpoint_lsn || if (checkpoint_lsn < first_lsn || end_lsn < checkpoint_lsn ||
@ -4019,7 +4014,7 @@ static bool recv_scan_log(bool last_phase)
DBUG_ENTER("recv_scan_log"); DBUG_ENTER("recv_scan_log");
ut_ad(log_sys.is_latest()); ut_ad(log_sys.is_latest());
const size_t block_size_1{log_sys.get_block_size() - 1}; const size_t block_size_1{log_sys.write_size - 1};
mysql_mutex_lock(&recv_sys.mutex); mysql_mutex_lock(&recv_sys.mutex);
if (!last_phase) if (!last_phase)
@ -4201,7 +4196,7 @@ static bool recv_scan_log(bool last_phase)
if (recv_sys.is_corrupt_log()) if (recv_sys.is_corrupt_log())
break; break;
if (recv_sys.offset < log_sys.get_block_size() && if (recv_sys.offset < log_sys.write_size &&
recv_sys.lsn == recv_sys.scanned_lsn) recv_sys.lsn == recv_sys.scanned_lsn)
goto got_eof; goto got_eof;
@ -4537,6 +4532,24 @@ dberr_t recv_recovery_read_checkpoint()
return err; return err;
} }
inline void log_t::set_recovered() noexcept
{
ut_ad(get_flushed_lsn() == get_lsn());
ut_ad(recv_sys.lsn == get_lsn());
size_t offset{recv_sys.offset};
if (!is_pmem())
{
const size_t bs{log_sys.write_size}, bs_1{bs - 1};
memmove_aligned<512>(buf, buf + (offset & ~bs_1), bs);
offset&= bs_1;
}
#ifdef HAVE_PMEM
else
mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
#endif
set_buf_free(offset);
}
/** Start recovering from a redo log checkpoint. /** Start recovering from a redo log checkpoint.
of first system tablespace page of first system tablespace page
@return error code or DB_SUCCESS */ @return error code or DB_SUCCESS */
@ -4710,22 +4723,7 @@ err_exit:
} }
if (!srv_read_only_mode && log_sys.is_latest()) { if (!srv_read_only_mode && log_sys.is_latest()) {
ut_ad(log_sys.get_flushed_lsn() == log_sys.get_lsn()); log_sys.set_recovered();
ut_ad(recv_sys.lsn == log_sys.get_lsn());
if (!log_sys.is_pmem()) {
const size_t bs_1{log_sys.get_block_size() - 1};
const size_t ro{recv_sys.offset};
recv_sys.offset &= bs_1;
memmove_aligned<64>(log_sys.buf,
log_sys.buf + (ro & ~bs_1),
log_sys.get_block_size());
#ifdef HAVE_PMEM
} else {
mprotect(log_sys.buf, size_t(log_sys.file_size),
PROT_READ | PROT_WRITE);
#endif
}
log_sys.set_buf_free(recv_sys.offset);
if (recv_needed_recovery if (recv_needed_recovery
&& srv_operation <= SRV_OPERATION_EXPORT_RESTORED && srv_operation <= SRV_OPERATION_EXPORT_RESTORED
&& recv_sys.lsn - log_sys.next_checkpoint_lsn && recv_sys.lsn - log_sys.next_checkpoint_lsn

View file

@ -1094,7 +1094,6 @@ static ATTRIBUTE_COLD void os_file_log_buffered()
{ {
log_sys.log_maybe_unbuffered= false; log_sys.log_maybe_unbuffered= false;
log_sys.log_buffered= true; log_sys.log_buffered= true;
log_sys.set_block_size(512);
} }
# endif # endif
@ -1209,11 +1208,7 @@ os_file_create_func(
break; break;
} }
# ifdef __linux__ # ifdef __linux__
} else if (type != OS_LOG_FILE) { } else if (type == OS_LOG_FILE && create_mode != OS_FILE_CREATE
} else if (log_sys.log_buffered) {
skip_o_direct:
os_file_log_buffered();
} else if (create_mode != OS_FILE_CREATE
&& create_mode != OS_FILE_CREATE_SILENT && create_mode != OS_FILE_CREATE_SILENT
&& !log_sys.is_opened()) { && !log_sys.is_opened()) {
if (stat(name, &st)) { if (stat(name, &st)) {
@ -1225,15 +1220,16 @@ os_file_create_func(
"InnoDB: File %s was not found", name); "InnoDB: File %s was not found", name);
goto not_found; goto not_found;
} }
log_sys.set_block_size(512);
goto skip_o_direct; goto skip_o_direct;
} } else if (!os_file_log_maybe_unbuffered(st)
|| log_sys.log_buffered) {
if (!os_file_log_maybe_unbuffered(st)) { skip_o_direct:
goto skip_o_direct; os_file_log_buffered();
} } else {
direct_flag = O_DIRECT; direct_flag = O_DIRECT;
log_sys.log_maybe_unbuffered= true; log_sys.log_maybe_unbuffered = true;
}
# endif # endif
} }
#else #else

View file

@ -175,7 +175,7 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn)
/* We will retain ib_logfile0 until we have written a new logically /* We will retain ib_logfile0 until we have written a new logically
empty log as ib_logfile101 and atomically renamed it to empty log as ib_logfile101 and atomically renamed it to
ib_logfile0 in log_t::rename_resized(). */ ib_logfile0 in log_t::resize_rename(). */
delete_log_files(); delete_log_files();
ut_ad(!os_aio_pending_reads()); ut_ad(!os_aio_pending_reads());